diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 20f75b7d3240..eef6358ec587 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -45,7 +45,6 @@ definitions: - type: flags name: xdp-rx-metadata - render-max: true entries: - name: timestamp @@ -55,6 +54,18 @@ definitions: name: hash doc: Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash(). + - + type: flags + name: xsk-flags + entries: + - + name: tx-timestamp + doc: + HW timestamping egress packets is supported by the driver. + - + name: tx-checksum + doc: + L3 checksum HW offload is supported by the driver. attribute-sets: - @@ -86,6 +97,11 @@ attribute-sets: See Documentation/networking/xdp-rx-metadata.rst for more details. type: u64 enum: xdp-rx-metadata + - + name: xsk-features + doc: Bitmask of enabled AF_XDP features. + type: u64 + enum: xsk-flags - name: page-pool attributes: @@ -209,6 +225,7 @@ operations: - xdp-features - xdp-zc-max-segs - xdp-rx-metadata-features + - xsk-features dump: reply: *dev-all - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index cb435c141794..d9424dcb19bf 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -124,6 +124,7 @@ Contents: xfrm_sync xfrm_sysctl xdp-rx-metadata + xsk-tx-metadata .. only:: subproject and html diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst index 205696780b78..e3e9420fd817 100644 --- a/Documentation/networking/xdp-rx-metadata.rst +++ b/Documentation/networking/xdp-rx-metadata.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + =============== XDP RX Metadata =============== diff --git a/Documentation/networking/xsk-tx-metadata.rst b/Documentation/networking/xsk-tx-metadata.rst new file mode 100644 index 000000000000..97ecfa480d00 --- /dev/null +++ b/Documentation/networking/xsk-tx-metadata.rst @@ -0,0 +1,79 @@ +================== +AF_XDP TX Metadata +================== + +This document describes how to enable offloads when transmitting packets +via :doc:`af_xdp`. Refer to :doc:`xdp-rx-metadata` on how to access similar +metadata on the receive side. + +General Design +============== + +The headroom for the metadata is reserved via ``tx_metadata_len`` in +``struct xdp_umem_reg``. The metadata length is therefore the same for +every socket that shares the same umem. The metadata layout is a fixed UAPI, +refer to ``union xsk_tx_metadata`` in ``include/uapi/linux/if_xdp.h``. +Thus, generally, the ``tx_metadata_len`` field above should contain +``sizeof(union xsk_tx_metadata)``. + +The headroom and the metadata itself should be located right before +``xdp_desc->addr`` in the umem frame. Within a frame, the metadata +layout is as follows:: + + tx_metadata_len + / \ + +-----------------+---------+----------------------------+ + | xsk_tx_metadata | padding | payload | + +-----------------+---------+----------------------------+ + ^ + | + xdp_desc->addr + +An AF_XDP application can request headrooms larger than ``sizeof(struct +xsk_tx_metadata)``. The kernel will ignore the padding (and will still +use ``xdp_desc->addr - tx_metadata_len`` to locate +the ``xsk_tx_metadata``). For the frames that shouldn't carry +any metadata (i.e., the ones that don't have ``XDP_TX_METADATA`` option), +the metadata area is ignored by the kernel as well. 
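For illustration only (not part of this patch), below is a minimal userspace sketch of how an application might fill the metadata area for a descriptor, assuming the UMEM was registered with ``tx_metadata_len`` set to ``sizeof(struct xsk_tx_metadata)``; ``umem_area`` and ``desc`` are hypothetical names for the application's mapped UMEM and the TX descriptor being prepared (the individual flags are described below)::

    #include <string.h>
    #include <linux/if_xdp.h>

    /* Sketch: request a TX timestamp for one aligned-mode descriptor. */
    static void request_tx_timestamp(void *umem_area, struct xdp_desc *desc)
    {
            /* The metadata sits immediately before the packet data that
             * desc->addr points at, i.e. at desc->addr - tx_metadata_len.
             */
            struct xsk_tx_metadata *meta =
                    (struct xsk_tx_metadata *)((char *)umem_area + desc->addr) - 1;

            memset(meta, 0, sizeof(*meta));
            meta->flags = XDP_TXMD_FLAGS_TIMESTAMP;

            /* Tell the kernel that this descriptor carries valid metadata. */
            desc->options |= XDP_TX_METADATA;
    }

Once the corresponding completion has been consumed, the same metadata area holds the HW timestamp in ``meta->completion.tx_timestamp``.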
+ +The flags field enables the particular offload: + +- ``XDP_TXMD_FLAGS_TIMESTAMP``: requests the device to put the transmission + timestamp into the ``tx_timestamp`` field of ``union xsk_tx_metadata``. +- ``XDP_TXMD_FLAGS_CHECKSUM``: requests the device to calculate the L4 + checksum. ``csum_start`` specifies the byte offset where the checksumming + should start and ``csum_offset`` specifies the byte offset where the + device should store the computed checksum. + +Besides the flags above, in order to trigger the offloads, the first +packet's ``struct xdp_desc`` descriptor should set the ``XDP_TX_METADATA`` +bit in the ``options`` field. Also note that in a multi-buffer packet +only the first chunk should carry the metadata. + +Software TX Checksum +==================== + +For development and testing purposes it's possible to pass the +``XDP_UMEM_TX_SW_CSUM`` flag to the ``XDP_UMEM_REG`` UMEM registration call. +In this case, when running in ``XDP_COPY`` mode, the TX checksum +is calculated on the CPU. Do not enable this option in production because +it will negatively affect performance. + +Querying Device Capabilities +============================ + +Every device exports its offload capabilities via the netlink netdev family. +Refer to the ``xsk-flags`` features bitmask in +``Documentation/netlink/specs/netdev.yaml``. + +- ``tx-timestamp``: device supports ``XDP_TXMD_FLAGS_TIMESTAMP`` +- ``tx-checksum``: device supports ``XDP_TXMD_FLAGS_CHECKSUM`` + +See ``tools/net/ynl/samples/netdev.c`` on how to query this information. + +Example +======= + +See ``tools/testing/selftests/bpf/xdp_hw_metadata.c`` for an example +program that handles TX metadata. Also see https://github.com/fomichev/xskgen +for a more bare-bones example. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index b2a5da9739d2..43f027bf2da3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -484,10 +484,12 @@ struct mlx5e_xdp_info_fifo { struct mlx5e_xdpsq; struct mlx5e_xmit_data; +struct xsk_tx_metadata; typedef int (*mlx5e_fp_xmit_xdp_frame_check)(struct mlx5e_xdpsq *); typedef bool (*mlx5e_fp_xmit_xdp_frame)(struct mlx5e_xdpsq *, struct mlx5e_xmit_data *, - int); + int, + struct xsk_tx_metadata *); struct mlx5e_xdpsq { /* data path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 7decc81ed33a..e2e7d82cfca4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -103,7 +103,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, xdptxd->dma_addr = dma_addr; if (unlikely(!INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe, - mlx5e_xmit_xdp_frame, sq, xdptxd, 0))) + mlx5e_xmit_xdp_frame, sq, xdptxd, 0, NULL))) return false; /* xmit_mode == MLX5E_XDP_XMIT_MODE_FRAME */ @@ -145,7 +145,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, xdptxd->dma_addr = dma_addr; if (unlikely(!INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe, - mlx5e_xmit_xdp_frame, sq, xdptxd, 0))) + mlx5e_xmit_xdp_frame, sq, xdptxd, 0, NULL))) return false; /* xmit_mode == MLX5E_XDP_XMIT_MODE_PAGE */ @@ -261,6 +261,37 @@ const struct xdp_metadata_ops mlx5e_xdp_metadata_ops = { .xmo_rx_hash = mlx5e_xdp_rx_hash, }; +struct mlx5e_xsk_tx_complete { + struct mlx5_cqe64 *cqe; + struct mlx5e_cq *cq; +}; + +static u64 mlx5e_xsk_fill_timestamp(void *_priv) +{ + struct mlx5e_xsk_tx_complete *priv
= _priv; + u64 ts; + + ts = get_cqe_ts(priv->cqe); + + if (mlx5_is_real_time_rq(priv->cq->mdev) || mlx5_is_real_time_sq(priv->cq->mdev)) + return mlx5_real_time_cyc2time(&priv->cq->mdev->clock, ts); + + return mlx5_timecounter_cyc2time(&priv->cq->mdev->clock, ts); +} + +static void mlx5e_xsk_request_checksum(u16 csum_start, u16 csum_offset, void *priv) +{ + struct mlx5_wqe_eth_seg *eseg = priv; + + /* HW/FW is doing parsing, so offsets are largely ignored. */ + eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; +} + +const struct xsk_tx_metadata_ops mlx5e_xsk_tx_metadata_ops = { + .tmo_fill_timestamp = mlx5e_xsk_fill_timestamp, + .tmo_request_checksum = mlx5e_xsk_request_checksum, +}; + /* returns true if packet was consumed by xdp */ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct bpf_prog *prog, struct mlx5e_xdp_buff *mxbuf) @@ -398,11 +429,11 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq INDIRECT_CALLABLE_SCOPE bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, - int check_result); + int check_result, struct xsk_tx_metadata *meta); INDIRECT_CALLABLE_SCOPE bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, - int check_result) + int check_result, struct xsk_tx_metadata *meta) { struct mlx5e_tx_mpwqe *session = &sq->mpwqe; struct mlx5e_xdpsq_stats *stats = sq->stats; @@ -420,7 +451,7 @@ mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptx */ if (unlikely(sq->mpwqe.wqe)) mlx5e_xdp_mpwqe_complete(sq); - return mlx5e_xmit_xdp_frame(sq, xdptxd, 0); + return mlx5e_xmit_xdp_frame(sq, xdptxd, 0, meta); } if (!xdptxd->len) { skb_frag_t *frag = &xdptxdf->sinfo->frags[0]; @@ -450,6 +481,7 @@ mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptx * and it's safe to complete it at any time. 
*/ mlx5e_xdp_mpwqe_session_start(sq); + xsk_tx_metadata_request(meta, &mlx5e_xsk_tx_metadata_ops, &session->wqe->eth); } mlx5e_xdp_mpwqe_add_dseg(sq, p, stats); @@ -480,7 +512,7 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq) INDIRECT_CALLABLE_SCOPE bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, - int check_result) + int check_result, struct xsk_tx_metadata *meta) { struct mlx5e_xmit_data_frags *xdptxdf = container_of(xdptxd, struct mlx5e_xmit_data_frags, xd); @@ -599,6 +631,8 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, sq->pc++; } + xsk_tx_metadata_request(meta, &mlx5e_xsk_tx_metadata_ops, eseg); + sq->doorbell_cseg = cseg; stats->xmit++; @@ -608,7 +642,9 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_wqe_info *wi, u32 *xsk_frames, - struct xdp_frame_bulk *bq) + struct xdp_frame_bulk *bq, + struct mlx5e_cq *cq, + struct mlx5_cqe64 *cqe) { struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo; u16 i; @@ -668,10 +704,24 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, break; } - case MLX5E_XDP_XMIT_MODE_XSK: + case MLX5E_XDP_XMIT_MODE_XSK: { /* AF_XDP send */ + struct xsk_tx_metadata_compl *compl = NULL; + struct mlx5e_xsk_tx_complete priv = { + .cqe = cqe, + .cq = cq, + }; + + if (xp_tx_metadata_enabled(sq->xsk_pool)) { + xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); + compl = &xdpi.xsk_meta; + + xsk_tx_metadata_complete(compl, &mlx5e_xsk_tx_metadata_ops, &priv); + } + (*xsk_frames)++; break; + } default: WARN_ON_ONCE(true); } @@ -720,7 +770,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) sqcc += wi->num_wqebbs; - mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq); + mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq, cq, cqe); } while (!last_wqe); if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { @@ -767,7 +817,7 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) sq->cc += wi->num_wqebbs; - mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq); + mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq, NULL, NULL); } xdp_flush_frame_bulk(&bq); @@ -840,7 +890,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, } ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe, - mlx5e_xmit_xdp_frame, sq, xdptxd, 0); + mlx5e_xmit_xdp_frame, sq, xdptxd, 0, NULL); if (unlikely(!ret)) { int j; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index ecfe93a479da..e054db1e10f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -33,6 +33,7 @@ #define __MLX5_EN_XDP_H__ #include +#include #include "en.h" #include "en/txrx.h" @@ -82,7 +83,7 @@ enum mlx5e_xdp_xmit_mode { * num, page_1, page_2, ... , page_num. * * MLX5E_XDP_XMIT_MODE_XSK: - * none. + * frame.xsk_meta. 
*/ #define MLX5E_XDP_FIFO_ENTRIES2DS_MAX_RATIO 4 @@ -97,6 +98,7 @@ union mlx5e_xdp_info { u8 num; struct page *page; } page; + struct xsk_tx_metadata_compl xsk_meta; }; struct mlx5e_xsk_param; @@ -112,13 +114,16 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); extern const struct xdp_metadata_ops mlx5e_xdp_metadata_ops; +extern const struct xsk_tx_metadata_ops mlx5e_xsk_tx_metadata_ops; INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, - int check_result)); + int check_result, + struct xsk_tx_metadata *meta)); INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, - int check_result)); + int check_result, + struct xsk_tx_metadata *meta)); INDIRECT_CALLABLE_DECLARE(int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq *sq)); INDIRECT_CALLABLE_DECLARE(int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c index 597f319d4770..a59199ed590d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -55,12 +55,16 @@ static void mlx5e_xsk_tx_post_err(struct mlx5e_xdpsq *sq, nopwqe = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc); mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, *xdpi); + if (xp_tx_metadata_enabled(sq->xsk_pool)) + mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, + (union mlx5e_xdp_info) { .xsk_meta = {} }); sq->doorbell_cseg = &nopwqe->ctrl; } bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget) { struct xsk_buff_pool *pool = sq->xsk_pool; + struct xsk_tx_metadata *meta = NULL; union mlx5e_xdp_info xdpi; bool work_done = true; bool flush = false; @@ -93,12 +97,13 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget) xdptxd.dma_addr = xsk_buff_raw_get_dma(pool, desc.addr); xdptxd.data = xsk_buff_raw_get_data(pool, desc.addr); xdptxd.len = desc.len; + meta = xsk_buff_get_metadata(pool, desc.addr); xsk_buff_raw_dma_sync_for_device(pool, xdptxd.dma_addr, xdptxd.len); ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe, mlx5e_xmit_xdp_frame, sq, &xdptxd, - check_result); + check_result, meta); if (unlikely(!ret)) { if (sq->mpwqe.wqe) mlx5e_xdp_mpwqe_complete(sq); @@ -106,6 +111,16 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget) mlx5e_xsk_tx_post_err(sq, &xdpi); } else { mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi); + if (xp_tx_metadata_enabled(sq->xsk_pool)) { + struct xsk_tx_metadata_compl compl; + + xsk_tx_metadata_to_compl(meta, &compl); + XSK_TX_COMPL_FITS(void *); + + mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, + (union mlx5e_xdp_info) + { .xsk_meta = compl }); + } } flush = true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index af5e0d3c8ee1..26a98cfb0a59 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5165,6 +5165,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->netdev_ops = &mlx5e_netdev_ops; netdev->xdp_metadata_ops = &mlx5e_xdp_metadata_ops; + netdev->xsk_tx_metadata_ops = &mlx5e_xsk_tx_metadata_ops; mlx5e_dcbnl_build_netdev(netdev); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index cd7a9768de5f..686c94c2e8a7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ 
b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -51,6 +51,7 @@ struct stmmac_tx_info { bool last_segment; bool is_jumbo; enum stmmac_txbuf_type buf_type; + struct xsk_tx_metadata_compl xsk_meta; }; #define STMMAC_TBS_AVAIL BIT(0) @@ -100,6 +101,17 @@ struct stmmac_xdp_buff { struct dma_desc *ndesc; }; +struct stmmac_metadata_request { + struct stmmac_priv *priv; + struct dma_desc *tx_desc; + bool *set_ic; +}; + +struct stmmac_xsk_tx_complete { + struct stmmac_priv *priv; + struct dma_desc *desc; +}; + struct stmmac_rx_queue { u32 rx_count_frames; u32 queue_index; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 8964fc8a955f..c2ac88aaffed 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2430,6 +2430,46 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) } } +static void stmmac_xsk_request_timestamp(void *_priv) +{ + struct stmmac_metadata_request *meta_req = _priv; + + stmmac_enable_tx_timestamp(meta_req->priv, meta_req->tx_desc); + *meta_req->set_ic = true; +} + +static u64 stmmac_xsk_fill_timestamp(void *_priv) +{ + struct stmmac_xsk_tx_complete *tx_compl = _priv; + struct stmmac_priv *priv = tx_compl->priv; + struct dma_desc *desc = tx_compl->desc; + bool found = false; + u64 ns = 0; + + if (!priv->hwts_tx_en) + return 0; + + /* check tx tstamp status */ + if (stmmac_get_tx_timestamp_status(priv, desc)) { + stmmac_get_timestamp(priv, desc, priv->adv_ts, &ns); + found = true; + } else if (!stmmac_get_mac_tx_timestamp(priv, priv->hw, &ns)) { + found = true; + } + + if (found) { + ns -= priv->plat->cdc_error_adj; + return ns_to_ktime(ns); + } + + return 0; +} + +static const struct xsk_tx_metadata_ops stmmac_xsk_tx_metadata_ops = { + .tmo_request_timestamp = stmmac_xsk_request_timestamp, + .tmo_fill_timestamp = stmmac_xsk_fill_timestamp, +}; + static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) { struct netdev_queue *nq = netdev_get_tx_queue(priv->dev, queue); @@ -2449,6 +2489,8 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) budget = min(budget, stmmac_tx_avail(priv, queue)); while (budget-- > 0) { + struct stmmac_metadata_request meta_req; + struct xsk_tx_metadata *meta = NULL; dma_addr_t dma_addr; bool set_ic; @@ -2472,6 +2514,7 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) tx_desc = tx_q->dma_tx + entry; dma_addr = xsk_buff_raw_get_dma(pool, xdp_desc.addr); + meta = xsk_buff_get_metadata(pool, xdp_desc.addr); xsk_buff_raw_dma_sync_for_device(pool, dma_addr, xdp_desc.len); tx_q->tx_skbuff_dma[entry].buf_type = STMMAC_TXBUF_T_XSK_TX; @@ -2499,6 +2542,11 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) else set_ic = false; + meta_req.priv = priv; + meta_req.tx_desc = tx_desc; + meta_req.set_ic = &set_ic; + xsk_tx_metadata_request(meta, &stmmac_xsk_tx_metadata_ops, + &meta_req); if (set_ic) { tx_q->tx_count_frames = 0; stmmac_set_tx_ic(priv, tx_desc); @@ -2511,6 +2559,9 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) stmmac_enable_dma_transmission(priv, priv->ioaddr); + xsk_tx_metadata_to_compl(meta, + &tx_q->tx_skbuff_dma[entry].xsk_meta); + tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, priv->dma_conf.dma_tx_size); entry = tx_q->cur_tx; } @@ -2620,8 +2671,18 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue, } else { tx_packets++; } - 
if (skb) + if (skb) { stmmac_get_tx_hwtstamp(priv, p, skb); + } else { + struct stmmac_xsk_tx_complete tx_compl = { + .priv = priv, + .desc = p, + }; + + xsk_tx_metadata_complete(&tx_q->tx_skbuff_dma[entry].xsk_meta, + &stmmac_xsk_tx_metadata_ops, + &tx_compl); + } } if (likely(tx_q->tx_skbuff_dma[entry].buf && @@ -7464,6 +7525,7 @@ int stmmac_dvr_probe(struct device *device, ndev->netdev_ops = &stmmac_netdev_ops; ndev->xdp_metadata_ops = &stmmac_xdp_metadata_ops; + ndev->xsk_tx_metadata_ops = &stmmac_xsk_tx_metadata_ops; ndev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 258ba232e302..eb447b0a9423 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1347,6 +1347,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) struct bpf_func_info_aux { u16 linkage; bool unreliable; + bool called : 1; + bool verified : 1; }; enum bpf_jit_poke_reason { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 998c7aaa98b8..c2d74bc112dd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1865,6 +1865,7 @@ enum netdev_stat_type { * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. + * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks. * @ethtool_ops: Management operations * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour @@ -2128,6 +2129,7 @@ struct net_device { unsigned long long priv_flags; const struct net_device_ops *netdev_ops; const struct xdp_metadata_ops *xdp_metadata_ops; + const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; int ifindex; unsigned short gflags; unsigned short hard_header_len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27998f73183e..b370eb8d70f7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -566,6 +566,15 @@ struct ubuf_info_msgzc { int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); +/* Preserve some data across TX submission and completion. + * + * Note, this state is stored in the driver. Extending the layout + * might need some special care. + */ +struct xsk_tx_metadata_compl { + __u64 *tx_timestamp; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -578,7 +587,10 @@ struct skb_shared_info { /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; struct sk_buff *frag_list; - struct skb_shared_hwtstamps hwtstamps; + union { + struct skb_shared_hwtstamps hwtstamps; + struct xsk_tx_metadata_compl xsk_meta; + }; unsigned int gso_type; u32 tskey; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index f83128007fb0..3cb4dc9bd70e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -30,6 +30,7 @@ struct xdp_umem { struct user_struct *user; refcount_t users; u8 flags; + u8 tx_metadata_len; bool zc; struct page **pgs; int id; @@ -92,12 +93,105 @@ struct xdp_sock { struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; +/* + * AF_XDP TX metadata hooks for network devices. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. 
+ * + * void (*tmo_request_timestamp)(void *priv) + * Called when an AF_XDP frame requests an egress timestamp. + * + * u64 (*tmo_fill_timestamp)(void *priv) + * Called when an AF_XDP frame that had requested an egress timestamp + * receives a completion. The hook needs to return the actual HW timestamp. + * + * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv) + * Called when an AF_XDP frame requests HW checksum offload. csum_start + * indicates the position where checksumming should start. + * csum_offset indicates the position where the checksum should be stored. + * + */ +struct xsk_tx_metadata_ops { + void (*tmo_request_timestamp)(void *priv); + u64 (*tmo_fill_timestamp)(void *priv); + void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); +}; + #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); +/** + * xsk_tx_metadata_to_compl - Save enough relevant metadata information + * to perform tx completion in the future. + * @meta: pointer to AF_XDP metadata area + * @compl: pointer to output struct xsk_tx_metadata_compl + * + * This function should be called by the networking device when + * it prepares an AF_XDP egress packet. The value of @compl should be stored + * and passed to xsk_tx_metadata_complete upon TX completion. + */ +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ + if (!meta) + return; + + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + compl->tx_timestamp = &meta->completion.tx_timestamp; + else + compl->tx_timestamp = NULL; +} + +/** + * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission + * and call appropriate xsk_tx_metadata_ops operation. + * @meta: pointer to AF_XDP metadata area + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private area + * + * This function should be called by the networking device when + * it prepares an AF_XDP egress packet. + */ +static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!meta) + return; + + if (ops->tmo_request_timestamp) + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + ops->tmo_request_timestamp(priv); + + if (ops->tmo_request_checksum) + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) + ops->tmo_request_checksum(meta->request.csum_start, + meta->request.csum_offset, priv); +} + +/** + * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion + * and call appropriate xsk_tx_metadata_ops operation. + * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private area + * + * This function should be called by the networking device upon + * AF_XDP egress completion.
+ */ +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!compl) + return; + + *compl->tx_timestamp = ops->tmo_fill_timestamp(priv); +} + #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -114,6 +208,23 @@ static inline void __xsk_map_flush(void) { } +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ +} + +static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 1f6fc8c7a84c..81e02de3f453 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -165,6 +165,30 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +#define XDP_TXMD_FLAGS_VALID ( \ + XDP_TXMD_FLAGS_TIMESTAMP | \ + XDP_TXMD_FLAGS_CHECKSUM | \ + 0) + +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); +} + +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + struct xsk_tx_metadata *meta; + + if (!pool->tx_metadata_len) + return NULL; + + meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return NULL; /* no way to signal the error to the user */ + + return meta; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); @@ -324,6 +348,16 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return false; +} + +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return NULL; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index b0bdff26fc88..8d48d37ab7c0 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -33,6 +33,7 @@ struct xdp_buff_xsk { }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) +#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) struct xsk_dma_map { dma_addr_t *dma_pages; @@ -77,10 +78,12 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; bool dma_need_sync; bool unaligned; + bool tx_sw_csum; void *addrs; /* Mutual exclusion of the completion ring in the SKB mode. 
Two cases to protect: * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when @@ -233,4 +236,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } +static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) +{ + return pool->tx_metadata_len > 0; +} + #endif /* XSK_BUFF_POOL_H_ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7a5498242eaa..e88746ba7d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6562,6 +6562,16 @@ struct bpf_link_info { __u32 flags; __u64 missed; } kprobe_multi; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 path_size; /* in/out: real path size on success, including zero byte */ + __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */ + __u32 flags; + __u32 pid; + } uprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ __u32 :32; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 8d48863472b9..d31698410410 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -33,7 +33,13 @@ #define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ -#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + +/* Force checksum calculation in software. Can be used for testing or + * working around potential HW issues. This option causes performance + * degradation and only works in XDP_COPY mode. + */ +#define XDP_UMEM_TX_SW_CSUM (1 << 1) struct sockaddr_xdp { __u16 sxdp_family; @@ -76,6 +82,7 @@ struct xdp_umem_reg { __u32 chunk_size; __u32 headroom; __u32 flags; + __u32 tx_metadata_len; }; struct xdp_statistics { @@ -105,6 +112,41 @@ struct xdp_options { #define XSK_UNALIGNED_BUF_ADDR_MASK \ ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) +/* Request transmit timestamp. Upon completion, put it into tx_timestamp + * field of struct xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0) + +/* Request transmit checksum offload. Checksum start position and offset + * are communicated via csum_start and csum_offset fields of struct + * xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1) + +/* AF_XDP offloads request. 'request' union member is consumed by the driver + * when the packet is being transmitted. 'completion' union member is + * filled by the driver when the transmit completion arrives. + */ +struct xsk_tx_metadata { + __u64 flags; + + union { + struct { + /* XDP_TXMD_FLAGS_CHECKSUM */ + + /* Offset from desc->addr where checksumming should start. */ + __u16 csum_start; + /* Offset from csum_start where checksum should be stored. */ + __u16 csum_offset; + } request; + + struct { + /* XDP_TXMD_FLAGS_TIMESTAMP */ + __u64 tx_timestamp; + } completion; + }; +}; + /* Rx/Tx descriptor */ struct xdp_desc { __u64 addr; @@ -121,4 +163,7 @@ struct xdp_desc { */ #define XDP_PKT_CONTD (1 << 0) +/* TX packet carries valid metadata. 
*/ +#define XDP_TX_METADATA (1 << 1) + #endif /* _LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 2b37233e00c0..6244c0164976 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -48,9 +48,18 @@ enum netdev_xdp_act { enum netdev_xdp_rx_metadata { NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, NETDEV_XDP_RX_METADATA_HASH = 2, +}; - /* private: */ - NETDEV_XDP_RX_METADATA_MASK = 3, +/** + * enum netdev_xsk_flags + * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported + * by the driver. + * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the + * driver. + */ +enum netdev_xsk_flags { + NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1, + NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, }; enum { @@ -59,6 +68,7 @@ enum { NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + NETDEV_A_DEV_XSK_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 405da1f9e724..8e7b6072e3f4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -339,6 +339,11 @@ struct bpf_kfunc_call_arg_meta { struct btf *btf_vmlinux; +static const char *btf_type_name(const struct btf *btf, u32 id) +{ + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); +} + static DEFINE_MUTEX(bpf_verifier_lock); static DEFINE_MUTEX(bpf_percpu_ma_lock); @@ -418,6 +423,22 @@ static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL; } +static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) +{ + struct bpf_func_info *info; + + if (!env->prog->aux->func_info) + return ""; + + info = &env->prog->aux->func_info[subprog]; + return btf_type_name(env->prog->aux->btf, info->type_id); +} + +static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, int subprog) +{ + return &env->prog->aux->func_info_aux[subprog]; +} + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); @@ -587,11 +608,6 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); } -static const char *btf_type_name(const struct btf *btf, u32 id) -{ - return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); -} - static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) { switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -9269,13 +9285,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err == -EFAULT) return err; if (subprog_is_global(env, subprog)) { + const char *sub_name = subprog_name(env, subprog); + if (err) { - verbose(env, "Caller passes invalid args into func#%d\n", subprog); + verbose(env, "Caller passes invalid args into func#%d ('%s')\n", + subprog, sub_name); return err; } - if (env->log.level & BPF_LOG_LEVEL) - verbose(env, "Func#%d is global and valid. Skipping.\n", subprog); + verbose(env, "Func#%d ('%s') is global and assumed valid.\n", + subprog, sub_name); + /* mark global subprog for verifying after main prog */ + subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); /* All global functions return a 64-bit SCALAR_VALUE */ @@ -19859,8 +19880,11 @@ out: return ret; } -/* Verify all global functions in a BPF program one by one based on their BTF. 
- * All global functions must pass verification. Otherwise the whole program is rejected. +/* Lazily verify all global functions based on their BTF, if they are called + * from main BPF program or any of subprograms transitively. + * BPF global subprogs called from dead code are not validated. + * All callable global functions must pass verification. + * Otherwise the whole program is rejected. * Consider: * int bar(int); * int foo(int f) @@ -19879,25 +19903,50 @@ out: static int do_check_subprogs(struct bpf_verifier_env *env) { struct bpf_prog_aux *aux = env->prog->aux; - int i, ret; + struct bpf_func_info_aux *sub_aux; + int i, ret, new_cnt; if (!aux->func_info) return 0; + /* exception callback is presumed to be always called */ + if (env->exception_callback_subprog) + subprog_aux(env, env->exception_callback_subprog)->called = true; + +again: + new_cnt = 0; for (i = 1; i < env->subprog_cnt; i++) { - if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL) + if (!subprog_is_global(env, i)) continue; + + sub_aux = subprog_aux(env, i); + if (!sub_aux->called || sub_aux->verified) + continue; + env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); ret = do_check_common(env, i, env->exception_callback_subprog == i); if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { - verbose(env, - "Func#%d is safe for any args that match its prototype\n", - i); + verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n", + i, subprog_name(env, i)); } + + /* We verified new global subprog, it might have called some + * more global subprogs that we haven't verified yet, so we + * need to do another pass over subprogs to verify those. + */ + sub_aux->verified = true; + new_cnt++; } + + /* We can't loop forever as we verify at least one global subprog on + * each pass. 
+ */ + if (new_cnt) + goto again; + return 0; } @@ -20543,8 +20592,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = do_check_subprogs(env); - ret = ret ?: do_check_main(env); + ret = do_check_main(env); + ret = ret ?: do_check_subprogs(env); if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux)) ret = bpf_prog_offload_finalize(env); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f0b8b7c29126..c284a4ad0315 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3033,6 +3033,7 @@ struct bpf_uprobe_multi_link; struct bpf_uprobe { struct bpf_uprobe_multi_link *link; loff_t offset; + unsigned long ref_ctr_offset; u64 cookie; struct uprobe_consumer consumer; }; @@ -3041,6 +3042,7 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; + u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3082,9 +3084,79 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) kfree(umulti_link); } +static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets); + u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies); + u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets); + u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path); + u32 upath_size = info->uprobe_multi.path_size; + struct bpf_uprobe_multi_link *umulti_link; + u32 ucount = info->uprobe_multi.count; + int err = 0, i; + long left; + + if (!upath ^ !upath_size) + return -EINVAL; + + if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount) + return -EINVAL; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + info->uprobe_multi.count = umulti_link->cnt; + info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.pid = umulti_link->task ? 
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + + if (upath) { + char *p, *buf; + + upath_size = min_t(u32, upath_size, PATH_MAX); + + buf = kmalloc(upath_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = d_path(&umulti_link->path, buf, upath_size); + if (IS_ERR(p)) { + kfree(buf); + return PTR_ERR(p); + } + upath_size = buf + upath_size - p; + left = copy_to_user(upath, p, upath_size); + kfree(buf); + if (left) + return -EFAULT; + info->uprobe_multi.path_size = upath_size; + } + + if (!uoffsets && !ucookies && !uref_ctr_offsets) + return 0; + + if (ucount < umulti_link->cnt) + err = -ENOSPC; + else + ucount = umulti_link->cnt; + + for (i = 0; i < ucount; i++) { + if (uoffsets && + put_user(umulti_link->uprobes[i].offset, uoffsets + i)) + return -EFAULT; + if (uref_ctr_offsets && + put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) + return -EFAULT; + if (ucookies && + put_user(umulti_link->uprobes[i].cookie, ucookies + i)) + return -EFAULT; + } + + return err; +} + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc = bpf_uprobe_multi_link_dealloc, + .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, @@ -3172,7 +3244,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr { struct bpf_uprobe_multi_link *link = NULL; unsigned long __user *uref_ctr_offsets; - unsigned long *ref_ctr_offsets = NULL; struct bpf_link_primer link_primer; struct bpf_uprobe *uprobes = NULL; struct task_struct *task = NULL; @@ -3245,18 +3316,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!uprobes || !link) goto error_free; - if (uref_ctr_offsets) { - ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL); - if (!ref_ctr_offsets) - goto error_free; - } - for (i = 0; i < cnt; i++) { if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { err = -EFAULT; goto error_free; } - if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) { + if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) { err = -EFAULT; goto error_free; } @@ -3280,6 +3345,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; + link->flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog); @@ -3287,7 +3353,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr for (i = 0; i < cnt; i++) { err = uprobe_register_refctr(d_real_inode(link->path.dentry), uprobes[i].offset, - ref_ctr_offsets ? 
ref_ctr_offsets[i] : 0, + uprobes[i].ref_ctr_offset, &uprobes[i].consumer); if (err) { bpf_uprobe_unregister(&path, uprobes, i); @@ -3299,11 +3365,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (err) goto error_free; - kvfree(ref_ctr_offsets); return bpf_link_settle(&link_primer); error_free: - kvfree(ref_ctr_offsets); kvfree(uprobes); kfree(link); if (task) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index c148f8d1e564..e380fdf756db 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -12199,7 +12199,6 @@ static struct bpf_test tests[] = { BPF_JMP32_IMM_ZEXT(JLE), BPF_JMP32_IMM_ZEXT(JSGT), BPF_JMP32_IMM_ZEXT(JSGE), - BPF_JMP32_IMM_ZEXT(JSGT), BPF_JMP32_IMM_ZEXT(JSLT), BPF_JMP32_IMM_ZEXT(JSLE), #undef BPF_JMP2_IMM_ZEXT @@ -12235,7 +12234,6 @@ static struct bpf_test tests[] = { BPF_JMP32_REG_ZEXT(JLE), BPF_JMP32_REG_ZEXT(JSGT), BPF_JMP32_REG_ZEXT(JSGE), - BPF_JMP32_REG_ZEXT(JSGT), BPF_JMP32_REG_ZEXT(JSLT), BPF_JMP32_REG_ZEXT(JSLE), #undef BPF_JMP2_REG_ZEXT diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c9fdcc5cdce1..711cf5d59816 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -542,7 +542,7 @@ struct bpf_fentry_test_t { int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) { - asm volatile (""); + asm volatile ("": "+r"(arg)); return (long)arg; } diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index fe61f85bcf33..10f2124e9e23 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "netdev-genl-gen.h" @@ -13,6 +14,7 @@ static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { + u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; @@ -26,11 +28,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC + if (netdev->xsk_tx_metadata_ops) { + if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) + xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; + if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) + xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + } + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, - xdp_rx_meta, NETDEV_A_DEV_PAD)) { + xdp_rx_meta, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, + xsk_features, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 06cead2b8e34..caa340134b0e 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -148,6 +148,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) return 0; } +#define XDP_UMEM_FLAGS_VALID ( \ + XDP_UMEM_UNALIGNED_CHUNK_FLAG | \ + XDP_UMEM_TX_SW_CSUM | \ + 0) + static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; @@ -167,7 +172,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG) + if (mr->flags & ~XDP_UMEM_FLAGS_VALID) return -EINVAL; if (!unaligned_chunks && !is_power_of_2(chunk_size)) @@ -199,6 +204,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; + if (mr->tx_metadata_len >= 256 || mr->tx_metadata_len % 8) + return 
-EINVAL; + umem->size = size; umem->headroom = headroom; umem->chunk_size = chunk_size; @@ -207,6 +215,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; + umem->tx_metadata_len = mr->tx_metadata_len; INIT_LIST_HEAD(&umem->xsk_dma_list); refcount_set(&umem->users, 1); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ae9f8cb611f6..281d49b4fca4 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -571,6 +571,13 @@ static u32 xsk_get_num_desc(struct sk_buff *skb) static void xsk_destruct_skb(struct sk_buff *skb) { + struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; + + if (compl->tx_timestamp) { + /* sw completion timestamp, not a real one */ + *compl->tx_timestamp = ktime_get_tai_fast_ns(); + } + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); sock_wfree(skb); } @@ -655,8 +662,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { + struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; + bool first_frag = false; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { @@ -687,6 +696,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, kfree_skb(skb); goto free_err; } + + first_frag = true; } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct page *page; @@ -709,12 +720,45 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); } + + if (first_frag && desc->options & XDP_TX_METADATA) { + if (unlikely(xs->pool->tx_metadata_len == 0)) { + err = -EINVAL; + goto free_err; + } + + meta = buffer - xs->pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) { + err = -EINVAL; + goto free_err; + } + + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { + if (unlikely(meta->request.csum_start + + meta->request.csum_offset + + sizeof(__sum16) > len)) { + err = -EINVAL; + goto free_err; + } + + skb->csum_start = hr + meta->request.csum_start; + skb->csum_offset = meta->request.csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + + if (unlikely(xs->pool->tx_sw_csum)) { + err = skb_checksum_help(skb); + if (err) + goto free_err; + } + } + } } skb->dev = dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; + xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); xsk_set_destructor_arg(skb); return skb; @@ -1283,6 +1327,14 @@ struct xdp_umem_reg_v1 { __u32 headroom; }; +struct xdp_umem_reg_v2 { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; + __u32 flags; +}; + static int xsk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -1326,8 +1378,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(struct xdp_umem_reg_v1)) return -EINVAL; - else if (optlen < sizeof(mr)) + else if (optlen < sizeof(struct xdp_umem_reg_v2)) mr_size = sizeof(struct xdp_umem_reg_v1); + else if (optlen < sizeof(mr)) + mr_size = sizeof(struct xdp_umem_reg_v2); if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 49cb9f9a09be..4f6f538a5462 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -85,6 +85,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct 
xdp_sock *xs, XDP_PACKET_HEADROOM; pool->umem = umem; pool->addrs = umem->addrs; + pool->tx_metadata_len = umem->tx_metadata_len; + pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM; INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 13354a1e4280..6f2d1621c992 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -137,21 +137,23 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) static inline bool xp_unused_options_set(u32 options) { - return options & ~XDP_PKT_CONTD; + return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA); } static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 offset = desc->addr & (pool->chunk_size - 1); + u64 addr = desc->addr - pool->tx_metadata_len; + u64 len = desc->len + pool->tx_metadata_len; + u64 offset = addr & (pool->chunk_size - 1); if (!desc->len) return false; - if (offset + desc->len > pool->chunk_size) + if (offset + len > pool->chunk_size) return false; - if (desc->addr >= pool->addrs_cnt) + if (addr >= pool->addrs_cnt) return false; if (xp_unused_options_set(desc->options)) @@ -162,16 +164,17 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = xp_unaligned_add_offset_to_addr(desc->addr); + u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; + u64 len = desc->len + pool->tx_metadata_len; if (!desc->len) return false; - if (desc->len > pool->chunk_size) + if (len > pool->chunk_size) return false; - if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index a1528cde81ab..cb46667a6b2e 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -294,6 +294,37 @@ show_kprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) jsonw_end_array(json_wtr); } +static __u64 *u64_to_arr(__u64 val) +{ + return (__u64 *) u64_to_ptr(val); +} + +static void +show_uprobe_multi_json(struct bpf_link_info *info, json_writer_t *wtr) +{ + __u32 i; + + jsonw_bool_field(json_wtr, "retprobe", + info->uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN); + jsonw_string_field(json_wtr, "path", (char *) u64_to_ptr(info->uprobe_multi.path)); + jsonw_uint_field(json_wtr, "func_cnt", info->uprobe_multi.count); + jsonw_int_field(json_wtr, "pid", (int) info->uprobe_multi.pid); + jsonw_name(json_wtr, "funcs"); + jsonw_start_array(json_wtr); + + for (i = 0; i < info->uprobe_multi.count; i++) { + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "offset", + u64_to_arr(info->uprobe_multi.offsets)[i]); + jsonw_uint_field(json_wtr, "ref_ctr_offset", + u64_to_arr(info->uprobe_multi.ref_ctr_offsets)[i]); + jsonw_uint_field(json_wtr, "cookie", + u64_to_arr(info->uprobe_multi.cookies)[i]); + jsonw_end_object(json_wtr); + } + jsonw_end_array(json_wtr); +} + static void show_perf_event_kprobe_json(struct bpf_link_info *info, json_writer_t *wtr) { @@ -465,6 +496,9 @@ static int show_link_close_json(int fd, struct bpf_link_info *info) case BPF_LINK_TYPE_KPROBE_MULTI: show_kprobe_multi_json(info, json_wtr); break; + 
case BPF_LINK_TYPE_UPROBE_MULTI: + show_uprobe_multi_json(info, json_wtr); + break; case BPF_LINK_TYPE_PERF_EVENT: switch (info->perf_event.type) { case BPF_PERF_EVENT_EVENT: @@ -674,6 +708,33 @@ static void show_kprobe_multi_plain(struct bpf_link_info *info) } } +static void show_uprobe_multi_plain(struct bpf_link_info *info) +{ + __u32 i; + + if (!info->uprobe_multi.count) + return; + + if (info->uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN) + printf("\n\turetprobe.multi "); + else + printf("\n\tuprobe.multi "); + + printf("path %s ", (char *) u64_to_ptr(info->uprobe_multi.path)); + printf("func_cnt %u ", info->uprobe_multi.count); + + if (info->uprobe_multi.pid) + printf("pid %d ", info->uprobe_multi.pid); + + printf("\n\t%-16s %-16s %-16s", "offset", "ref_ctr_offset", "cookies"); + for (i = 0; i < info->uprobe_multi.count; i++) { + printf("\n\t0x%-16llx 0x%-16llx 0x%-16llx", + u64_to_arr(info->uprobe_multi.offsets)[i], + u64_to_arr(info->uprobe_multi.ref_ctr_offsets)[i], + u64_to_arr(info->uprobe_multi.cookies)[i]); + } +} + static void show_perf_event_kprobe_plain(struct bpf_link_info *info) { const char *buf; @@ -807,6 +868,9 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) case BPF_LINK_TYPE_KPROBE_MULTI: show_kprobe_multi_plain(info); break; + case BPF_LINK_TYPE_UPROBE_MULTI: + show_uprobe_multi_plain(info); + break; case BPF_LINK_TYPE_PERF_EVENT: switch (info->perf_event.type) { case BPF_PERF_EVENT_EVENT: @@ -846,8 +910,10 @@ static int show_link_close_plain(int fd, struct bpf_link_info *info) static int do_show_link(int fd) { + __u64 *ref_ctr_offsets = NULL, *offsets = NULL, *cookies = NULL; struct bpf_link_info info; __u32 len = sizeof(info); + char path_buf[PATH_MAX]; __u64 *addrs = NULL; char buf[PATH_MAX]; int count; @@ -889,6 +955,39 @@ again: goto again; } } + if (info.type == BPF_LINK_TYPE_UPROBE_MULTI && + !info.uprobe_multi.offsets) { + count = info.uprobe_multi.count; + if (count) { + offsets = calloc(count, sizeof(__u64)); + if (!offsets) { + p_err("mem alloc failed"); + close(fd); + return -ENOMEM; + } + info.uprobe_multi.offsets = ptr_to_u64(offsets); + ref_ctr_offsets = calloc(count, sizeof(__u64)); + if (!ref_ctr_offsets) { + p_err("mem alloc failed"); + free(offsets); + close(fd); + return -ENOMEM; + } + info.uprobe_multi.ref_ctr_offsets = ptr_to_u64(ref_ctr_offsets); + cookies = calloc(count, sizeof(__u64)); + if (!cookies) { + p_err("mem alloc failed"); + free(cookies); + free(offsets); + close(fd); + return -ENOMEM; + } + info.uprobe_multi.cookies = ptr_to_u64(cookies); + info.uprobe_multi.path = ptr_to_u64(path_buf); + info.uprobe_multi.path_size = sizeof(path_buf); + goto again; + } + } if (info.type == BPF_LINK_TYPE_PERF_EVENT) { switch (info.perf_event.type) { case BPF_PERF_EVENT_TRACEPOINT: @@ -924,8 +1023,10 @@ again: else show_link_close_plain(fd, &info); - if (addrs) - free(addrs); + free(ref_ctr_offsets); + free(cookies); + free(offsets); + free(addrs); close(fd); return 0; } diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 7ec4f5671e7a..feb8e305804f 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -442,7 +442,7 @@ static void print_prog_header_json(struct bpf_prog_info *info, int fd) jsonw_uint_field(json_wtr, "recursion_misses", info->recursion_misses); } -static void print_prog_json(struct bpf_prog_info *info, int fd) +static void print_prog_json(struct bpf_prog_info *info, int fd, bool orphaned) { char *memlock; @@ -461,6 +461,7 @@ static void print_prog_json(struct bpf_prog_info 
*info, int fd) jsonw_uint_field(json_wtr, "uid", info->created_by_uid); } + jsonw_bool_field(json_wtr, "orphaned", orphaned); jsonw_uint_field(json_wtr, "bytes_xlated", info->xlated_prog_len); if (info->jited_prog_len) { @@ -527,7 +528,7 @@ static void print_prog_header_plain(struct bpf_prog_info *info, int fd) printf("\n"); } -static void print_prog_plain(struct bpf_prog_info *info, int fd) +static void print_prog_plain(struct bpf_prog_info *info, int fd, bool orphaned) { char *memlock; @@ -554,6 +555,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) printf(" memlock %sB", memlock); free(memlock); + if (orphaned) + printf(" orphaned"); + if (info->nr_map_ids) show_prog_maps(fd, info->nr_map_ids); @@ -581,15 +585,15 @@ static int show_prog(int fd) int err; err = bpf_prog_get_info_by_fd(fd, &info, &len); - if (err) { + if (err && err != -ENODEV) { p_err("can't get prog info: %s", strerror(errno)); return -1; } if (json_output) - print_prog_json(&info, fd); + print_prog_json(&info, fd, err == -ENODEV); else - print_prog_plain(&info, fd); + print_prog_plain(&info, fd, err == -ENODEV); return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7a5498242eaa..e88746ba7d21 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6562,6 +6562,16 @@ struct bpf_link_info { __u32 flags; __u64 missed; } kprobe_multi; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 path_size; /* in/out: real path size on success, including zero byte */ + __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */ + __u32 flags; + __u32 pid; + } uprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ __u32 :32; diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index 73a47da885dc..638c606dfa74 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -26,14 +26,20 @@ */ #define XDP_USE_NEED_WAKEUP (1 << 3) /* By setting this option, userspace application indicates that it can - * handle multiple descriptors per packet thus enabling xsk core to split + * handle multiple descriptors per packet thus enabling AF_XDP to split * multi-buffer XDP frames into multiple Rx descriptors. Without this set - * such frames will be dropped by xsk. + * such frames will be dropped. */ -#define XDP_USE_SG (1 << 4) +#define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ -#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + +/* Force checksum calculation in software. Can be used for testing or + * working around potential HW issues. This option causes performance + * degradation and only works in XDP_COPY mode. + */ +#define XDP_UMEM_TX_SW_CSUM (1 << 1) struct sockaddr_xdp { __u16 sxdp_family; @@ -76,6 +82,7 @@ struct xdp_umem_reg { __u32 chunk_size; __u32 headroom; __u32 flags; + __u32 tx_metadata_len; }; struct xdp_statistics { @@ -105,6 +112,41 @@ struct xdp_options { #define XSK_UNALIGNED_BUF_ADDR_MASK \ ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) +/* Request transmit timestamp. Upon completion, put it into tx_timestamp + * field of union xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0) + +/* Request transmit checksum offload. Checksum start position and offset + * are communicated via csum_start and csum_offset fields of union + * xsk_tx_metadata. 
+ */ +#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1) + +/* AF_XDP offloads request. 'request' union member is consumed by the driver + * when the packet is being transmitted. 'completion' union member is + * filled by the driver when the transmit completion arrives. + */ +struct xsk_tx_metadata { + __u64 flags; + + union { + struct { + /* XDP_TXMD_FLAGS_CHECKSUM */ + + /* Offset from desc->addr where checksumming should start. */ + __u16 csum_start; + /* Offset from csum_start where checksum should be stored. */ + __u16 csum_offset; + } request; + + struct { + /* XDP_TXMD_FLAGS_TIMESTAMP */ + __u64 tx_timestamp; + } completion; + }; +}; + /* Rx/Tx descriptor */ struct xdp_desc { __u64 addr; @@ -112,9 +154,16 @@ struct xdp_desc { __u32 options; }; -/* Flag indicating packet constitutes of multiple buffers*/ -#define XDP_PKT_CONTD (1 << 0) - /* UMEM descriptor is __u64 */ +/* Flag indicating that the packet continues with the buffer pointed out by the + * next frame in the ring. The end of the packet is signalled by setting this + * bit to zero. For single buffer packets, every descriptor has 'options' set + * to 0 and this maintains backward compatibility. + */ +#define XDP_PKT_CONTD (1 << 0) + +/* TX packet carries valid metadata. */ +#define XDP_TX_METADATA (1 << 1) + #endif /* _LINUX_IF_XDP_H */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 2b37233e00c0..6244c0164976 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -48,9 +48,18 @@ enum netdev_xdp_act { enum netdev_xdp_rx_metadata { NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, NETDEV_XDP_RX_METADATA_HASH = 2, +}; - /* private: */ - NETDEV_XDP_RX_METADATA_MASK = 3, +/** + * enum netdev_xsk_flags + * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported + * by the driver. + * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the + * driver. + */ +enum netdev_xsk_flags { + NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1, + NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, }; enum { @@ -59,6 +68,7 @@ enum { NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + NETDEV_A_DEV_XSK_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c index 2a62bf411bb3..b02faec748a5 100644 --- a/tools/lib/bpf/elf.c +++ b/tools/lib/bpf/elf.c @@ -407,7 +407,8 @@ static int symbol_cmp(const void *a, const void *b) * size, that needs to be released by the caller. 
*/ int elf_resolve_syms_offsets(const char *binary_path, int cnt, - const char **syms, unsigned long **poffsets) + const char **syms, unsigned long **poffsets, + int st_type) { int sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; int err = 0, i, cnt_done = 0; @@ -438,7 +439,7 @@ int elf_resolve_syms_offsets(const char *binary_path, int cnt, struct elf_sym_iter iter; struct elf_sym *sym; - err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], STT_FUNC); + err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], st_type); if (err == -ENOENT) continue; if (err) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e067be95da3c..ea9b8158c20d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11447,7 +11447,7 @@ bpf_program__attach_uprobe_multi(const struct bpf_program *prog, return libbpf_err_ptr(err); offsets = resolved_offsets; } else if (syms) { - err = elf_resolve_syms_offsets(path, cnt, syms, &resolved_offsets); + err = elf_resolve_syms_offsets(path, cnt, syms, &resolved_offsets, STT_FUNC); if (err < 0) return libbpf_err_ptr(err); offsets = resolved_offsets; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index b52dc28dc8af..91c5aef7dae7 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -409,3 +409,6 @@ LIBBPF_1.3.0 { ring__size; ring_buffer__ring; } LIBBPF_1.2.0; + +LIBBPF_1.4.0 { +} LIBBPF_1.3.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index f0f08635adb0..b5d334754e5d 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -594,7 +594,8 @@ int elf_open(const char *binary_path, struct elf_fd *elf_fd); void elf_close(struct elf_fd *elf_fd); int elf_resolve_syms_offsets(const char *binary_path, int cnt, - const char **syms, unsigned long **poffsets); + const char **syms, unsigned long **poffsets, + int st_type); int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern, unsigned long **poffsets, size_t *pcnt); diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index 290411ddb39e..e783a47da815 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 1 -#define LIBBPF_MINOR_VERSION 3 +#define LIBBPF_MINOR_VERSION 4 #endif /* __LIBBPF_VERSION_H */ diff --git a/tools/net/ynl/generated/netdev-user.c b/tools/net/ynl/generated/netdev-user.c index a7b7019d00f1..3b9dee94d4ce 100644 --- a/tools/net/ynl/generated/netdev-user.c +++ b/tools/net/ynl/generated/netdev-user.c @@ -63,6 +63,19 @@ const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value) return netdev_xdp_rx_metadata_strmap[value]; } +static const char * const netdev_xsk_flags_strmap[] = { + [0] = "tx-timestamp", + [1] = "tx-checksum", +}; + +const char *netdev_xsk_flags_str(enum netdev_xsk_flags value) +{ + value = ffs(value) - 1; + if (value < 0 || value >= (int)MNL_ARRAY_SIZE(netdev_xsk_flags_strmap)) + return NULL; + return netdev_xsk_flags_strmap[value]; +} + /* Policies */ struct ynl_policy_attr netdev_page_pool_info_policy[NETDEV_A_PAGE_POOL_MAX + 1] = { [NETDEV_A_PAGE_POOL_ID] = { .name = "id", .type = YNL_PT_UINT, }, @@ -80,6 +93,7 @@ struct ynl_policy_attr netdev_dev_policy[NETDEV_A_DEV_MAX + 1] = { [NETDEV_A_DEV_XDP_FEATURES] = { .name = "xdp-features", .type = YNL_PT_U64, }, [NETDEV_A_DEV_XDP_ZC_MAX_SEGS] = { .name = "xdp-zc-max-segs", .type = YNL_PT_U32, }, [NETDEV_A_DEV_XDP_RX_METADATA_FEATURES] = { .name = 
"xdp-rx-metadata-features", .type = YNL_PT_U64, }, + [NETDEV_A_DEV_XSK_FEATURES] = { .name = "xsk-features", .type = YNL_PT_U64, }, }; struct ynl_policy_nest netdev_dev_nest = { @@ -209,6 +223,11 @@ int netdev_dev_get_rsp_parse(const struct nlmsghdr *nlh, void *data) return MNL_CB_ERROR; dst->_present.xdp_rx_metadata_features = 1; dst->xdp_rx_metadata_features = mnl_attr_get_u64(attr); + } else if (type == NETDEV_A_DEV_XSK_FEATURES) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.xsk_features = 1; + dst->xsk_features = mnl_attr_get_u64(attr); } } diff --git a/tools/net/ynl/generated/netdev-user.h b/tools/net/ynl/generated/netdev-user.h index 4093602c9b6c..cc3d80d1cf8c 100644 --- a/tools/net/ynl/generated/netdev-user.h +++ b/tools/net/ynl/generated/netdev-user.h @@ -19,6 +19,7 @@ extern const struct ynl_family ynl_netdev_family; const char *netdev_op_str(int op); const char *netdev_xdp_act_str(enum netdev_xdp_act value); const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value); +const char *netdev_xsk_flags_str(enum netdev_xsk_flags value); /* Common nested types */ struct netdev_page_pool_info { @@ -60,12 +61,14 @@ struct netdev_dev_get_rsp { __u32 xdp_features:1; __u32 xdp_zc_max_segs:1; __u32 xdp_rx_metadata_features:1; + __u32 xsk_features:1; } _present; __u32 ifindex; __u64 xdp_features; __u32 xdp_zc_max_segs; __u64 xdp_rx_metadata_features; + __u64 xsk_features; }; void netdev_dev_get_rsp_free(struct netdev_dev_get_rsp *rsp); diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c index b828225daad0..591b90e21890 100644 --- a/tools/net/ynl/samples/netdev.c +++ b/tools/net/ynl/samples/netdev.c @@ -33,17 +33,23 @@ static void netdev_print_device(struct netdev_dev_get_rsp *d, unsigned int op) return; printf("xdp-features (%llx):", d->xdp_features); - for (int i = 0; d->xdp_features > 1U << i; i++) { + for (int i = 0; d->xdp_features >= 1U << i; i++) { if (d->xdp_features & (1U << i)) printf(" %s", netdev_xdp_act_str(1 << i)); } printf(" xdp-rx-metadata-features (%llx):", d->xdp_rx_metadata_features); - for (int i = 0; d->xdp_rx_metadata_features > 1U << i; i++) { + for (int i = 0; d->xdp_rx_metadata_features >= 1U << i; i++) { if (d->xdp_rx_metadata_features & (1U << i)) printf(" %s", netdev_xdp_rx_metadata_str(1 << i)); } + printf(" xsk-features (%llx):", d->xsk_features); + for (int i = 0; d->xsk_features >= 1U << i; i++) { + if (d->xsk_features & (1U << i)) + printf(" %s", netdev_xsk_flags_str(1 << i)); + } + printf(" xdp-zc-max-segs=%u", d->xdp_zc_max_segs); name = netdev_op_str(op); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9c27b67bc7b1..617ae55c3bb5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -18,7 +18,7 @@ else GENDIR := $(abspath ../../../../include/generated) endif GENHDR := $(GENDIR)/autoconf.h -HOSTPKG_CONFIG := pkg-config +PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config ifneq ($(wildcard $(GENHDR)),) GENFLAGS := -DHAVE_GENHDR @@ -29,13 +29,17 @@ SAN_CFLAGS ?= SAN_LDFLAGS ?= $(SAN_CFLAGS) RELEASE ?= OPT_FLAGS ?= $(if $(RELEASE),-O2,-O0) + +LIBELF_CFLAGS := $(shell $(PKG_CONFIG) libelf --cflags 2>/dev/null) +LIBELF_LIBS := $(shell $(PKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf) + CFLAGS += -g $(OPT_FLAGS) -rdynamic \ -Wall -Werror \ - $(GENFLAGS) $(SAN_CFLAGS) \ + $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT) 
LDFLAGS += $(SAN_LDFLAGS) -LDLIBS += -lelf -lz -lrt -lpthread +LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread ifneq ($(LLVM),) # Silence some warnings when compiled with clang @@ -219,9 +223,9 @@ $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_r $(OUTPUT)/sign-file: ../../../../scripts/sign-file.c $(call msg,SIGN-FILE,,$@) - $(Q)$(CC) $(shell $(HOSTPKG_CONFIG) --cflags libcrypto 2> /dev/null) \ + $(Q)$(CC) $(shell $(PKG_CONFIG) --cflags libcrypto 2> /dev/null) \ $< -o $@ \ - $(shell $(HOSTPKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto) + $(shell $(PKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto) $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) $(call msg,MOD,,$@) diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index cb9b95702ac6..9af79c7a9b58 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -77,7 +77,7 @@ In case of linker errors when running selftests, try using static linking: .. code-block:: console - $ LDLIBS=-static vmtest.sh + $ LDLIBS=-static PKG_CONFIG='pkg-config --static' vmtest.sh .. note:: Some distros may not support static linking. diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 34f1200a781b..94b9be24e39b 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -71,4 +71,47 @@ struct nstoken; */ struct nstoken *open_netns(const char *name); void close_netns(struct nstoken *token); + +static __u16 csum_fold(__u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + + return (__u16)~csum; +} + +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + __u32 len, __u8 proto, + __wsum csum) +{ + __u64 s = csum; + + s += (__u32)saddr; + s += (__u32)daddr; + s += htons(proto + len); + s = (s & 0xffffffff) + (s >> 32); + s = (s & 0xffffffff) + (s >> 32); + + return csum_fold((__u32)s); +} + +static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, + __wsum csum) +{ + __u64 s = csum; + int i; + + for (i = 0; i < 4; i++) + s += (__u32)saddr->s6_addr32[i]; + for (i = 0; i < 4; i++) + s += (__u32)daddr->s6_addr32[i]; + s += htons(proto + len); + s = (s & 0xffffffff) + (s >> 32); + s = (s & 0xffffffff) + (s >> 32); + + return csum_fold((__u32)s); +} + #endif diff --git a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c index 97142a4db374..d4b1901f7879 100644 --- a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c +++ b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c @@ -7,6 +7,7 @@ #include #include "trace_helpers.h" #include "test_fill_link_info.skel.h" +#include "bpf/libbpf_internal.h" #define TP_CAT "sched" #define TP_NAME "sched_switch" @@ -140,14 +141,14 @@ static void test_kprobe_fill_link_info(struct test_fill_link_info *skel, .retprobe = type == BPF_PERF_EVENT_KRETPROBE, ); ssize_t entry_offset = 0; + struct bpf_link *link; int link_fd, err; - skel->links.kprobe_run = bpf_program__attach_kprobe_opts(skel->progs.kprobe_run, - KPROBE_FUNC, &opts); - if (!ASSERT_OK_PTR(skel->links.kprobe_run, "attach_kprobe")) + link = bpf_program__attach_kprobe_opts(skel->progs.kprobe_run, KPROBE_FUNC, &opts); + if (!ASSERT_OK_PTR(link, "attach_kprobe")) return; - link_fd = 
bpf_link__fd(skel->links.kprobe_run); + link_fd = bpf_link__fd(link); if (!invalid) { /* See also arch_adjust_kprobe_addr(). */ if (skel->kconfig->CONFIG_X86_KERNEL_IBT) @@ -157,39 +158,41 @@ static void test_kprobe_fill_link_info(struct test_fill_link_info *skel, } else { kprobe_fill_invalid_user_buffer(link_fd); } - bpf_link__detach(skel->links.kprobe_run); + bpf_link__destroy(link); } static void test_tp_fill_link_info(struct test_fill_link_info *skel) { + struct bpf_link *link; int link_fd, err; - skel->links.tp_run = bpf_program__attach_tracepoint(skel->progs.tp_run, TP_CAT, TP_NAME); - if (!ASSERT_OK_PTR(skel->links.tp_run, "attach_tp")) + link = bpf_program__attach_tracepoint(skel->progs.tp_run, TP_CAT, TP_NAME); + if (!ASSERT_OK_PTR(link, "attach_tp")) return; - link_fd = bpf_link__fd(skel->links.tp_run); + link_fd = bpf_link__fd(link); err = verify_perf_link_info(link_fd, BPF_PERF_EVENT_TRACEPOINT, 0, 0, 0); ASSERT_OK(err, "verify_perf_link_info"); - bpf_link__detach(skel->links.tp_run); + bpf_link__destroy(link); } static void test_uprobe_fill_link_info(struct test_fill_link_info *skel, enum bpf_perf_event_type type) { + struct bpf_link *link; int link_fd, err; - skel->links.uprobe_run = bpf_program__attach_uprobe(skel->progs.uprobe_run, - type == BPF_PERF_EVENT_URETPROBE, - 0, /* self pid */ - UPROBE_FILE, uprobe_offset); - if (!ASSERT_OK_PTR(skel->links.uprobe_run, "attach_uprobe")) + link = bpf_program__attach_uprobe(skel->progs.uprobe_run, + type == BPF_PERF_EVENT_URETPROBE, + 0, /* self pid */ + UPROBE_FILE, uprobe_offset); + if (!ASSERT_OK_PTR(link, "attach_uprobe")) return; - link_fd = bpf_link__fd(skel->links.uprobe_run); + link_fd = bpf_link__fd(link); err = verify_perf_link_info(link_fd, type, 0, uprobe_offset, 0); ASSERT_OK(err, "verify_perf_link_info"); - bpf_link__detach(skel->links.uprobe_run); + bpf_link__destroy(link); } static int verify_kmulti_link_info(int fd, bool retprobe) @@ -278,24 +281,214 @@ static void test_kprobe_multi_fill_link_info(struct test_fill_link_info *skel, bool retprobe, bool invalid) { LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + struct bpf_link *link; int link_fd, err; opts.syms = kmulti_syms; opts.cnt = KMULTI_CNT; opts.retprobe = retprobe; - skel->links.kmulti_run = bpf_program__attach_kprobe_multi_opts(skel->progs.kmulti_run, - NULL, &opts); - if (!ASSERT_OK_PTR(skel->links.kmulti_run, "attach_kprobe_multi")) + link = bpf_program__attach_kprobe_multi_opts(skel->progs.kmulti_run, NULL, &opts); + if (!ASSERT_OK_PTR(link, "attach_kprobe_multi")) return; - link_fd = bpf_link__fd(skel->links.kmulti_run); + link_fd = bpf_link__fd(link); if (!invalid) { err = verify_kmulti_link_info(link_fd, retprobe); ASSERT_OK(err, "verify_kmulti_link_info"); } else { verify_kmulti_invalid_user_buffer(link_fd); } - bpf_link__detach(skel->links.kmulti_run); + bpf_link__destroy(link); +} + +#define SEC(name) __attribute__((section(name), used)) + +static short uprobe_link_info_sema_1 SEC(".probes"); +static short uprobe_link_info_sema_2 SEC(".probes"); +static short uprobe_link_info_sema_3 SEC(".probes"); + +noinline void uprobe_link_info_func_1(void) +{ + asm volatile (""); + uprobe_link_info_sema_1++; +} + +noinline void uprobe_link_info_func_2(void) +{ + asm volatile (""); + uprobe_link_info_sema_2++; +} + +noinline void uprobe_link_info_func_3(void) +{ + asm volatile (""); + uprobe_link_info_sema_3++; +} + +static int +verify_umulti_link_info(int fd, bool retprobe, __u64 *offsets, + __u64 *cookies, __u64 *ref_ctr_offsets) +{ + char path[PATH_MAX], 
path_buf[PATH_MAX]; + struct bpf_link_info info; + __u32 len = sizeof(info); + __u64 ref_ctr_offsets_buf[3]; + __u64 offsets_buf[3]; + __u64 cookies_buf[3]; + int i, err, bit; + __u32 count = 0; + + memset(path, 0, sizeof(path)); + err = readlink("/proc/self/exe", path, sizeof(path)); + if (!ASSERT_NEQ(err, -1, "readlink")) + return -1; + + for (bit = 0; bit < 8; bit++) { + memset(&info, 0, sizeof(info)); + info.uprobe_multi.path = ptr_to_u64(path_buf); + info.uprobe_multi.path_size = sizeof(path_buf); + info.uprobe_multi.count = count; + + if (bit & 0x1) + info.uprobe_multi.offsets = ptr_to_u64(offsets_buf); + if (bit & 0x2) + info.uprobe_multi.cookies = ptr_to_u64(cookies_buf); + if (bit & 0x4) + info.uprobe_multi.ref_ctr_offsets = ptr_to_u64(ref_ctr_offsets_buf); + + err = bpf_link_get_info_by_fd(fd, &info, &len); + if (!ASSERT_OK(err, "bpf_link_get_info_by_fd")) + return -1; + + if (!ASSERT_EQ(info.type, BPF_LINK_TYPE_UPROBE_MULTI, "info.type")) + return -1; + + ASSERT_EQ(info.uprobe_multi.pid, getpid(), "info.uprobe_multi.pid"); + ASSERT_EQ(info.uprobe_multi.count, 3, "info.uprobe_multi.count"); + ASSERT_EQ(info.uprobe_multi.flags & BPF_F_KPROBE_MULTI_RETURN, + retprobe, "info.uprobe_multi.flags.retprobe"); + ASSERT_EQ(info.uprobe_multi.path_size, strlen(path) + 1, "info.uprobe_multi.path_size"); + ASSERT_STREQ(path_buf, path, "info.uprobe_multi.path"); + + for (i = 0; i < info.uprobe_multi.count; i++) { + if (info.uprobe_multi.offsets) + ASSERT_EQ(offsets_buf[i], offsets[i], "info.uprobe_multi.offsets"); + if (info.uprobe_multi.cookies) + ASSERT_EQ(cookies_buf[i], cookies[i], "info.uprobe_multi.cookies"); + if (info.uprobe_multi.ref_ctr_offsets) { + ASSERT_EQ(ref_ctr_offsets_buf[i], ref_ctr_offsets[i], + "info.uprobe_multi.ref_ctr_offsets"); + } + } + count = count ?: info.uprobe_multi.count; + } + + return 0; +} + +static void verify_umulti_invalid_user_buffer(int fd) +{ + struct bpf_link_info info; + __u32 len = sizeof(info); + __u64 buf[3]; + int err; + + /* upath_size defined, not path */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.path_size = 3; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -EINVAL, "failed_upath_size"); + + /* path defined, but small */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.path = ptr_to_u64(buf); + info.uprobe_multi.path_size = 3; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_LT(err, 0, "failed_upath_small"); + + /* path has wrong pointer */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.path_size = PATH_MAX; + info.uprobe_multi.path = 123; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -EFAULT, "failed_bad_path_ptr"); + + /* count zero, with offsets */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.offsets = ptr_to_u64(buf); + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -EINVAL, "failed_count"); + + /* offsets not big enough */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.offsets = ptr_to_u64(buf); + info.uprobe_multi.count = 2; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -ENOSPC, "failed_small_count"); + + /* offsets has wrong pointer */ + memset(&info, 0, sizeof(info)); + info.uprobe_multi.offsets = 123; + info.uprobe_multi.count = 3; + err = bpf_link_get_info_by_fd(fd, &info, &len); + ASSERT_EQ(err, -EFAULT, "failed_wrong_offsets"); +} + +static void test_uprobe_multi_fill_link_info(struct test_fill_link_info *skel, + bool retprobe, bool invalid) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .retprobe = 
retprobe, + ); + const char *syms[3] = { + "uprobe_link_info_func_1", + "uprobe_link_info_func_2", + "uprobe_link_info_func_3", + }; + __u64 cookies[3] = { + 0xdead, + 0xbeef, + 0xcafe, + }; + const char *sema[3] = { + "uprobe_link_info_sema_1", + "uprobe_link_info_sema_2", + "uprobe_link_info_sema_3", + }; + __u64 *offsets = NULL, *ref_ctr_offsets; + struct bpf_link *link; + int link_fd, err; + + err = elf_resolve_syms_offsets("/proc/self/exe", 3, sema, + (unsigned long **) &ref_ctr_offsets, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_object")) + return; + + err = elf_resolve_syms_offsets("/proc/self/exe", 3, syms, + (unsigned long **) &offsets, STT_FUNC); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_func")) + goto out; + + opts.syms = syms; + opts.cookies = &cookies[0]; + opts.ref_ctr_offsets = (unsigned long *) &ref_ctr_offsets[0]; + opts.cnt = ARRAY_SIZE(syms); + + link = bpf_program__attach_uprobe_multi(skel->progs.umulti_run, 0, + "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto out; + + link_fd = bpf_link__fd(link); + if (invalid) + verify_umulti_invalid_user_buffer(link_fd); + else + verify_umulti_link_info(link_fd, retprobe, offsets, cookies, ref_ctr_offsets); + + bpf_link__destroy(link); +out: + free(ref_ctr_offsets); + free(offsets); } void test_fill_link_info(void) @@ -337,6 +530,13 @@ void test_fill_link_info(void) if (test__start_subtest("kprobe_multi_invalid_ubuff")) test_kprobe_multi_fill_link_info(skel, true, true); + if (test__start_subtest("uprobe_multi_link_info")) + test_uprobe_multi_fill_link_info(skel, false, false); + if (test__start_subtest("uretprobe_multi_link_info")) + test_uprobe_multi_fill_link_info(skel, true, false); + if (test__start_subtest("uprobe_multi_invalid")) + test_uprobe_multi_fill_link_info(skel, false, true); + cleanup: test_fill_link_info__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index cd051d3901a9..ece260cf2c0b 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -249,7 +249,7 @@ static void __test_link_api(struct child *child) int link_extra_fd = -1; int err; - err = elf_resolve_syms_offsets(path, 3, syms, (unsigned long **) &offsets); + err = elf_resolve_syms_offsets(path, 3, syms, (unsigned long **) &offsets, STT_FUNC); if (!ASSERT_OK(err, "elf_resolve_syms_offsets")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 5cfa7a6316b6..8d746642cbd7 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -25,6 +25,7 @@ #include "verifier_direct_stack_access_wraparound.skel.h" #include "verifier_div0.skel.h" #include "verifier_div_overflow.skel.h" +#include "verifier_global_subprogs.skel.h" #include "verifier_gotol.skel.h" #include "verifier_helper_access_var_len.skel.h" #include "verifier_helper_packet_access.skel.h" @@ -134,6 +135,7 @@ void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_acces void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); } void test_verifier_div0(void) { RUN(verifier_div0); } void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } +void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void 
test_verifier_gotol(void) { RUN(verifier_gotol); } void test_verifier_helper_access_var_len(void) { RUN(verifier_helper_access_var_len); } void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_access); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 4439ba9392f8..33cdf88efa6b 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -56,7 +56,8 @@ static int open_xsk(int ifindex, struct xsk *xsk) .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG, + .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG | XDP_UMEM_TX_SW_CSUM, + .tx_metadata_len = sizeof(struct xsk_tx_metadata), }; __u32 idx; u64 addr; @@ -138,6 +139,7 @@ static void ip_csum(struct iphdr *iph) static int generate_packet(struct xsk *xsk, __u16 dst_port) { + struct xsk_tx_metadata *meta; struct xdp_desc *tx_desc; struct udphdr *udph; struct ethhdr *eth; @@ -151,10 +153,14 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port) return -1; tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx); - tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE; + tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata); printf("%p: tx_desc[%u]->addr=%llx\n", xsk, idx, tx_desc->addr); data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr); + meta = data - sizeof(struct xsk_tx_metadata); + memset(meta, 0, sizeof(*meta)); + meta->flags = XDP_TXMD_FLAGS_TIMESTAMP; + eth = data; iph = (void *)(eth + 1); udph = (void *)(iph + 1); @@ -178,11 +184,17 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port) udph->source = htons(AF_XDP_SOURCE_PORT); udph->dest = htons(dst_port); udph->len = htons(sizeof(*udph) + UDP_PAYLOAD_BYTES); - udph->check = 0; + udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + ntohs(udph->len), IPPROTO_UDP, 0); memset(udph + 1, 0xAA, UDP_PAYLOAD_BYTES); + meta->flags |= XDP_TXMD_FLAGS_CHECKSUM; + meta->request.csum_start = sizeof(*eth) + sizeof(*iph); + meta->request.csum_offset = offsetof(struct udphdr, check); + tx_desc->len = sizeof(*eth) + sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES; + tx_desc->options |= XDP_TX_METADATA; xsk_ring_prod__submit(&xsk->tx, 1); ret = sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0); @@ -194,13 +206,21 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port) static void complete_tx(struct xsk *xsk) { - __u32 idx; + struct xsk_tx_metadata *meta; __u64 addr; + void *data; + __u32 idx; if (ASSERT_EQ(xsk_ring_cons__peek(&xsk->comp, 1, &idx), 1, "xsk_ring_cons__peek")) { addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx); printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr); + + data = xsk_umem__get_data(xsk->umem_area, addr); + meta = data - sizeof(struct xsk_tx_metadata); + + ASSERT_NEQ(meta->completion.tx_timestamp, 0, "tx_timestamp"); + xsk_ring_cons__release(&xsk->comp, 1); } } @@ -221,6 +241,7 @@ static int verify_xsk_metadata(struct xsk *xsk) const struct xdp_desc *rx_desc; struct pollfd fds = {}; struct xdp_meta *meta; + struct udphdr *udph; struct ethhdr *eth; struct iphdr *iph; __u64 comp_addr; @@ -257,6 +278,7 @@ static int verify_xsk_metadata(struct xsk *xsk) ASSERT_EQ(eth->h_proto, htons(ETH_P_IP), "eth->h_proto"); iph = (void *)(eth + 1); ASSERT_EQ((int)iph->version, 4, "iph->version"); + udph = (void *)(iph + 1); 
/* custom metadata */ @@ -270,6 +292,9 @@ static int verify_xsk_metadata(struct xsk *xsk) ASSERT_EQ(meta->rx_hash_type, 0, "rx_hash_type"); + /* checksum offload */ + ASSERT_EQ(udph->check, htons(0x721c), "csum"); + xsk_ring_cons__release(&xsk->rx, 1); refill_rx(xsk, comp_addr); diff --git a/tools/testing/selftests/bpf/progs/test_fill_link_info.c b/tools/testing/selftests/bpf/progs/test_fill_link_info.c index 564f402d56fe..69509f8bb680 100644 --- a/tools/testing/selftests/bpf/progs/test_fill_link_info.c +++ b/tools/testing/selftests/bpf/progs/test_fill_link_info.c @@ -39,4 +39,10 @@ int BPF_PROG(kmulti_run) return 0; } +SEC("uprobe.multi") +int BPF_PROG(umulti_run) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_global_func12.c b/tools/testing/selftests/bpf/progs/test_global_func12.c index 7f159d83c6f6..6e03d42519a6 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func12.c +++ b/tools/testing/selftests/bpf/progs/test_global_func12.c @@ -19,5 +19,7 @@ int global_func12(struct __sk_buff *skb) { const struct S s = {.x = skb->len }; - return foo(&s); + foo(&s); + + return 1; } diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c index a32e11c7d933..5de44b09e8ec 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func17.c +++ b/tools/testing/selftests/bpf/progs/test_global_func17.c @@ -5,6 +5,7 @@ __noinline int foo(int *p) { + barrier_var(p); return p ? (*p = 42) : 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c new file mode 100644 index 000000000000..a0a5efd1caa1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include +#include +#include +#include "bpf_misc.h" + +int arr[1]; +int unkn_idx; + +__noinline long global_bad(void) +{ + return arr[unkn_idx]; /* BOOM */ +} + +__noinline long global_good(void) +{ + return arr[0]; +} + +__noinline long global_calls_bad(void) +{ + return global_good() + global_bad() /* does BOOM indirectly */; +} + +__noinline long global_calls_good_only(void) +{ + return global_good(); +} + +SEC("?raw_tp") +__success __log_level(2) +/* main prog is validated completely first */ +__msg("('global_calls_good_only') is global and assumed valid.") +__msg("1: (95) exit") +/* eventually global_good() is transitively validated as well */ +__msg("Validating global_good() func") +__msg("('global_good') is safe for any args that match its prototype") +int chained_global_func_calls_success(void) +{ + return global_calls_good_only(); +} + +SEC("?raw_tp") +__failure __log_level(2) +/* main prog validated successfully first */ +__msg("1: (95) exit") +/* eventually we validate global_bad() and fail */ +__msg("Validating global_bad() func") +__msg("math between map_value pointer and register") /* BOOM */ +int chained_global_func_calls_bad(void) +{ + return global_calls_bad(); +} + +/* do out of bounds access forcing verifier to fail verification if this + * global func is called + */ +__noinline int global_unsupp(const int *mem) +{ + if (!mem) + return 0; + return mem[100]; /* BOOM */ +} + +const volatile bool skip_unsupp_global = true; + +SEC("?raw_tp") +__success +int guarded_unsupp_global_called(void) +{ + if (!skip_unsupp_global) + return global_unsupp(NULL); + return 0; +} + +SEC("?raw_tp") +__failure __log_level(2) +__msg("Func#1 ('global_unsupp') is global and assumed valid.") +__msg("Validating global_unsupp() func#1...") +__msg("value is outside of the allowed memory range") +int unguarded_unsupp_global_called(void) +{ + int x = 0; + + return global_unsupp(&x); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index f61d623b1ce8..b5efcaeaa1ae 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -370,12 +370,10 @@ __naked int parent_stack_slot_precise(void) SEC("?raw_tp") __success __log_level(2) __msg("9: (0f) r1 += r6") -__msg("mark_precise: frame0: last_idx 9 first_idx 6") +__msg("mark_precise: frame0: last_idx 9 first_idx 0") __msg("mark_precise: frame0: regs=r6 stack= before 8: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 7: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 6: (79) r6 = *(u64 *)(r10 -8)") -__msg("mark_precise: frame0: parent state regs= stack=-8:") -__msg("mark_precise: frame0: last_idx 5 first_idx 0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (85) call pc+6") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r1 = 0") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r6") diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 40cba8d368d9..6157f884d091 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -169,12 +169,14 @@ def bpftool(args, JSON=True, ns="", fail=True, include_stderr=False): return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns, fail=fail, include_stderr=include_stderr) -def 
bpftool_prog_list(expected=None, ns=""): +def bpftool_prog_list(expected=None, ns="", exclude_orphaned=True): _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True) # Remove the base progs for p in base_progs: if p in progs: progs.remove(p) + if exclude_orphaned: + progs = [ p for p in progs if not p['orphaned'] ] if expected is not None: if len(progs) != expected: fail(True, "%d BPF programs loaded, expected %d" % @@ -612,11 +614,9 @@ def pin_map(file_name, idx=0, expected=1): def check_dev_info_removed(prog_file=None, map_file=None): bpftool_prog_list(expected=0) + bpftool_prog_list(expected=1, exclude_orphaned=False) ret, err = bpftool("prog show pin %s" % (prog_file), fail=False) - fail(ret == 0, "Showing prog with removed device did not fail") - fail(err["error"].find("No such device") == -1, - "Showing prog with removed device expected ENODEV, error is %s" % - (err["error"])) + fail(ret != 0, "failed to show prog with removed device") bpftool_map_list(expected=0) ret, err = bpftool("map show pin %s" % (map_file), fail=False) @@ -1395,10 +1395,7 @@ try: start_test("Test multi-dev ASIC cross-dev destruction - orphaned...") ret, out = bpftool("prog show %s" % (progB), fail=False) - fail(ret == 0, "got information about orphaned program") - fail("error" not in out, "no error reported for get info on orphaned") - fail(out["error"] != "can't get prog info: No such device", - "wrong error for get info on orphaned") + fail(ret != 0, "couldn't get information about orphaned program") print("%s: OK" % (os.path.basename(__file__))) diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index c3ba40d0b9de..3291625ba4fb 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -10,7 +10,9 @@ * - rx_hash * * TX: - * - TBD + * - UDP 9091 packets trigger TX reply + * - TX HW timestamp is requested and reported back upon completion + * - TX checksum is requested */ #include @@ -24,15 +26,18 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include "xdp_metadata.h" -#define UMEM_NUM 16 +#define UMEM_NUM 256 #define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE #define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM) #define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE) @@ -48,11 +53,14 @@ struct xsk { }; struct xdp_hw_metadata *bpf_obj; -__u16 bind_flags = XDP_COPY; +__u16 bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY; struct xsk *rx_xsk; const char *ifname; int ifindex; int rxq; +bool skip_tx; +__u64 last_hw_rx_timestamp; +__u64 last_xdp_rx_timestamp; void test__fail(void) { /* for network_helpers.c */ } @@ -68,7 +76,8 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id) .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, - .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG, + .flags = XSK_UMEM__DEFAULT_FLAGS, + .tx_metadata_len = sizeof(struct xsk_tx_metadata), }; __u32 idx; u64 addr; @@ -110,7 +119,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id) for (i = 0; i < UMEM_NUM / 2; i++) { addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE; printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr); - *xsk_ring_prod__fill_addr(&xsk->fill, i) = addr; + *xsk_ring_prod__fill_addr(&xsk->fill, idx + i) = addr; } xsk_ring_prod__submit(&xsk->fill, ret); @@ -131,12 +140,22 @@ static void refill_rx(struct xsk *xsk, __u64 addr) __u32 idx; if 
(xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) { - printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr); + printf("%p: complete rx idx=%u addr=%llx\n", xsk, idx, addr); *xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr; xsk_ring_prod__submit(&xsk->fill, 1); } } +static int kick_tx(struct xsk *xsk) +{ + return sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0); +} + +static int kick_rx(struct xsk *xsk) +{ + return recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL); +} + #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ static __u64 gettime(clockid_t clock_id) { @@ -152,6 +171,17 @@ static __u64 gettime(clockid_t clock_id) return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; } +static void print_tstamp_delta(const char *name, const char *refname, + __u64 tstamp, __u64 reference) +{ + __s64 delta = (__s64)reference - (__s64)tstamp; + + printf("%s: %llu (sec:%0.4f) delta to %s sec:%0.4f (%0.3f usec)\n", + name, tstamp, (double)tstamp / NANOSEC_PER_SEC, refname, + (double)delta / NANOSEC_PER_SEC, + (double)delta / 1000); +} + static void verify_xdp_metadata(void *data, clockid_t clock_id) { struct xdp_meta *meta; @@ -164,25 +194,20 @@ static void verify_xdp_metadata(void *data, clockid_t clock_id) printf("rx_hash: 0x%X with RSS type:0x%X\n", meta->rx_hash, meta->rx_hash_type); - printf("rx_timestamp: %llu (sec:%0.4f)\n", meta->rx_timestamp, - (double)meta->rx_timestamp / NANOSEC_PER_SEC); if (meta->rx_timestamp) { - __u64 usr_clock = gettime(clock_id); - __u64 xdp_clock = meta->xdp_timestamp; - __s64 delta_X = xdp_clock - meta->rx_timestamp; - __s64 delta_X2U = usr_clock - xdp_clock; + __u64 ref_tstamp = gettime(clock_id); - printf("XDP RX-time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n", - xdp_clock, (double)xdp_clock / NANOSEC_PER_SEC, - (double)delta_X / NANOSEC_PER_SEC, - (double)delta_X / 1000); + /* store received timestamps to calculate a delta at tx */ + last_hw_rx_timestamp = meta->rx_timestamp; + last_xdp_rx_timestamp = meta->xdp_timestamp; - printf("AF_XDP time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n", - usr_clock, (double)usr_clock / NANOSEC_PER_SEC, - (double)delta_X2U / NANOSEC_PER_SEC, - (double)delta_X2U / 1000); + print_tstamp_delta("HW RX-time", "User RX-time", + meta->rx_timestamp, ref_tstamp); + print_tstamp_delta("XDP RX-time", "User RX-time", + meta->xdp_timestamp, ref_tstamp); + } else { + printf("No rx_timestamp\n"); } - } static void verify_skb_metadata(int fd) @@ -230,6 +255,129 @@ static void verify_skb_metadata(int fd) printf("skb hwtstamp is not found!\n"); } +static bool complete_tx(struct xsk *xsk, clockid_t clock_id) +{ + struct xsk_tx_metadata *meta; + __u64 addr; + void *data; + __u32 idx; + + if (!xsk_ring_cons__peek(&xsk->comp, 1, &idx)) + return false; + + addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx); + data = xsk_umem__get_data(xsk->umem_area, addr); + meta = data - sizeof(struct xsk_tx_metadata); + + printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr); + + if (meta->completion.tx_timestamp) { + __u64 ref_tstamp = gettime(clock_id); + + print_tstamp_delta("HW TX-complete-time", "User TX-complete-time", + meta->completion.tx_timestamp, ref_tstamp); + print_tstamp_delta("XDP RX-time", "User TX-complete-time", + last_xdp_rx_timestamp, ref_tstamp); + print_tstamp_delta("HW RX-time", "HW TX-complete-time", + last_hw_rx_timestamp, meta->completion.tx_timestamp); + } else { + printf("No tx_timestamp\n"); + } + + xsk_ring_cons__release(&xsk->comp, 1); + + return true; +} + +#define swap(a, 
b, len) do { \ + for (int i = 0; i < len; i++) { \ + __u8 tmp = ((__u8 *)a)[i]; \ + ((__u8 *)a)[i] = ((__u8 *)b)[i]; \ + ((__u8 *)b)[i] = tmp; \ + } \ +} while (0) + +static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id) +{ + struct xsk_tx_metadata *meta; + struct ipv6hdr *ip6h = NULL; + struct iphdr *iph = NULL; + struct xdp_desc *tx_desc; + struct udphdr *udph; + struct ethhdr *eth; + __sum16 want_csum; + void *data; + __u32 idx; + int ret; + int len; + + ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx); + if (ret != 1) { + printf("%p: failed to reserve tx slot\n", xsk); + return; + } + + tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx); + tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata); + data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr); + + meta = data - sizeof(struct xsk_tx_metadata); + memset(meta, 0, sizeof(*meta)); + meta->flags = XDP_TXMD_FLAGS_TIMESTAMP; + + eth = rx_packet; + + if (eth->h_proto == htons(ETH_P_IP)) { + iph = (void *)(eth + 1); + udph = (void *)(iph + 1); + } else if (eth->h_proto == htons(ETH_P_IPV6)) { + ip6h = (void *)(eth + 1); + udph = (void *)(ip6h + 1); + } else { + printf("%p: failed to detect IP version for ping pong %04x\n", xsk, eth->h_proto); + xsk_ring_prod__cancel(&xsk->tx, 1); + return; + } + + len = ETH_HLEN; + if (ip6h) + len += sizeof(*ip6h) + ntohs(ip6h->payload_len); + if (iph) + len += ntohs(iph->tot_len); + + swap(eth->h_dest, eth->h_source, ETH_ALEN); + if (iph) + swap(&iph->saddr, &iph->daddr, 4); + else + swap(&ip6h->saddr, &ip6h->daddr, 16); + swap(&udph->source, &udph->dest, 2); + + want_csum = udph->check; + if (ip6h) + udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ntohs(udph->len), IPPROTO_UDP, 0); + else + udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + ntohs(udph->len), IPPROTO_UDP, 0); + + meta->flags |= XDP_TXMD_FLAGS_CHECKSUM; + if (iph) + meta->request.csum_start = sizeof(*eth) + sizeof(*iph); + else + meta->request.csum_start = sizeof(*eth) + sizeof(*ip6h); + meta->request.csum_offset = offsetof(struct udphdr, check); + + printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d csum_offset=%d\n", + xsk, ntohs(udph->check), ntohs(want_csum), + meta->request.csum_start, meta->request.csum_offset); + + memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */ + tx_desc->options |= XDP_TX_METADATA; + tx_desc->len = len; + + xsk_ring_prod__submit(&xsk->tx, 1); +} + static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id) { const struct xdp_desc *rx_desc; @@ -252,6 +400,13 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t while (true) { errno = 0; + + for (i = 0; i < rxq; i++) { + ret = kick_rx(&rx_xsk[i]); + if (ret) + printf("kick_rx ret=%d\n", ret); + } + ret = poll(fds, rxq + 1, 1000); printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n", ret, errno, bpf_obj->bss->pkts_skip, @@ -288,6 +443,22 @@ peek: verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr), clock_id); first_seg = false; + + if (!skip_tx) { + /* mirror first chunk back */ + ping_pong(xsk, xsk_umem__get_data(xsk->umem_area, addr), + clock_id); + + ret = kick_tx(xsk); + if (ret) + printf("kick_tx ret=%d\n", ret); + + for (int j = 0; j < 500; j++) { + if (complete_tx(xsk, clock_id)) + break; + usleep(10*1000); + } + } } xsk_ring_cons__release(&xsk->rx, 1); @@ -420,8 +591,10 @@ static void print_usage(void) { const char *usage = "Usage: xdp_hw_metadata [OPTIONS] [IFNAME]\n" - " -m 
Enable multi-buffer XDP for larger MTU\n" + " -c Run in copy mode (zerocopy is default)\n" " -h Display this help and exit\n\n" + " -m Enable multi-buffer XDP for larger MTU\n" + " -r Don't generate AF_XDP reply (rx metadata only)\n" "Generate test packets on the other machine with:\n" " echo -n xdp | nc -u -q1 9091\n"; @@ -432,14 +605,22 @@ static void read_args(int argc, char *argv[]) { int opt; - while ((opt = getopt(argc, argv, "mh")) != -1) { + while ((opt = getopt(argc, argv, "chmr")) != -1) { switch (opt) { - case 'm': - bind_flags |= XDP_USE_SG; + case 'c': + bind_flags &= ~XDP_USE_NEED_WAKEUP; + bind_flags &= ~XDP_ZEROCOPY; + bind_flags |= XDP_COPY; break; case 'h': print_usage(); exit(0); + case 'm': + bind_flags |= XDP_USE_SG; + break; + case 'r': + skip_tx = true; + break; case '?': if (isprint(optopt)) fprintf(stderr, "Unknown option: -%c\n", optopt); diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c index e574711eeb84..25d568abf0f2 100644 --- a/tools/testing/selftests/bpf/xsk.c +++ b/tools/testing/selftests/bpf/xsk.c @@ -115,6 +115,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg, cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; cfg->flags = XSK_UMEM__DEFAULT_FLAGS; + cfg->tx_metadata_len = 0; return; } @@ -123,6 +124,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg, cfg->frame_size = usr_cfg->frame_size; cfg->frame_headroom = usr_cfg->frame_headroom; cfg->flags = usr_cfg->flags; + cfg->tx_metadata_len = usr_cfg->tx_metadata_len; } static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, @@ -252,6 +254,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, mr.chunk_size = umem->config.frame_size; mr.headroom = umem->config.frame_headroom; mr.flags = umem->config.flags; + mr.tx_metadata_len = umem->config.tx_metadata_len; err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); if (err) { diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h index 771570bc3731..93c2cc413cfc 100644 --- a/tools/testing/selftests/bpf/xsk.h +++ b/tools/testing/selftests/bpf/xsk.h @@ -200,6 +200,7 @@ struct xsk_umem_config { __u32 frame_size; __u32 frame_headroom; __u32 flags; + __u32 tx_metadata_len; }; int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags);
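
For reference, the new TX-metadata UAPI above is exercised end to end by the selftest changes in this series. Below is a minimal userspace sketch of the TX side, modelled on generate_packet() in xdp_metadata.c and on the selftest copy of the xsk helpers (tools/testing/selftests/bpf/xsk.h); the function name tx_with_offloads() and the frame_addr/pkt_len parameters are illustrative only, and the umem is assumed to have been registered with tx_metadata_len set to sizeof(struct xsk_tx_metadata):

	#include <stddef.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <linux/if_ether.h>
	#include <linux/ip.h>
	#include <linux/udp.h>
	#include <linux/if_xdp.h>
	#include "xsk.h"

	/* Request a TX timestamp and UDP checksum offload for one frame.
	 * 'xsk' is a selftest-style wrapper holding the tx ring, umem area
	 * and socket; 'frame_addr' is the start of a umem frame owned by
	 * the caller and 'pkt_len' the length of the packet built at 'data'.
	 */
	static void tx_with_offloads(struct xsk *xsk, __u64 frame_addr, __u32 pkt_len)
	{
		struct xsk_tx_metadata *meta;
		struct xdp_desc *tx_desc;
		void *data;
		__u32 idx;

		if (xsk_ring_prod__reserve(&xsk->tx, 1, &idx) != 1)
			return; /* TX ring full */

		/* Payload starts tx_metadata_len bytes into the frame... */
		tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
		tx_desc->addr = frame_addr + sizeof(struct xsk_tx_metadata);
		data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);

		/* ...and the metadata sits right before xdp_desc->addr. */
		meta = data - sizeof(struct xsk_tx_metadata);
		memset(meta, 0, sizeof(*meta));
		meta->flags = XDP_TXMD_FLAGS_TIMESTAMP | XDP_TXMD_FLAGS_CHECKSUM;
		meta->request.csum_start = sizeof(struct ethhdr) + sizeof(struct iphdr);
		meta->request.csum_offset = offsetof(struct udphdr, check);

		/* ... build the Ethernet/IP/UDP packet at 'data' (omitted) ... */

		tx_desc->len = pkt_len;
		tx_desc->options |= XDP_TX_METADATA;
		xsk_ring_prod__submit(&xsk->tx, 1);

		/* Kick the kernel to start transmission. */
		sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
	}

On completion, the driver fills meta->completion.tx_timestamp in the same metadata area, which the selftest's complete_tx() reads back after consuming the completion ring entry.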