Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-12-03

The main changes are:

1) Support BTF in kernel modules, from Andrii.

2) Introduce preferred busy-polling, from Björn.

3) bpf_ima_inode_hash() and bpf_bprm_opts_set() helpers, from KP Singh.

4) Memcg-based memory accounting for bpf objects, from Roman.

5) Allow bpf_{s,g}etsockopt from cgroup bind{4,6} hooks, from Stanislav.

* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (118 commits)
  selftests/bpf: Fix invalid use of strncat in test_sockmap
  libbpf: Use memcpy instead of strncpy to please GCC
  selftests/bpf: Add fentry/fexit/fmod_ret selftest for kernel module
  selftests/bpf: Add tp_btf CO-RE reloc test for modules
  libbpf: Support attachment of BPF tracing programs to kernel modules
  libbpf: Factor out low-level BPF program loading helper
  bpf: Allow to specify kernel module BTFs when attaching BPF programs
  bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier
  selftests/bpf: Add CO-RE relocs selftest relying on kernel module BTF
  selftests/bpf: Add support for marking sub-tests as skipped
  selftests/bpf: Add bpf_testmod kernel module for testing
  libbpf: Add kernel module BTF support for CO-RE relocations
  libbpf: Refactor CO-RE relocs to not assume a single BTF object
  libbpf: Add internal helper to load BTF data by FD
  bpf: Keep module's btf_data_size intact after load
  bpf: Fix bpf_put_raw_tracepoint()'s use of __module_address()
  selftests/bpf: Add Userspace tests for TCP_WINDOW_CLAMP
  bpf: Adds support for setting window clamp
  samples/bpf: Fix spelling mistake "recieving" -> "receiving"
  bpf: Fix cold build of test_progs-no_alu32
  ...
====================

Link: https://lore.kernel.org/r/20201204021936.85653-1-alexei.starovoitov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit a1dd1d8697
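For context, a minimal userspace sketch (not part of this commit) of how the preferred busy-polling knobs from item 2 could be enabled on a socket. SO_PREFER_BUSY_POLL (69) and SO_BUSY_POLL_BUDGET (70) match the generic uapi values added in the hunks below; the helper name, the fallback #defines, and the error handling are illustrative assumptions, and on a kernel without this series the setsockopt() calls simply fail.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

#ifndef SO_PREFER_BUSY_POLL
#define SO_PREFER_BUSY_POLL 69  /* generic uapi value added by this series */
#endif
#ifndef SO_BUSY_POLL_BUDGET
#define SO_BUSY_POLL_BUDGET 70  /* generic uapi value added by this series */
#endif

/* Ask the kernel to prefer busy polling over softirq processing for this
 * socket and set the napi processing budget used when busy polling (the
 * in-kernel default in this series is BUSY_POLL_BUDGET, i.e. 8).
 */
static int enable_prefer_busy_poll(int fd, int budget)
{
	int one = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &one, sizeof(one)) ||
	    setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &budget, sizeof(budget))) {
		fprintf(stderr, "busy-poll setup failed: %s\n", strerror(errno));
		return -1;
	}
	return 0;
}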
@@ -124,6 +124,9 @@
#define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64

@@ -135,6 +135,9 @@
#define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64

@@ -116,6 +116,9 @@
#define SO_DETACH_REUSEPORT_BPF 0x4042
#define SO_PREFER_BUSY_POLL 0x4043
#define SO_BUSY_POLL_BUDGET 0x4044
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64

@@ -117,6 +117,9 @@
#define SO_DETACH_REUSEPORT_BPF 0x0047
#define SO_PREFER_BUSY_POLL 0x0048
#define SO_BUSY_POLL_BUDGET 0x0049
#if !defined(__KERNEL__)
@@ -416,7 +416,7 @@ static int ena_xdp_register_rxq_info(struct ena_ring *rx_ring)
{
int rc;
rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid);
rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, 0);
if (rc) {
netif_err(rx_ring->adapter, ifup, rx_ring->netdev,

@@ -2884,7 +2884,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
if (rc)
return rc;
rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i);
rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0);
if (rc < 0)
return rc;

@@ -770,7 +770,7 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs,
rq->caching = 1;
/* Driver have no proper error path for failed XDP RX-queue info reg */
WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx) < 0);
WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx, 0) < 0);
/* Send a mailbox msg to PF to config RQ */
mbx.rq.msg = NIC_MBOX_MSG_RQ_CFG;

@@ -3334,7 +3334,7 @@ static int dpaa2_eth_setup_rx_flow(struct dpaa2_eth_priv *priv,
return 0;
err = xdp_rxq_info_reg(&fq->channel->xdp_rxq, priv->net_dev,
fq->flowid);
fq->flowid, 0);
if (err) {
dev_err(dev, "xdp_rxq_info_reg failed\n");
return err;
@@ -676,6 +676,8 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring)
i40e_clean_tx_ring(tx_ring);
kfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
kfree(tx_ring->xsk_descs);
tx_ring->xsk_descs = NULL;
if (tx_ring->desc) {
dma_free_coherent(tx_ring->dev, tx_ring->size,

@@ -1277,6 +1279,13 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
if (!tx_ring->tx_bi)
goto err;
if (ring_is_xdp(tx_ring)) {
tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs),
GFP_KERNEL);
if (!tx_ring->xsk_descs)
goto err;
}
u64_stats_init(&tx_ring->syncp);
/* round up to nearest 4K */

@@ -1300,6 +1309,8 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
return 0;
err:
kfree(tx_ring->xsk_descs);
tx_ring->xsk_descs = NULL;
kfree(tx_ring->tx_bi);
tx_ring->tx_bi = NULL;
return -ENOMEM;

@@ -1436,7 +1447,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
/* XDP RX-queue info only needed for RX rings exposed to XDP */
if (rx_ring->vsi->type == I40E_VSI_MAIN) {
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->queue_index);
rx_ring->queue_index, rx_ring->q_vector->napi.napi_id);
if (err < 0)
return err;
}

@@ -389,6 +389,7 @@ struct i40e_ring {
struct i40e_channel *ch;
struct xdp_rxq_info xdp_rxq;
struct xsk_buff_pool *xsk_pool;
struct xdp_desc *xsk_descs; /* For storing descriptors in the AF_XDP ZC path */
} ____cacheline_internodealigned_in_smp;
static inline bool ring_uses_build_skb(struct i40e_ring *ring)
@@ -2,6 +2,7 @@
/* Copyright(c) 2018 Intel Corporation. */
#include <linux/bpf_trace.h>
#include <linux/stringify.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>

@@ -380,6 +381,69 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
return failure ? budget : (int)total_rx_packets;
}
static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
unsigned int *total_bytes)
{
struct i40e_tx_desc *tx_desc;
dma_addr_t dma;
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len);
tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP,
0, desc->len, 0);
*total_bytes += desc->len;
}
static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
unsigned int *total_bytes)
{
u16 ntu = xdp_ring->next_to_use;
struct i40e_tx_desc *tx_desc;
dma_addr_t dma;
u32 i;
loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
I40E_TX_DESC_CMD_EOP,
0, desc[i].len, 0);
*total_bytes += desc[i].len;
}
xdp_ring->next_to_use = ntu;
}
static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts,
unsigned int *total_bytes)
{
u32 batched, leftover, i;
batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
leftover = nb_pkts & (PKTS_PER_BATCH - 1);
for (i = 0; i < batched; i += PKTS_PER_BATCH)
i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
for (i = batched; i < batched + leftover; i++)
i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes);
}
static void i40e_set_rs_bit(struct i40e_ring *xdp_ring)
{
u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1;
struct i40e_tx_desc *tx_desc;
tx_desc = I40E_TX_DESC(xdp_ring, ntu);
tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT);
}
/**
* i40e_xmit_zc - Performs zero-copy Tx AF_XDP
* @xdp_ring: XDP Tx ring
@@ -389,49 +453,30 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
**/
static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
{
unsigned int sent_frames = 0, total_bytes = 0;
struct i40e_tx_desc *tx_desc = NULL;
struct i40e_tx_buffer *tx_bi;
struct xdp_desc desc;
dma_addr_t dma;
struct xdp_desc *descs = xdp_ring->xsk_descs;
u32 nb_pkts, nb_processed = 0;
unsigned int total_bytes = 0;
while (budget-- > 0) {
if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
break;
nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget);
if (!nb_pkts)
return false;
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
desc.len);
tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
tx_bi->bytecount = desc.len;
tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz =
build_ctob(I40E_TX_DESC_CMD_ICRC
| I40E_TX_DESC_CMD_EOP,
0, desc.len, 0);
sent_frames++;
total_bytes += tx_bi->bytecount;
xdp_ring->next_to_use++;
if (xdp_ring->next_to_use == xdp_ring->count)
xdp_ring->next_to_use = 0;
if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) {
nb_processed = xdp_ring->count - xdp_ring->next_to_use;
i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes);
xdp_ring->next_to_use = 0;
}
if (tx_desc) {
/* Request an interrupt for the last frame and bump tail ptr. */
tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
I40E_TXD_QW1_CMD_SHIFT);
i40e_xdp_ring_update_tail(xdp_ring);
i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed,
&total_bytes);
xsk_tx_release(xdp_ring->xsk_pool);
i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes);
}
/* Request an interrupt for the last frame and bump tail ptr. */
i40e_set_rs_bit(xdp_ring);
i40e_xdp_ring_update_tail(xdp_ring);
return !!budget;
i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes);
return true;
}
/**
@@ -4,6 +4,22 @@
#ifndef _I40E_XSK_H_
#define _I40E_XSK_H_
/* This value should match the pragma in the loop_unrolled_for
 * macro. Why 4? It is strictly empirical. It seems to be a good
 * compromise between the advantage of having simultaneous outstanding
 * reads to the DMA array that can hide each others latency and the
 * disadvantage of having a larger code path.
 */
#define PKTS_PER_BATCH 4
#ifdef __clang__
#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for
#elif __GNUC__ >= 8
#define loop_unrolled_for _Pragma("GCC unroll 4") for
#else
#define loop_unrolled_for for
#endif
struct i40e_vsi;
struct xsk_buff_pool;
struct zero_copy_allocator;
@@ -306,7 +306,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
/* coverity[check_return] */
xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
ring->q_index);
ring->q_index, ring->q_vector->napi.napi_id);
ring->xsk_pool = ice_xsk_pool(ring);
if (ring->xsk_pool) {

@@ -333,7 +333,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
/* coverity[check_return] */
xdp_rxq_info_reg(&ring->xdp_rxq,
ring->netdev,
ring->q_index);
ring->q_index, ring->q_vector->napi.napi_id);
err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_PAGE_SHARED,

@@ -483,7 +483,7 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring)
if (rx_ring->vsi->type == ICE_VSI_PF &&
!xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->q_index))
rx_ring->q_index, rx_ring->q_vector->napi.napi_id))
goto err;
return 0;

@@ -4352,7 +4352,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
/* XDP RX-queue info */
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->queue_index) < 0)
rx_ring->queue_index, 0) < 0)
goto err;
return 0;

@@ -6577,7 +6577,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
/* XDP RX-queue info */
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
rx_ring->queue_index) < 0)
rx_ring->queue_index, rx_ring->q_vector->napi.napi_id) < 0)
goto err;
rx_ring->xdp_prog = adapter->xdp_prog;

@@ -3493,7 +3493,7 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter,
/* XDP RX-queue info */
if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
rx_ring->queue_index) < 0)
rx_ring->queue_index, 0) < 0)
goto err;
rx_ring->xdp_prog = adapter->xdp_prog;

@@ -3243,7 +3243,7 @@ static int mvneta_create_page_pool(struct mvneta_port *pp,
return err;
}
err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id);
err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id, 0);
if (err < 0)
goto err_free_pp;

@@ -2614,11 +2614,11 @@ static int mvpp2_rxq_init(struct mvpp2_port *port,
mvpp2_rxq_status_update(port, rxq->id, 0, rxq->size);
if (priv->percpu_pools) {
err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id);
err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id, 0);
if (err < 0)
goto err_free_dma;
err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id);
err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id, 0);
if (err < 0)
goto err_unregister_rxq_short;
@@ -283,7 +283,7 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
ring->log_stride = ffs(ring->stride) - 1;
ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index) < 0)
if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index, 0) < 0)
goto err_ring;
tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *

@@ -434,7 +434,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq_xdp_ix = rq->ix;
if (xsk)
rq_xdp_ix += params->num_channels * MLX5E_RQ_GROUP_XSK;
err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix);
err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0);
if (err < 0)
goto err_rq_xdp_prog;

@@ -2533,7 +2533,7 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
if (dp->netdev) {
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, dp->netdev,
rx_ring->idx);
rx_ring->idx, rx_ring->r_vec->napi.napi_id);
if (err < 0)
return err;
}

@@ -1762,7 +1762,7 @@ static void qede_init_fp(struct qede_dev *edev)
/* Driver have no error path from here */
WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev,
fp->rxq->rxq_id) < 0);
fp->rxq->rxq_id, 0) < 0);
if (xdp_rxq_info_reg_mem_model(&fp->rxq->xdp_rxq,
MEM_TYPE_PAGE_ORDER0,

@@ -262,7 +262,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
/* Initialise XDP queue information */
rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev,
rx_queue->core_index);
rx_queue->core_index, 0);
if (rc) {
netif_err(efx, rx_err, efx->net_dev,

@@ -1314,7 +1314,7 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv)
goto err_out;
}
err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0);
err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0, priv->napi.napi_id);
if (err)
goto err_out;
@@ -1186,7 +1186,7 @@ static int cpsw_ndev_create_xdp_rxq(struct cpsw_priv *priv, int ch)
pool = cpsw->page_pool[ch];
rxq = &priv->xdp_rxq[ch];
ret = xdp_rxq_info_reg(rxq, priv->ndev, ch);
ret = xdp_rxq_info_reg(rxq, priv->ndev, ch, 0);
if (ret)
return ret;

@@ -1499,7 +1499,7 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
u64_stats_init(&nvchan->tx_stats.syncp);
u64_stats_init(&nvchan->rx_stats.syncp);
ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i);
ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i, 0);
if (ret) {
netdev_err(ndev, "xdp_rxq_info_reg fail: %d\n", ret);

@@ -780,7 +780,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
} else {
/* Setup XDP RX-queue info, for new tfile getting attached */
err = xdp_rxq_info_reg(&tfile->xdp_rxq,
tun->dev, tfile->queue_index);
tun->dev, tfile->queue_index, 0);
if (err < 0)
goto out;
err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,

@@ -884,7 +884,6 @@ static int veth_napi_add(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
napi_enable(&rq->xdp_napi);
}

@@ -926,7 +925,8 @@ static int veth_enable_xdp(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
if (err < 0)
goto err_rxq_reg;

@@ -952,8 +952,12 @@ static int veth_enable_xdp(struct net_device *dev)
err_reg_mem:
xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
for (i--; i >= 0; i--)
xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
for (i--; i >= 0; i--) {
struct veth_rq *rq = &priv->rq[i];
xdp_rxq_info_unreg(&rq->xdp_rxq);
netif_napi_del(&rq->xdp_napi);
}
return err;
}

@@ -1485,7 +1485,7 @@ static int virtnet_open(struct net_device *dev)
if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i);
err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
if (err < 0)
return err;

@@ -2014,7 +2014,7 @@ static int xennet_create_page_pool(struct netfront_queue *queue)
}
err = xdp_rxq_info_reg(&queue->xdp_rxq, queue->info->netdev,
queue->id);
queue->id, 0);
if (err) {
netdev_err(queue->info->netdev, "xdp_rxq_info_reg failed\n");
goto err_free_pp;
@@ -657,7 +657,7 @@ int __set_page_dirty_buffers(struct page *page)
} while (bh != head);
}
/*
 * Lock out page->mem_cgroup migration to keep PageDirty
 * Lock out page's memcg migration to keep PageDirty
 * synchronized with per-memcg dirty page counters.
 */
lock_page_memcg(page);

@@ -397,7 +397,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock)
unsigned int napi_id = READ_ONCE(ep->napi_id);
if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
BUSY_POLL_BUDGET);
}
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)

@@ -650,7 +650,7 @@ iomap_set_page_dirty(struct page *page)
return !TestSetPageDirty(page);
/*
 * Lock out page->mem_cgroup migration to keep PageDirty
 * Lock out page's memcg migration to keep PageDirty
 * synchronized with per-memcg dirty page counters.
 */
lock_page_memcg(page);

@@ -246,11 +246,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
__ret; \
})
#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL)
#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \
sk->sk_prot->pre_connect)

@@ -434,8 +434,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
@@ -20,6 +20,8 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/capability.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
struct bpf_verifier_env;
struct bpf_verifier_log;

@@ -37,6 +39,7 @@ struct bpf_iter_aux_info;
struct bpf_local_storage;
struct bpf_local_storage_map;
struct kobject;
struct mem_cgroup;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;

@@ -135,11 +138,6 @@ struct bpf_map_ops {
const struct bpf_iter_seq_info *iter_seq_info;
};
struct bpf_map_memory {
u32 pages;
struct user_struct *user;
};
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
 * are also accessed in fast-path (e.g. ops, max_entries).

@@ -160,7 +158,9 @@ struct bpf_map {
u32 btf_key_type_id;
u32 btf_value_type_id;
struct btf *btf;
struct bpf_map_memory memory;
#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg;
#endif
char name[BPF_OBJ_NAME_LEN];
u32 btf_vmlinux_value_type_id;
bool bypass_spec_v1;

@@ -421,7 +421,10 @@ struct bpf_insn_access_aux {
enum bpf_reg_type reg_type;
union {
int ctx_field_size;
u32 btf_id;
struct {
struct btf *btf;
u32 btf_id;
};
};
struct bpf_verifier_log *log; /* for verbose logs */
};

@@ -458,6 +461,7 @@ struct bpf_verifier_ops {
struct bpf_insn *dst,
struct bpf_prog *prog, u32 *target_size);
int (*btf_struct_access)(struct bpf_verifier_log *log,
const struct btf *btf,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype,
u32 *next_btf_id);

@@ -771,6 +775,7 @@ struct bpf_prog_aux {
u32 ctx_arg_info_size;
u32 max_rdonly_access;
u32 max_rdwr_access;
struct btf *attach_btf;
const struct bpf_ctx_arg_aux *ctx_arg_info;
struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */
struct bpf_prog *dst_prog;

@@ -1005,7 +1010,6 @@ struct bpf_event_entry {
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
int bpf_prog_calc_tag(struct bpf_prog *fp);
const char *kernel_type_name(u32 btf_type_id);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void);

@@ -1202,8 +1206,6 @@ void bpf_prog_sub(struct bpf_prog *prog, int i);
void bpf_prog_inc(struct bpf_prog *prog);
struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog);
void bpf_prog_put(struct bpf_prog *prog);
int __bpf_prog_charge(struct user_struct *user, u32 pages);
void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
struct bpf_map **used_maps, u32 len);

@@ -1218,12 +1220,6 @@ void bpf_map_inc_with_uref(struct bpf_map *map);
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size);
void bpf_map_charge_finish(struct bpf_map_memory *mem);
void bpf_map_charge_move(struct bpf_map_memory *dst,
struct bpf_map_memory *src);
void *bpf_map_area_alloc(u64 size, int numa_node);
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
void bpf_map_area_free(void *base);

@@ -1240,6 +1236,34 @@ int generic_map_delete_batch(struct bpf_map *map,
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
#ifdef CONFIG_MEMCG_KMEM
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node);
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags);
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags);
#else
static inline void *
bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
return kmalloc_node(size, flags, node);
}
static inline void *
bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
return kzalloc(size, flags);
}
static inline void __percpu *
bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
gfp_t flags)
{
return __alloc_percpu_gfp(size, align, flags);
}
#endif
extern int sysctl_unprivileged_bpf_disabled;
static inline bool bpf_allow_ptr_leaks(void)

@@ -1430,12 +1454,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info);
int btf_struct_access(struct bpf_verifier_log *log,
int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype,
u32 *next_btf_id);
bool btf_struct_ids_match(struct bpf_verifier_log *log,
int off, u32 id, u32 need_type_id);
const struct btf *btf, u32 id, int off,
const struct btf *need_btf, u32 need_type_id);
int btf_distill_func_proto(struct bpf_verifier_log *log,
struct btf *btf,

@@ -1490,15 +1515,6 @@ bpf_prog_inc_not_zero(struct bpf_prog *prog)
return ERR_PTR(-EOPNOTSUPP);
}
static inline int __bpf_prog_charge(struct user_struct *user, u32 pages)
{
return 0;
}
static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
{
}
static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
const struct bpf_link_ops *ops,
struct bpf_prog *prog)

@@ -1842,6 +1858,7 @@ extern const struct bpf_func_proto bpf_copy_from_user_proto;
extern const struct bpf_func_proto bpf_snprintf_btf_proto;
extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
const struct bpf_func_proto *bpf_tracing_func_proto(
enum bpf_func_id func_id, const struct bpf_prog *prog);
@@ -5,6 +5,7 @@
#define _LINUX_BPF_VERIFIER_H 1
#include <linux/bpf.h> /* for enum bpf_reg_type */
#include <linux/btf.h> /* for struct btf and btf_id() */
#include <linux/filter.h> /* for MAX_BPF_STACK */
#include <linux/tnum.h>

@@ -43,6 +44,8 @@ enum bpf_reg_liveness {
struct bpf_reg_state {
/* Ordering of fields matters. See states_equal() */
enum bpf_reg_type type;
/* Fixed part of pointer offset, pointer types only */
s32 off;
union {
/* valid when type == PTR_TO_PACKET */
int range;

@@ -52,15 +55,20 @@ struct bpf_reg_state {
 */
struct bpf_map *map_ptr;
u32 btf_id; /* for PTR_TO_BTF_ID */
/* for PTR_TO_BTF_ID */
struct {
struct btf *btf;
u32 btf_id;
};
u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */
/* Max size from any of the above. */
unsigned long raw;
struct {
unsigned long raw1;
unsigned long raw2;
} raw;
};
/* Fixed part of pointer offset, pointer types only */
s32 off;
/* For PTR_TO_PACKET, used to find other pointers with the same variable
 * offset, so they can share range knowledge.
 * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we

@@ -311,7 +319,10 @@ struct bpf_insn_aux_data {
struct {
enum bpf_reg_type reg_type; /* type of pseudo_btf_id */
union {
u32 btf_id; /* btf_id for struct typed var */
struct {
struct btf *btf;
u32 btf_id; /* btf_id for struct typed var */
};
u32 mem_size; /* mem_size for non-struct typed var */
};
} btf_var;

@@ -459,9 +470,12 @@ int check_ctx_reg(struct bpf_verifier_env *env,
/* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
u32 btf_id)
struct btf *btf, u32 btf_id)
{
return tgt_prog ? (((u64)tgt_prog->aux->id) << 32 | btf_id) : btf_id;
if (tgt_prog)
return ((u64)tgt_prog->aux->id << 32) | btf_id;
else
return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id;
}
int bpf_check_attach_target(struct bpf_verifier_log *log,
@@ -18,6 +18,7 @@ struct btf_show;
extern const struct file_operations btf_fops;
void btf_get(struct btf *btf);
void btf_put(struct btf *btf);
int btf_new_fd(const union bpf_attr *attr);
struct btf *btf_get_by_fd(int fd);

@@ -88,7 +89,8 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
char *buf, int len, u64 flags);
int btf_get_fd_by_id(u32 id);
u32 btf_id(const struct btf *btf);
u32 btf_obj_id(const struct btf *btf);
bool btf_is_kernel(const struct btf *btf);
bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
const struct btf_member *m,
u32 expected_offset, u32 expected_size);

@@ -206,6 +208,8 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo(
}
#ifdef CONFIG_BPF_SYSCALL
struct bpf_prog;
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
const char *btf_name_by_offset(const struct btf *btf, u32 offset);
struct btf *btf_parse_vmlinux(void);

@@ -29,6 +29,7 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
enum kernel_read_file_id id);
extern void ima_post_path_mknod(struct dentry *dentry);
extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size);
extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
#ifdef CONFIG_IMA_KEXEC

@@ -115,6 +116,11 @@ static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size)
return -EOPNOTSUPP;
}
static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size)
{
return -EOPNOTSUPP;
}
static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {}
#endif /* CONFIG_IMA */
@@ -343,6 +343,175 @@
extern struct mem_cgroup *root_mem_cgroup;
enum page_memcg_data_flags {
/* page->memcg_data is a pointer to an objcgs vector */
MEMCG_DATA_OBJCGS = (1UL << 0),
/* page has been accounted as a non-slab kernel page */
MEMCG_DATA_KMEM = (1UL << 1),
/* the next bit after the last actual flag */
__NR_MEMCG_DATA_FLAGS = (1UL << 2),
};
#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
/*
 * page_memcg - get the memory cgroup associated with a page
 * @page: a pointer to the page struct
 *
 * Returns a pointer to the memory cgroup associated with the page,
 * or NULL. This function assumes that the page is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of pages, e.g. slab pages or ex-slab pages.
 *
 * Any of the following ensures page and memcg binding stability:
 * - the page lock
 * - LRU isolation
 * - lock_page_memcg()
 * - exclusive reference
 */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
unsigned long memcg_data = page->memcg_data;
VM_BUG_ON_PAGE(PageSlab(page), page);
VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
/*
 * page_memcg_rcu - locklessly get the memory cgroup associated with a page
 * @page: a pointer to the page struct
 *
 * Returns a pointer to the memory cgroup associated with the page,
 * or NULL. This function assumes that the page is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of pages, e.g. slab pages or ex-slab pages.
 */
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
VM_BUG_ON_PAGE(PageSlab(page), page);
WARN_ON_ONCE(!rcu_read_lock_held());
return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) &
~MEMCG_DATA_FLAGS_MASK);
}
/*
 * page_memcg_check - get the memory cgroup associated with a page
 * @page: a pointer to the page struct
 *
 * Returns a pointer to the memory cgroup associated with the page,
 * or NULL. This function unlike page_memcg() can take any page
 * as an argument. It has to be used in cases when it's not known if a page
 * has an associated memory cgroup pointer or an object cgroups vector.
 *
 * Any of the following ensures page and memcg binding stability:
 * - the page lock
 * - LRU isolation
 * - lock_page_memcg()
 * - exclusive reference
 */
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
/*
 * Because page->memcg_data might be changed asynchronously
 * for slab pages, READ_ONCE() should be used here.
 */
unsigned long memcg_data = READ_ONCE(page->memcg_data);
if (memcg_data & MEMCG_DATA_OBJCGS)
return NULL;
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
/*
 * PageMemcgKmem - check if the page has MemcgKmem flag set
 * @page: a pointer to the page struct
 *
 * Checks if the page has MemcgKmem flag set. The caller must ensure that
 * the page has an associated memory cgroup. It's not safe to call this function
 * against some types of pages, e.g. slab pages.
 */
static inline bool PageMemcgKmem(struct page *page)
{
VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page);
return page->memcg_data & MEMCG_DATA_KMEM;
}
#ifdef CONFIG_MEMCG_KMEM
/*
 * page_objcgs - get the object cgroups vector associated with a page
 * @page: a pointer to the page struct
 *
 * Returns a pointer to the object cgroups vector associated with the page,
 * or NULL. This function assumes that the page is known to have an
 * associated object cgroups vector. It's not safe to call this function
 * against pages, which might have an associated memory cgroup: e.g.
 * kernel stack pages.
 */
static inline struct obj_cgroup **page_objcgs(struct page *page)
{
unsigned long memcg_data = READ_ONCE(page->memcg_data);
VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), page);
VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
/*
 * page_objcgs_check - get the object cgroups vector associated with a page
 * @page: a pointer to the page struct
 *
 * Returns a pointer to the object cgroups vector associated with the page,
 * or NULL. This function is safe to use if the page can be directly associated
 * with a memory cgroup.
 */
static inline struct obj_cgroup **page_objcgs_check(struct page *page)
{
unsigned long memcg_data = READ_ONCE(page->memcg_data);
if (!memcg_data || !(memcg_data & MEMCG_DATA_OBJCGS))
return NULL;
VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
/*
 * set_page_objcgs - associate a page with a object cgroups vector
 * @page: a pointer to the page struct
 * @objcgs: a pointer to the object cgroups vector
 *
 * Atomically associates a page with a vector of object cgroups.
 */
static inline bool set_page_objcgs(struct page *page,
struct obj_cgroup **objcgs)
{
return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs |
MEMCG_DATA_OBJCGS);
}
#else
static inline struct obj_cgroup **page_objcgs(struct page *page)
{
return NULL;
}
static inline struct obj_cgroup **page_objcgs_check(struct page *page)
{
return NULL;
}
static inline bool set_page_objcgs(struct page *page,
struct obj_cgroup **objcgs)
{
return true;
}
#endif
static __always_inline bool memcg_stat_item_in_bytes(int idx)
{
if (idx == MEMCG_PERCPU_B)
@@ -743,15 +912,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
static inline void __mod_memcg_page_state(struct page *page,
int idx, int val)
{
if (page->mem_cgroup)
__mod_memcg_state(page->mem_cgroup, idx, val);
struct mem_cgroup *memcg = page_memcg(page);
if (memcg)
__mod_memcg_state(memcg, idx, val);
}
static inline void mod_memcg_page_state(struct page *page,
int idx, int val)
{
if (page->mem_cgroup)
mod_memcg_state(page->mem_cgroup, idx, val);
struct mem_cgroup *memcg = page_memcg(page);
if (memcg)
mod_memcg_state(memcg, idx, val);
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,

@@ -834,16 +1007,17 @@ static inline void __mod_lruvec_page_state(struct page *page,
enum node_stat_item idx, int val)
{
struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg = page_memcg(head);
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!head->mem_cgroup) {
if (!memcg) {
__mod_node_page_state(pgdat, idx, val);
return;
}
lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}

@@ -878,8 +1052,10 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
static inline void count_memcg_page_event(struct page *page,
enum vm_event_item idx)
{
if (page->mem_cgroup)
count_memcg_events(page->mem_cgroup, idx, 1);
struct mem_cgroup *memcg = page_memcg(page);
if (memcg)
count_memcg_events(memcg, idx, 1);
}
static inline void count_memcg_event_mm(struct mm_struct *mm,

@@ -948,6 +1124,27 @@ void mem_cgroup_split_huge_fixup(struct page *head);
struct mem_cgroup;
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return NULL;
}
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return NULL;
}
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
return NULL;
}
static inline bool PageMemcgKmem(struct page *page)
{
return false;
}
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return true;

@@ -1437,7 +1634,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page,
if (mem_cgroup_disabled())
return;
if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
if (unlikely(&page_memcg(page)->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(page, wb);
}
@@ -1484,28 +1484,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
#endif
}
#ifdef CONFIG_MEMCG
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return page->mem_cgroup;
}
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return READ_ONCE(page->mem_cgroup);
}
#else
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return NULL;
}
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return NULL;
}
#endif
/*
 * Some inline functions in vmstat.h depend on page_zone()
 */

@@ -199,10 +199,7 @@ struct page {
atomic_t _refcount;
#ifdef CONFIG_MEMCG
union {
struct mem_cgroup *mem_cgroup;
struct obj_cgroup **obj_cgroups;
};
unsigned long memcg_data;
#endif
/*
@@ -350,23 +350,25 @@ struct napi_struct {
};
enum {
NAPI_STATE_SCHED, /* Poll is scheduled */
NAPI_STATE_MISSED, /* reschedule a napi */
NAPI_STATE_DISABLE, /* Disable pending */
NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
NAPI_STATE_LISTED, /* NAPI added to system lists */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
NAPI_STATE_SCHED, /* Poll is scheduled */
NAPI_STATE_MISSED, /* reschedule a napi */
NAPI_STATE_DISABLE, /* Disable pending */
NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
NAPI_STATE_LISTED, /* NAPI added to system lists */
NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/
};
enum {
NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL),
};
enum gro_result {

@@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n)
return test_bit(NAPI_STATE_DISABLE, &n->state);
}
static inline bool napi_prefer_busy_poll(struct napi_struct *n)
{
return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
}
bool napi_schedule_prep(struct napi_struct *n);
/**
@@ -715,9 +715,8 @@ PAGEFLAG_FALSE(DoubleMap)
#define PAGE_MAPCOUNT_RESERVE -128
#define PG_buddy 0x00000080
#define PG_offline 0x00000100
#define PG_kmemcg 0x00000200
#define PG_table 0x00000400
#define PG_guard 0x00000800
#define PG_table 0x00000200
#define PG_guard 0x00000400
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)

@@ -768,12 +767,6 @@ PAGE_TYPE_OPS(Buddy, buddy)
 */
PAGE_TYPE_OPS(Offline, offline)
/*
 * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
 * pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
 */
PAGE_TYPE_OPS(Kmemcg, kmemcg)
/*
 * Marks pages in use as page tables.
 */
@@ -23,6 +23,8 @@
 */
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))
#define BUSY_POLL_BUDGET 8
#ifdef CONFIG_NET_RX_BUSY_POLL
struct napi_struct;

@@ -43,7 +45,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time);
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg);
void *loop_end_arg, bool prefer_busy_poll, u16 budget);
#else /* CONFIG_NET_RX_BUSY_POLL */
static inline unsigned long net_busy_loop_on(void)

@@ -105,7 +107,9 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
if (napi_id >= MIN_NAPI_ID)
napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk,
READ_ONCE(sk->sk_prefer_busy_poll),
READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET);
#endif
}

@@ -131,13 +135,28 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
sk_rx_queue_set(sk, skb);
}
static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
if (!READ_ONCE(sk->sk_napi_id))
WRITE_ONCE(sk->sk_napi_id, napi_id);
#endif
}
/* variant used for unconnected sockets */
static inline void sk_mark_napi_id_once(struct sock *sk,
const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
if (!READ_ONCE(sk->sk_napi_id))
WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
__sk_mark_napi_id_once(sk, skb->napi_id);
#endif
}
static inline void sk_mark_napi_id_once_xdp(struct sock *sk,
const struct xdp_buff *xdp)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
__sk_mark_napi_id_once(sk, xdp->rxq->napi_id);
#endif
}
@@ -301,6 +301,8 @@ struct bpf_local_storage;
 * @sk_ack_backlog: current listen backlog
 * @sk_max_ack_backlog: listen backlog set in listen()
 * @sk_uid: user id of owner
 * @sk_prefer_busy_poll: prefer busypolling over softirq processing
 * @sk_busy_poll_budget: napi processing budget when busypolling
 * @sk_priority: %SO_PRIORITY setting
 * @sk_type: socket type (%SOCK_STREAM, etc)
 * @sk_protocol: which protocol this socket belongs in this network family

@@ -479,6 +481,10 @@ struct sock {
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
#ifdef CONFIG_NET_RX_BUSY_POLL
u8 sk_prefer_busy_poll;
u16 sk_busy_poll_budget;
#endif
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;

@@ -410,6 +410,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req);
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
int tcp_set_window_clamp(struct sock *sk, int val);
void tcp_data_ready(struct sock *sk);
#ifdef CONFIG_MMU
int tcp_mmap(struct file *file, struct socket *sock,

@@ -59,6 +59,7 @@ struct xdp_rxq_info {
u32 queue_index;
u32 reg_state;
struct xdp_mem_info mem;
unsigned int napi_id;
} ____cacheline_aligned; /* perf critical, avoid false-sharing */
struct xdp_txq_info {

@@ -226,7 +227,7 @@ static inline void xdp_release_frame(struct xdp_frame *xdpf)
}
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
struct net_device *dev, u32 queue_index);
struct net_device *dev, u32 queue_index, unsigned int napi_id);
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);

@@ -13,6 +13,7 @@
void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max);
void xsk_tx_release(struct xsk_buff_pool *pool);
struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
u16 queue_id);

@@ -128,6 +129,12 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool,
return false;
}
static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc,
u32 max)
{
return 0;
}
static inline void xsk_tx_release(struct xsk_buff_pool *pool)
{
}

@@ -257,7 +257,7 @@ TRACE_EVENT(track_foreign_dirty,
__entry->ino = inode ? inode->i_ino : 0;
__entry->memcg_id = wb->memcg_css->id;
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
__entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup);
__entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup);
),
TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
@@ -119,6 +119,9 @@
#define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))

@@ -557,7 +557,12 @@ union bpf_attr {
__aligned_u64 line_info; /* line info */
__u32 line_info_cnt; /* number of bpf_line_info records */
__u32 attach_btf_id; /* in-kernel BTF type id to attach to */
__u32 attach_prog_fd; /* 0 to attach to vmlinux */
union {
/* valid prog_fd to attach to bpf prog */
__u32 attach_prog_fd;
/* or valid module BTF object fd or 0 to attach to vmlinux */
__u32 attach_btf_obj_fd;
};
};
struct { /* anonymous struct used by BPF_OBJ_* commands */

@@ -3787,6 +3792,36 @@ union bpf_attr {
 * *ARG_PTR_TO_BTF_ID* of type *task_struct*.
 * Return
 * Pointer to the current task.
 *
 * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags)
 * Description
 * Set or clear certain options on *bprm*:
 *
 * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit
 * which sets the **AT_SECURE** auxv for glibc. The bit
 * is cleared if the flag is not specified.
 * Return
 * **-EINVAL** if invalid *flags* are passed, zero otherwise.
 *
 * u64 bpf_ktime_get_coarse_ns(void)
 * Description
 * Return a coarse-grained version of the time elapsed since
 * system boot, in nanoseconds. Does not include time the system
 * was suspended.
 *
 * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**)
 * Return
 * Current *ktime*.
 *
 * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size)
 * Description
 * Returns the stored IMA hash of the *inode* (if it's avaialable).
 * If the hash is larger than *size*, then only *size*
 * bytes will be copied to *dst*
 * Return
 * The **hash_algo** is returned on success,
 * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if
 * invalid arguments are passed.
 */
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \

@@ -3948,6 +3983,9 @@ union bpf_attr {
FN(task_storage_get), \
FN(task_storage_delete), \
FN(get_current_task_btf), \
FN(bprm_opts_set), \
FN(ktime_get_coarse_ns), \
FN(ima_inode_hash), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper

@@ -4119,6 +4157,11 @@ enum bpf_lwt_encap_mode {
BPF_LWT_ENCAP_IP,
};
/* Flags for bpf_bprm_opts_set helper */
enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};
#define __bpf_md_ptr(type, name) \
union { \
type name; \
@@ -34,8 +34,8 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
int i;
for (i = 0; i < array->map.max_entries; i++) {
ptr = __alloc_percpu_gfp(array->elem_size, 8,
GFP_USER | __GFP_NOWARN);
ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
GFP_USER | __GFP_NOWARN);
if (!ptr) {
bpf_array_free_percpu(array);
return -ENOMEM;

@@ -81,11 +81,10 @@ int array_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int ret, numa_node = bpf_map_attr_numa_node(attr);
int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
bool bypass_spec_v1 = bpf_bypass_spec_v1();
u64 cost, array_size, mask64;
struct bpf_map_memory mem;
u64 array_size, mask64;
struct bpf_array *array;
elem_size = round_up(attr->value_size, 8);

@@ -126,44 +125,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
}
}
/* make sure there is no u32 overflow later in round_up() */
cost = array_size;
if (percpu)
cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
ret = bpf_map_charge_init(&mem, cost);
if (ret < 0)
return ERR_PTR(ret);
/* allocate all map elements and zero-initialize them */
if (attr->map_flags & BPF_F_MMAPABLE) {
void *data;
/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
data = bpf_map_area_mmapable_alloc(array_size, numa_node);
if (!data) {
bpf_map_charge_finish(&mem);
if (!data)
return ERR_PTR(-ENOMEM);
}
array = data + PAGE_ALIGN(sizeof(struct bpf_array))
- offsetof(struct bpf_array, value);
} else {
array = bpf_map_area_alloc(array_size, numa_node);
}
if (!array) {
bpf_map_charge_finish(&mem);
if (!array)
return ERR_PTR(-ENOMEM);
}
array->index_mask = index_mask;
array->map.bypass_spec_v1 = bypass_spec_v1;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
bpf_map_charge_move(&array->map.memory, &mem);
array->elem_size = elem_size;
if (percpu && bpf_array_alloc_percpu(array)) {
bpf_map_charge_finish(&array->map.memory);
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}

@@ -1018,7 +1002,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
struct bpf_array_aux *aux;
struct bpf_map *map;
aux = kzalloc(sizeof(*aux), GFP_KERNEL);
aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT);
if (!aux)
return ERR_PTR(-ENOMEM);

@@ -67,7 +67,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (charge_mem && mem_charge(smap, owner, smap->elem_size))
return NULL;
selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
GFP_ATOMIC | __GFP_NOWARN);
if (selem) {
if (value)
memcpy(SDATA(selem)->data, value, smap->map.value_size);

@@ -264,7 +265,8 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN);
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
GFP_ATOMIC | __GFP_NOWARN);
if (!storage) {
err = -ENOMEM;
goto uncharge;

@@ -543,10 +545,8 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
struct bpf_local_storage_map *smap;
|
||||
unsigned int i;
|
||||
u32 nbuckets;
|
||||
u64 cost;
|
||||
int ret;
|
||||
|
||||
smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
|
||||
smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
|
||||
if (!smap)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
bpf_map_init_from_attr(&smap->map, attr);
|
||||
@ -555,18 +555,10 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
|
||||
/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
|
||||
nbuckets = max_t(u32, 2, nbuckets);
|
||||
smap->bucket_log = ilog2(nbuckets);
|
||||
cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
|
||||
|
||||
ret = bpf_map_charge_init(&smap->map.memory, cost);
|
||||
if (ret < 0) {
|
||||
kfree(smap);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
|
||||
GFP_USER | __GFP_NOWARN);
|
||||
GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
|
||||
if (!smap->buckets) {
|
||||
bpf_map_charge_finish(&smap->map.memory);
|
||||
kfree(smap);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
@ -7,6 +7,7 @@
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/binfmts.h>
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
@ -14,6 +15,7 @@
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
#include <linux/ima.h>

/* For every LSM hook that allows attachment of BPF programs, declare a nop
 * function where a BPF program can be attached.
@ -51,6 +53,52 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
	return 0;
}

/* Mask for all the currently supported BPRM option flags */
#define BPF_F_BRPM_OPTS_MASK	BPF_F_BPRM_SECUREEXEC

BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
{
	if (flags & ~BPF_F_BRPM_OPTS_MASK)
		return -EINVAL;

	bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC);
	return 0;
}

BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)

const static struct bpf_func_proto bpf_bprm_opts_set_proto = {
	.func		= bpf_bprm_opts_set,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_bprm_opts_set_btf_ids[0],
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size)
{
	return ima_inode_hash(inode, dst, size);
}

static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
{
	return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
}

BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)

const static struct bpf_func_proto bpf_ima_inode_hash_proto = {
	.func		= bpf_ima_inode_hash,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_ima_inode_hash_btf_ids[0],
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.allowed	= bpf_ima_inode_hash_allowed,
};

static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@ -71,6 +119,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_task_storage_get_proto;
	case BPF_FUNC_task_storage_delete:
		return &bpf_task_storage_delete_proto;
	case BPF_FUNC_bprm_opts_set:
		return &bpf_bprm_opts_set_proto;
	case BPF_FUNC_ima_inode_hash:
		return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
	default:
		return tracing_prog_func_proto(func_id, prog);
	}

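Because ima_inode_hash() may sleep, the proto table above only hands bpf_ima_inode_hash out to sleepable programs. A minimal sketch of such a program, assuming a vmlinux.h/bpf_helper_defs.h that already know the helper; the hook, the pid filter and the 64-byte buffer are illustrative choices.

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

u32 monitored_pid;		/* set from user space; illustrative filter */
u8 ima_hash[64];		/* large enough for the common hash algorithms */
long ima_hash_algo = -1;	/* hash_algo on success, negative errno otherwise */

SEC("lsm.s/bprm_committed_creds")	/* sleepable hook, hence "lsm.s" */
int BPF_PROG(record_ima_hash, struct linux_binprm *bprm)
{
	u32 pid = bpf_get_current_pid_tgid() >> 32;

	if (pid == monitored_pid)
		ima_hash_algo = bpf_ima_inode_hash(bprm->file->f_inode,
						   ima_hash, sizeof(ima_hash));
	return 0;
}
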
@ -548,12 +548,10 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
|
||||
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
|
||||
{
|
||||
const struct bpf_struct_ops *st_ops;
|
||||
size_t map_total_size, st_map_size;
|
||||
size_t st_map_size;
|
||||
struct bpf_struct_ops_map *st_map;
|
||||
const struct btf_type *t, *vt;
|
||||
struct bpf_map_memory mem;
|
||||
struct bpf_map *map;
|
||||
int err;
|
||||
|
||||
if (!bpf_capable())
|
||||
return ERR_PTR(-EPERM);
|
||||
@ -573,20 +571,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
|
||||
* struct bpf_struct_ops_tcp_congestions_ops
|
||||
*/
|
||||
(vt->size - sizeof(struct bpf_struct_ops_value));
|
||||
map_total_size = st_map_size +
|
||||
/* uvalue */
|
||||
sizeof(vt->size) +
|
||||
/* struct bpf_progs **progs */
|
||||
btf_type_vlen(t) * sizeof(struct bpf_prog *);
|
||||
err = bpf_map_charge_init(&mem, map_total_size);
|
||||
if (err < 0)
|
||||
return ERR_PTR(err);
|
||||
|
||||
st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
|
||||
if (!st_map) {
|
||||
bpf_map_charge_finish(&mem);
|
||||
if (!st_map)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
st_map->st_ops = st_ops;
|
||||
map = &st_map->map;
|
||||
|
||||
@ -597,14 +586,12 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
|
||||
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
|
||||
if (!st_map->uvalue || !st_map->progs || !st_map->image) {
|
||||
bpf_struct_ops_map_free(map);
|
||||
bpf_map_charge_finish(&mem);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
mutex_init(&st_map->lock);
|
||||
set_vm_flush_reset_perms(st_map->image);
|
||||
bpf_map_init_from_attr(map, attr);
|
||||
bpf_map_charge_move(&map->memory, &mem);
|
||||
|
||||
return map;
|
||||
}
|
||||
|
@ -1524,6 +1524,11 @@ static void btf_free_rcu(struct rcu_head *rcu)
|
||||
btf_free(btf);
|
||||
}
|
||||
|
||||
void btf_get(struct btf *btf)
|
||||
{
|
||||
refcount_inc(&btf->refcnt);
|
||||
}
|
||||
|
||||
void btf_put(struct btf *btf)
|
||||
{
|
||||
if (btf && refcount_dec_and_test(&btf->refcnt)) {
|
||||
@ -4555,11 +4560,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
|
||||
{
|
||||
struct bpf_prog *tgt_prog = prog->aux->dst_prog;
|
||||
|
||||
if (tgt_prog) {
|
||||
if (tgt_prog)
|
||||
return tgt_prog->aux->btf;
|
||||
} else {
|
||||
return btf_vmlinux;
|
||||
}
|
||||
else
|
||||
return prog->aux->attach_btf;
|
||||
}
|
||||
|
||||
static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
|
||||
@ -4700,6 +4704,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
|
||||
|
||||
if (ctx_arg_info->offset == off) {
|
||||
info->reg_type = ctx_arg_info->reg_type;
|
||||
info->btf = btf_vmlinux;
|
||||
info->btf_id = ctx_arg_info->btf_id;
|
||||
return true;
|
||||
}
|
||||
@ -4716,6 +4721,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
|
||||
|
||||
ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg);
|
||||
if (ret > 0) {
|
||||
info->btf = btf_vmlinux;
|
||||
info->btf_id = ret;
|
||||
return true;
|
||||
} else {
|
||||
@ -4723,6 +4729,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
|
||||
}
|
||||
}
|
||||
|
||||
info->btf = btf;
|
||||
info->btf_id = t->type;
|
||||
t = btf_type_by_id(btf, t->type);
|
||||
/* skip modifiers */
|
||||
@ -4749,7 +4756,7 @@ enum bpf_struct_walk_result {
|
||||
WALK_STRUCT,
|
||||
};
|
||||
|
||||
static int btf_struct_walk(struct bpf_verifier_log *log,
|
||||
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
|
||||
const struct btf_type *t, int off, int size,
|
||||
u32 *next_btf_id)
|
||||
{
|
||||
@ -4760,7 +4767,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log,
|
||||
u32 vlen, elem_id, mid;
|
||||
|
||||
again:
|
||||
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
|
||||
tname = __btf_name_by_offset(btf, t->name_off);
|
||||
if (!btf_type_is_struct(t)) {
|
||||
bpf_log(log, "Type '%s' is not a struct\n", tname);
|
||||
return -EINVAL;
|
||||
@ -4777,7 +4784,7 @@ again:
|
||||
goto error;
|
||||
|
||||
member = btf_type_member(t) + vlen - 1;
|
||||
mtype = btf_type_skip_modifiers(btf_vmlinux, member->type,
|
||||
mtype = btf_type_skip_modifiers(btf, member->type,
|
||||
NULL);
|
||||
if (!btf_type_is_array(mtype))
|
||||
goto error;
|
||||
@ -4793,7 +4800,7 @@ again:
|
||||
/* Only allow structure for now, can be relaxed for
|
||||
* other types later.
|
||||
*/
|
||||
t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type,
|
||||
t = btf_type_skip_modifiers(btf, array_elem->type,
|
||||
NULL);
|
||||
if (!btf_type_is_struct(t))
|
||||
goto error;
|
||||
@ -4851,10 +4858,10 @@ error:
|
||||
|
||||
/* type of the field */
|
||||
mid = member->type;
|
||||
mtype = btf_type_by_id(btf_vmlinux, member->type);
|
||||
mname = __btf_name_by_offset(btf_vmlinux, member->name_off);
|
||||
mtype = btf_type_by_id(btf, member->type);
|
||||
mname = __btf_name_by_offset(btf, member->name_off);
|
||||
|
||||
mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize,
|
||||
mtype = __btf_resolve_size(btf, mtype, &msize,
|
||||
&elem_type, &elem_id, &total_nelems,
|
||||
&mid);
|
||||
if (IS_ERR(mtype)) {
|
||||
@ -4949,7 +4956,7 @@ error:
|
||||
mname, moff, tname, off, size);
|
||||
return -EACCES;
|
||||
}
|
||||
stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id);
|
||||
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
|
||||
if (btf_type_is_struct(stype)) {
|
||||
*next_btf_id = id;
|
||||
return WALK_PTR;
|
||||
@ -4975,7 +4982,7 @@ error:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
int btf_struct_access(struct bpf_verifier_log *log,
|
||||
int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
|
||||
const struct btf_type *t, int off, int size,
|
||||
enum bpf_access_type atype __maybe_unused,
|
||||
u32 *next_btf_id)
|
||||
@ -4984,7 +4991,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
|
||||
u32 id;
|
||||
|
||||
do {
|
||||
err = btf_struct_walk(log, t, off, size, &id);
|
||||
err = btf_struct_walk(log, btf, t, off, size, &id);
|
||||
|
||||
switch (err) {
|
||||
case WALK_PTR:
|
||||
@ -5000,7 +5007,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
|
||||
* by diving in it. At this point the offset is
|
||||
* aligned with the new type, so set it to 0.
|
||||
*/
|
||||
t = btf_type_by_id(btf_vmlinux, id);
|
||||
t = btf_type_by_id(btf, id);
|
||||
off = 0;
|
||||
break;
|
||||
default:
|
||||
@ -5016,21 +5023,37 @@ int btf_struct_access(struct bpf_verifier_log *log,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Check that two BTF types, each specified as an BTF object + id, are exactly
|
||||
* the same. Trivial ID check is not enough due to module BTFs, because we can
|
||||
* end up with two different module BTFs, but IDs point to the common type in
|
||||
* vmlinux BTF.
|
||||
*/
|
||||
static bool btf_types_are_same(const struct btf *btf1, u32 id1,
|
||||
const struct btf *btf2, u32 id2)
|
||||
{
|
||||
if (id1 != id2)
|
||||
return false;
|
||||
if (btf1 == btf2)
|
||||
return true;
|
||||
return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2);
|
||||
}
|
||||
|
||||
bool btf_struct_ids_match(struct bpf_verifier_log *log,
|
||||
int off, u32 id, u32 need_type_id)
|
||||
const struct btf *btf, u32 id, int off,
|
||||
const struct btf *need_btf, u32 need_type_id)
|
||||
{
|
||||
const struct btf_type *type;
|
||||
int err;
|
||||
|
||||
/* Are we already done? */
|
||||
if (need_type_id == id && off == 0)
|
||||
if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
|
||||
return true;
|
||||
|
||||
again:
|
||||
type = btf_type_by_id(btf_vmlinux, id);
|
||||
type = btf_type_by_id(btf, id);
|
||||
if (!type)
|
||||
return false;
|
||||
err = btf_struct_walk(log, type, off, 1, &id);
|
||||
err = btf_struct_walk(log, btf, type, off, 1, &id);
|
||||
if (err != WALK_STRUCT)
|
||||
return false;
|
||||
|
||||
@ -5039,7 +5062,7 @@ again:
|
||||
* continue the search with offset 0 in the new
|
||||
* type.
|
||||
*/
|
||||
if (need_type_id != id) {
|
||||
if (!btf_types_are_same(btf, id, need_btf, need_type_id)) {
|
||||
off = 0;
|
||||
goto again;
|
||||
}
|
||||
@ -5710,11 +5733,16 @@ int btf_get_fd_by_id(u32 id)
|
||||
return fd;
|
||||
}
|
||||
|
||||
u32 btf_id(const struct btf *btf)
|
||||
u32 btf_obj_id(const struct btf *btf)
|
||||
{
|
||||
return btf->id;
|
||||
}
|
||||
|
||||
bool btf_is_kernel(const struct btf *btf)
|
||||
{
|
||||
return btf->kernel_btf;
|
||||
}
|
||||
|
||||
static int btf_id_cmp_func(const void *a, const void *b)
|
||||
{
|
||||
const int *pa = a, *pb = b;
|
||||
|
@ -77,7 +77,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
|
||||
|
||||
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
|
||||
{
|
||||
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
|
||||
gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
|
||||
struct bpf_prog_aux *aux;
|
||||
struct bpf_prog *fp;
|
||||
|
||||
@ -86,7 +86,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
|
||||
if (fp == NULL)
|
||||
return NULL;
|
||||
|
||||
aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
|
||||
aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
|
||||
if (aux == NULL) {
|
||||
vfree(fp);
|
||||
return NULL;
|
||||
@ -106,7 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
|
||||
|
||||
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
|
||||
{
|
||||
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
|
||||
gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
|
||||
struct bpf_prog *prog;
|
||||
int cpu;
|
||||
|
||||
@ -138,7 +138,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
|
||||
|
||||
prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
|
||||
sizeof(*prog->aux->jited_linfo),
|
||||
GFP_KERNEL | __GFP_NOWARN);
|
||||
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
|
||||
if (!prog->aux->jited_linfo)
|
||||
return -ENOMEM;
|
||||
|
||||
@ -219,25 +219,17 @@ void bpf_prog_free_linfo(struct bpf_prog *prog)
|
||||
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
|
||||
gfp_t gfp_extra_flags)
|
||||
{
|
||||
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
|
||||
gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
|
||||
struct bpf_prog *fp;
|
||||
u32 pages, delta;
|
||||
int ret;
|
||||
u32 pages;
|
||||
|
||||
size = round_up(size, PAGE_SIZE);
|
||||
pages = size / PAGE_SIZE;
|
||||
if (pages <= fp_old->pages)
|
||||
return fp_old;
|
||||
|
||||
delta = pages - fp_old->pages;
|
||||
ret = __bpf_prog_charge(fp_old->aux->user, delta);
|
||||
if (ret)
|
||||
return NULL;
|
||||
|
||||
fp = __vmalloc(size, gfp_flags);
|
||||
if (fp == NULL) {
|
||||
__bpf_prog_uncharge(fp_old->aux->user, delta);
|
||||
} else {
|
||||
if (fp) {
|
||||
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
|
||||
fp->pages = pages;
|
||||
fp->aux->prog = fp;
|
||||
@ -2211,6 +2203,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
|
||||
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
|
||||
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
|
||||
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
|
||||
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
|
||||
|
||||
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
|
||||
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
|
||||
|
@ -84,8 +84,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
|
||||
u32 value_size = attr->value_size;
|
||||
struct bpf_cpu_map *cmap;
|
||||
int err = -ENOMEM;
|
||||
u64 cost;
|
||||
int ret;
|
||||
|
||||
if (!bpf_capable())
|
||||
return ERR_PTR(-EPERM);
|
||||
@ -97,7 +95,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
|
||||
attr->map_flags & ~BPF_F_NUMA_NODE)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
cmap = kzalloc(sizeof(*cmap), GFP_USER);
|
||||
cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);
|
||||
if (!cmap)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
@ -109,26 +107,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
|
||||
goto free_cmap;
|
||||
}
|
||||
|
||||
/* make sure page count doesn't overflow */
|
||||
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
|
||||
|
||||
/* Notice returns -EPERM on if map size is larger than memlock limit */
|
||||
ret = bpf_map_charge_init(&cmap->map.memory, cost);
|
||||
if (ret) {
|
||||
err = ret;
|
||||
goto free_cmap;
|
||||
}
|
||||
|
||||
/* Alloc array for possible remote "destination" CPUs */
|
||||
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
|
||||
sizeof(struct bpf_cpu_map_entry *),
|
||||
cmap->map.numa_node);
|
||||
if (!cmap->cpu_map)
|
||||
goto free_charge;
|
||||
goto free_cmap;
|
||||
|
||||
return &cmap->map;
|
||||
free_charge:
|
||||
bpf_map_charge_finish(&cmap->map.memory);
|
||||
free_cmap:
|
||||
kfree(cmap);
|
||||
return ERR_PTR(err);
|
||||
@ -412,7 +398,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
|
||||
}
|
||||
|
||||
static struct bpf_cpu_map_entry *
|
||||
__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
|
||||
__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
|
||||
u32 cpu)
|
||||
{
|
||||
int numa, err, i, fd = value->bpf_prog.fd;
|
||||
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
|
||||
@ -422,13 +409,13 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
|
||||
/* Have map->numa_node, but choose node of redirect target CPU */
|
||||
numa = cpu_to_node(cpu);
|
||||
|
||||
rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
|
||||
rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
|
||||
if (!rcpu)
|
||||
return NULL;
|
||||
|
||||
/* Alloc percpu bulkq */
|
||||
rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
|
||||
sizeof(void *), gfp);
|
||||
rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
|
||||
sizeof(void *), gfp);
|
||||
if (!rcpu->bulkq)
|
||||
goto free_rcu;
|
||||
|
||||
@ -438,7 +425,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
|
||||
}
|
||||
|
||||
/* Alloc queue */
|
||||
rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
|
||||
rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp,
|
||||
numa);
|
||||
if (!rcpu->queue)
|
||||
goto free_bulkq;
|
||||
|
||||
@ -447,7 +435,7 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
|
||||
goto free_queue;
|
||||
|
||||
rcpu->cpu = cpu;
|
||||
rcpu->map_id = map_id;
|
||||
rcpu->map_id = map->id;
|
||||
rcpu->value.qsize = value->qsize;
|
||||
|
||||
if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
|
||||
@ -455,7 +443,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
|
||||
|
||||
/* Setup kthread */
|
||||
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
|
||||
"cpumap/%d/map:%d", cpu, map_id);
|
||||
"cpumap/%d/map:%d", cpu,
|
||||
map->id);
|
||||
if (IS_ERR(rcpu->kthread))
|
||||
goto free_prog;
|
||||
|
||||
@ -571,7 +560,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
|
||||
rcpu = NULL; /* Same as deleting */
|
||||
} else {
|
||||
/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
|
||||
rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id);
|
||||
rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
|
||||
if (!rcpu)
|
||||
return -ENOMEM;
|
||||
rcpu->cmap = cmap;
|
||||
|
@ -109,8 +109,6 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
|
||||
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
|
||||
{
|
||||
u32 valsize = attr->value_size;
|
||||
u64 cost = 0;
|
||||
int err;
|
||||
|
||||
/* check sanity of attributes. 2 value sizes supported:
|
||||
* 4 bytes: ifindex
|
||||
@ -135,21 +133,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
|
||||
|
||||
if (!dtab->n_buckets) /* Overflow check */
|
||||
return -EINVAL;
|
||||
cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
|
||||
} else {
|
||||
cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
|
||||
}
|
||||
|
||||
/* if map size is larger than memlock limit, reject it */
|
||||
err = bpf_map_charge_init(&dtab->map.memory, cost);
|
||||
if (err)
|
||||
return -EINVAL;
|
||||
|
||||
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
|
||||
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
|
||||
dtab->map.numa_node);
|
||||
if (!dtab->dev_index_head)
|
||||
goto free_charge;
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock_init(&dtab->index_lock);
|
||||
} else {
|
||||
@ -157,14 +147,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
|
||||
sizeof(struct bpf_dtab_netdev *),
|
||||
dtab->map.numa_node);
|
||||
if (!dtab->netdev_map)
|
||||
goto free_charge;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
free_charge:
|
||||
bpf_map_charge_finish(&dtab->map.memory);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
|
||||
@ -175,7 +161,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
|
||||
if (!capable(CAP_NET_ADMIN))
|
||||
return ERR_PTR(-EPERM);
|
||||
|
||||
dtab = kzalloc(sizeof(*dtab), GFP_USER);
|
||||
dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
|
||||
if (!dtab)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
@ -602,8 +588,9 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
|
||||
struct bpf_prog *prog = NULL;
|
||||
struct bpf_dtab_netdev *dev;
|
||||
|
||||
dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
|
||||
dtab->map.numa_node);
|
||||
dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
|
||||
GFP_ATOMIC | __GFP_NOWARN,
|
||||
dtab->map.numa_node);
|
||||
if (!dev)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
|
@ -292,7 +292,8 @@ static int prealloc_init(struct bpf_htab *htab)
|
||||
u32 size = round_up(htab->map.value_size, 8);
|
||||
void __percpu *pptr;
|
||||
|
||||
pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
|
||||
pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
|
||||
GFP_USER | __GFP_NOWARN);
|
||||
if (!pptr)
|
||||
goto free_elems;
|
||||
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
|
||||
@ -346,8 +347,8 @@ static int alloc_extra_elems(struct bpf_htab *htab)
|
||||
struct pcpu_freelist_node *l;
|
||||
int cpu;
|
||||
|
||||
pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
|
||||
GFP_USER | __GFP_NOWARN);
|
||||
pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
|
||||
GFP_USER | __GFP_NOWARN);
|
||||
if (!pptr)
|
||||
return -ENOMEM;
|
||||
|
||||
@ -442,9 +443,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
|
||||
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
|
||||
struct bpf_htab *htab;
|
||||
int err, i;
|
||||
u64 cost;
|
||||
|
||||
htab = kzalloc(sizeof(*htab), GFP_USER);
|
||||
htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
|
||||
if (!htab)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
@ -480,30 +480,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
|
||||
htab->n_buckets > U32_MAX / sizeof(struct bucket))
|
||||
goto free_htab;
|
||||
|
||||
cost = (u64) htab->n_buckets * sizeof(struct bucket) +
|
||||
(u64) htab->elem_size * htab->map.max_entries;
|
||||
|
||||
if (percpu)
|
||||
cost += (u64) round_up(htab->map.value_size, 8) *
|
||||
num_possible_cpus() * htab->map.max_entries;
|
||||
else
|
||||
cost += (u64) htab->elem_size * num_possible_cpus();
|
||||
|
||||
/* if map size is larger than memlock limit, reject it */
|
||||
err = bpf_map_charge_init(&htab->map.memory, cost);
|
||||
if (err)
|
||||
goto free_htab;
|
||||
|
||||
err = -ENOMEM;
|
||||
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
|
||||
sizeof(struct bucket),
|
||||
htab->map.numa_node);
|
||||
if (!htab->buckets)
|
||||
goto free_charge;
|
||||
goto free_htab;
|
||||
|
||||
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
|
||||
htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int),
|
||||
sizeof(int), GFP_USER);
|
||||
htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
|
||||
sizeof(int),
|
||||
sizeof(int),
|
||||
GFP_USER);
|
||||
if (!htab->map_locked[i])
|
||||
goto free_map_locked;
|
||||
}
|
||||
@ -538,8 +526,6 @@ free_map_locked:
|
||||
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
|
||||
free_percpu(htab->map_locked[i]);
|
||||
bpf_map_area_free(htab->buckets);
|
||||
free_charge:
|
||||
bpf_map_charge_finish(&htab->map.memory);
|
||||
free_htab:
|
||||
lockdep_unregister_key(&htab->lockdep_key);
|
||||
kfree(htab);
|
||||
@ -925,8 +911,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
|
||||
l_new = ERR_PTR(-E2BIG);
|
||||
goto dec_count;
|
||||
}
|
||||
l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
|
||||
htab->map.numa_node);
|
||||
l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
|
||||
GFP_ATOMIC | __GFP_NOWARN,
|
||||
htab->map.numa_node);
|
||||
if (!l_new) {
|
||||
l_new = ERR_PTR(-ENOMEM);
|
||||
goto dec_count;
|
||||
@ -942,8 +929,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
|
||||
pptr = htab_elem_get_ptr(l_new, key_size);
|
||||
} else {
|
||||
/* alloc_percpu zero-fills */
|
||||
pptr = __alloc_percpu_gfp(size, 8,
|
||||
GFP_ATOMIC | __GFP_NOWARN);
|
||||
pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
|
||||
GFP_ATOMIC | __GFP_NOWARN);
|
||||
if (!pptr) {
|
||||
kfree(l_new);
|
||||
l_new = ERR_PTR(-ENOMEM);
|
||||
|
@ -167,6 +167,17 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_ktime_get_coarse_ns)
{
	return ktime_get_coarse_ns();
}

const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
	.func		= bpf_ktime_get_coarse_ns,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

BPF_CALL_0(bpf_get_current_pid_tgid)
{
	struct task_struct *task = current;
@ -685,6 +696,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_ktime_get_boot_ns:
		return &bpf_ktime_get_boot_ns_proto;
	case BPF_FUNC_ktime_get_coarse_ns:
		return &bpf_ktime_get_coarse_ns_proto;
	case BPF_FUNC_ringbuf_output:
		return &bpf_ringbuf_output_proto;
	case BPF_FUNC_ringbuf_reserve:

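For programs that only need a cheap, tick-granularity timestamp, the coarse clock avoids a full clocksource read. A minimal sketch, assuming a bpf_helper_defs.h new enough to declare bpf_ktime_get_coarse_ns(); the kprobe attach point is just an example.

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

u64 last_seen_ns;	/* coarse timestamp of the last observed event */

SEC("kprobe/do_nanosleep")	/* example attach point only */
int BPF_KPROBE(note_sleep)
{
	/* Coarse clock: cheaper than bpf_ktime_get_ns(), tick-level accuracy. */
	last_seen_ns = bpf_ktime_get_coarse_ns();
	return 0;
}
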
@ -164,10 +164,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
|
||||
return 0;
|
||||
}
|
||||
|
||||
new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
|
||||
map->value_size,
|
||||
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
|
||||
map->numa_node);
|
||||
new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) +
|
||||
map->value_size,
|
||||
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
|
||||
map->numa_node);
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
@ -287,8 +287,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
|
||||
{
|
||||
int numa_node = bpf_map_attr_numa_node(attr);
|
||||
struct bpf_cgroup_storage_map *map;
|
||||
struct bpf_map_memory mem;
|
||||
int ret;
|
||||
|
||||
if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
|
||||
attr->key_size != sizeof(__u64))
|
||||
@ -308,18 +306,10 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
|
||||
/* max_entries is not used and enforced to be 0 */
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
|
||||
if (ret < 0)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
|
||||
__GFP_ZERO | GFP_USER, numa_node);
|
||||
if (!map) {
|
||||
bpf_map_charge_finish(&mem);
|
||||
__GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node);
|
||||
if (!map)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
bpf_map_charge_move(&map->map.memory, &mem);
|
||||
|
||||
/* copy mandatory map attributes */
|
||||
bpf_map_init_from_attr(&map->map, attr);
|
||||
@ -496,9 +486,9 @@ static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
|
||||
struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
|
||||
enum bpf_cgroup_storage_type stype)
|
||||
{
|
||||
const gfp_t gfp = __GFP_ZERO | GFP_USER;
|
||||
struct bpf_cgroup_storage *storage;
|
||||
struct bpf_map *map;
|
||||
gfp_t flags;
|
||||
size_t size;
|
||||
u32 pages;
|
||||
|
||||
@ -508,23 +498,19 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
|
||||
|
||||
size = bpf_cgroup_storage_calculate_size(map, &pages);
|
||||
|
||||
if (bpf_map_charge_memlock(map, pages))
|
||||
return ERR_PTR(-EPERM);
|
||||
|
||||
storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
|
||||
__GFP_ZERO | GFP_USER, map->numa_node);
|
||||
storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage),
|
||||
gfp, map->numa_node);
|
||||
if (!storage)
|
||||
goto enomem;
|
||||
|
||||
flags = __GFP_ZERO | GFP_USER;
|
||||
|
||||
if (stype == BPF_CGROUP_STORAGE_SHARED) {
|
||||
storage->buf = kmalloc_node(size, flags, map->numa_node);
|
||||
storage->buf = bpf_map_kmalloc_node(map, size, gfp,
|
||||
map->numa_node);
|
||||
if (!storage->buf)
|
||||
goto enomem;
|
||||
check_and_init_map_lock(map, storage->buf->data);
|
||||
} else {
|
||||
storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
|
||||
storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
|
||||
if (!storage->percpu_buf)
|
||||
goto enomem;
|
||||
}
|
||||
@ -534,7 +520,6 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
|
||||
return storage;
|
||||
|
||||
enomem:
|
||||
bpf_map_uncharge_memlock(map, pages);
|
||||
kfree(storage);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
@ -561,16 +546,11 @@ void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
|
||||
{
|
||||
enum bpf_cgroup_storage_type stype;
|
||||
struct bpf_map *map;
|
||||
u32 pages;
|
||||
|
||||
if (!storage)
|
||||
return;
|
||||
|
||||
map = &storage->map->map;
|
||||
|
||||
bpf_cgroup_storage_calculate_size(map, &pages);
|
||||
bpf_map_uncharge_memlock(map, pages);
|
||||
|
||||
stype = cgroup_storage_type(map);
|
||||
if (stype == BPF_CGROUP_STORAGE_SHARED)
|
||||
call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
|
||||
|
@ -282,8 +282,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
|
||||
if (value)
|
||||
size += trie->map.value_size;
|
||||
|
||||
node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
|
||||
trie->map.numa_node);
|
||||
node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN,
|
||||
trie->map.numa_node);
|
||||
if (!node)
|
||||
return NULL;
|
||||
|
||||
@ -540,8 +540,6 @@ out:
|
||||
static struct bpf_map *trie_alloc(union bpf_attr *attr)
|
||||
{
|
||||
struct lpm_trie *trie;
|
||||
u64 cost = sizeof(*trie), cost_per_node;
|
||||
int ret;
|
||||
|
||||
if (!bpf_capable())
|
||||
return ERR_PTR(-EPERM);
|
||||
@ -557,7 +555,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
|
||||
attr->value_size > LPM_VAL_SIZE_MAX)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
|
||||
trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
|
||||
if (!trie)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
@ -567,20 +565,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
|
||||
offsetof(struct bpf_lpm_trie_key, data);
|
||||
trie->max_prefixlen = trie->data_size * 8;
|
||||
|
||||
cost_per_node = sizeof(struct lpm_trie_node) +
|
||||
attr->value_size + trie->data_size;
|
||||
cost += (u64) attr->max_entries * cost_per_node;
|
||||
|
||||
ret = bpf_map_charge_init(&trie->map.memory, cost);
|
||||
if (ret)
|
||||
goto out_err;
|
||||
|
||||
spin_lock_init(&trie->lock);
|
||||
|
||||
return &trie->map;
|
||||
out_err:
|
||||
kfree(trie);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static void trie_free(struct bpf_map *map)
|
||||
|
@ -66,29 +66,21 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
|
||||
|
||||
static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
|
||||
{
|
||||
int ret, numa_node = bpf_map_attr_numa_node(attr);
|
||||
struct bpf_map_memory mem = {0};
|
||||
int numa_node = bpf_map_attr_numa_node(attr);
|
||||
struct bpf_queue_stack *qs;
|
||||
u64 size, queue_size, cost;
|
||||
u64 size, queue_size;
|
||||
|
||||
size = (u64) attr->max_entries + 1;
|
||||
cost = queue_size = sizeof(*qs) + size * attr->value_size;
|
||||
|
||||
ret = bpf_map_charge_init(&mem, cost);
|
||||
if (ret < 0)
|
||||
return ERR_PTR(ret);
|
||||
queue_size = sizeof(*qs) + size * attr->value_size;
|
||||
|
||||
qs = bpf_map_area_alloc(queue_size, numa_node);
|
||||
if (!qs) {
|
||||
bpf_map_charge_finish(&mem);
|
||||
if (!qs)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
memset(qs, 0, sizeof(*qs));
|
||||
|
||||
bpf_map_init_from_attr(&qs->map, attr);
|
||||
|
||||
bpf_map_charge_move(&qs->map.memory, &mem);
|
||||
qs->size = size;
|
||||
|
||||
raw_spin_lock_init(&qs->lock);
|
||||
|
@ -150,9 +150,8 @@ static void reuseport_array_free(struct bpf_map *map)
|
||||
|
||||
static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
|
||||
{
|
||||
int err, numa_node = bpf_map_attr_numa_node(attr);
|
||||
int numa_node = bpf_map_attr_numa_node(attr);
|
||||
struct reuseport_array *array;
|
||||
struct bpf_map_memory mem;
|
||||
u64 array_size;
|
||||
|
||||
if (!bpf_capable())
|
||||
@ -161,20 +160,13 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
|
||||
array_size = sizeof(*array);
|
||||
array_size += (u64)attr->max_entries * sizeof(struct sock *);
|
||||
|
||||
err = bpf_map_charge_init(&mem, array_size);
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
|
||||
/* allocate all map elements and zero-initialize them */
|
||||
array = bpf_map_area_alloc(array_size, numa_node);
|
||||
if (!array) {
|
||||
bpf_map_charge_finish(&mem);
|
||||
if (!array)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
/* copy mandatory map attributes */
|
||||
bpf_map_init_from_attr(&array->map, attr);
|
||||
bpf_map_charge_move(&array->map.memory, &mem);
|
||||
|
||||
return &array->map;
|
||||
}
|
||||
|
@ -48,7 +48,6 @@ struct bpf_ringbuf {
|
||||
|
||||
struct bpf_ringbuf_map {
|
||||
struct bpf_map map;
|
||||
struct bpf_map_memory memory;
|
||||
struct bpf_ringbuf *rb;
|
||||
};
|
||||
|
||||
@ -60,8 +59,8 @@ struct bpf_ringbuf_hdr {
|
||||
|
||||
static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
|
||||
{
|
||||
const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
|
||||
__GFP_ZERO;
|
||||
const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
|
||||
__GFP_NOWARN | __GFP_ZERO;
|
||||
int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
|
||||
int nr_data_pages = data_sz >> PAGE_SHIFT;
|
||||
int nr_pages = nr_meta_pages + nr_data_pages;
|
||||
@ -88,10 +87,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
|
||||
* user-space implementations significantly.
|
||||
*/
|
||||
array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
|
||||
if (array_size > PAGE_SIZE)
|
||||
pages = vmalloc_node(array_size, numa_node);
|
||||
else
|
||||
pages = kmalloc_node(array_size, flags, numa_node);
|
||||
pages = bpf_map_area_alloc(array_size, numa_node);
|
||||
if (!pages)
|
||||
return NULL;
|
||||
|
||||
@ -134,7 +130,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
|
||||
|
||||
rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
|
||||
if (!rb)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
return NULL;
|
||||
|
||||
spin_lock_init(&rb->spinlock);
|
||||
init_waitqueue_head(&rb->waitq);
|
||||
@ -150,8 +146,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
|
||||
static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
|
||||
{
|
||||
struct bpf_ringbuf_map *rb_map;
|
||||
u64 cost;
|
||||
int err;
|
||||
|
||||
if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
|
||||
return ERR_PTR(-EINVAL);
|
||||
@ -167,32 +161,19 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
|
||||
return ERR_PTR(-E2BIG);
|
||||
#endif
|
||||
|
||||
rb_map = kzalloc(sizeof(*rb_map), GFP_USER);
|
||||
rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
|
||||
if (!rb_map)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
bpf_map_init_from_attr(&rb_map->map, attr);
|
||||
|
||||
cost = sizeof(struct bpf_ringbuf_map) +
|
||||
sizeof(struct bpf_ringbuf) +
|
||||
attr->max_entries;
|
||||
err = bpf_map_charge_init(&rb_map->map.memory, cost);
|
||||
if (err)
|
||||
goto err_free_map;
|
||||
|
||||
rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
|
||||
if (IS_ERR(rb_map->rb)) {
|
||||
err = PTR_ERR(rb_map->rb);
|
||||
goto err_uncharge;
|
||||
if (!rb_map->rb) {
|
||||
kfree(rb_map);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
return &rb_map->map;
|
||||
|
||||
err_uncharge:
|
||||
bpf_map_charge_finish(&rb_map->map.memory);
|
||||
err_free_map:
|
||||
kfree(rb_map);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
|
||||
|
@ -90,7 +90,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
|
||||
{
|
||||
u32 value_size = attr->value_size;
|
||||
struct bpf_stack_map *smap;
|
||||
struct bpf_map_memory mem;
|
||||
u64 cost, n_buckets;
|
||||
int err;
|
||||
|
||||
@ -119,15 +118,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
|
||||
|
||||
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
|
||||
cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
|
||||
err = bpf_map_charge_init(&mem, cost);
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
|
||||
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
|
||||
if (!smap) {
|
||||
bpf_map_charge_finish(&mem);
|
||||
if (!smap)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
bpf_map_init_from_attr(&smap->map, attr);
|
||||
smap->map.value_size = value_size;
|
||||
@ -135,20 +128,17 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
|
||||
|
||||
err = get_callchain_buffers(sysctl_perf_event_max_stack);
|
||||
if (err)
|
||||
goto free_charge;
|
||||
goto free_smap;
|
||||
|
||||
err = prealloc_elems_and_freelist(smap);
|
||||
if (err)
|
||||
goto put_buffers;
|
||||
|
||||
bpf_map_charge_move(&smap->map.memory, &mem);
|
||||
|
||||
return &smap->map;
|
||||
|
||||
put_buffers:
|
||||
put_callchain_buffers();
|
||||
free_charge:
|
||||
bpf_map_charge_finish(&mem);
|
||||
free_smap:
|
||||
bpf_map_area_free(smap);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include <linux/poll.h>
|
||||
#include <linux/bpf-netns.h>
|
||||
#include <linux/rcupdate_trace.h>
|
||||
#include <linux/memcontrol.h>
|
||||
|
||||
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
|
||||
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
|
||||
@ -127,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
|
||||
return map;
|
||||
}
|
||||
|
||||
static u32 bpf_map_value_size(struct bpf_map *map)
|
||||
static u32 bpf_map_value_size(const struct bpf_map *map)
|
||||
{
|
||||
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
|
||||
@ -267,6 +268,10 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Please, do not use this function outside from the map creation path
|
||||
* (e.g. in map update path) without taking care of setting the active
|
||||
* memory cgroup (see at bpf_map_kmalloc_node() for example).
|
||||
*/
|
||||
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
|
||||
{
|
||||
/* We really just want to fail instead of triggering OOM killer
|
||||
@ -279,7 +284,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
|
||||
* __GFP_RETRY_MAYFAIL to avoid such situations.
|
||||
*/
|
||||
|
||||
const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
|
||||
const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
|
||||
unsigned int flags = 0;
|
||||
unsigned long align = 1;
|
||||
void *area;
|
||||
@ -341,77 +346,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
|
||||
map->numa_node = bpf_map_attr_numa_node(attr);
|
||||
}
|
||||
|
||||
static int bpf_charge_memlock(struct user_struct *user, u32 pages)
|
||||
{
|
||||
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
|
||||
|
||||
if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) {
|
||||
atomic_long_sub(pages, &user->locked_vm);
|
||||
return -EPERM;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
|
||||
{
|
||||
if (user)
|
||||
atomic_long_sub(pages, &user->locked_vm);
|
||||
}
|
||||
|
||||
int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size)
|
||||
{
|
||||
u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
|
||||
struct user_struct *user;
|
||||
int ret;
|
||||
|
||||
if (size >= U32_MAX - PAGE_SIZE)
|
||||
return -E2BIG;
|
||||
|
||||
user = get_current_user();
|
||||
ret = bpf_charge_memlock(user, pages);
|
||||
if (ret) {
|
||||
free_uid(user);
|
||||
return ret;
|
||||
}
|
||||
|
||||
mem->pages = pages;
|
||||
mem->user = user;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bpf_map_charge_finish(struct bpf_map_memory *mem)
|
||||
{
|
||||
bpf_uncharge_memlock(mem->user, mem->pages);
|
||||
free_uid(mem->user);
|
||||
}
|
||||
|
||||
void bpf_map_charge_move(struct bpf_map_memory *dst,
|
||||
struct bpf_map_memory *src)
|
||||
{
|
||||
*dst = *src;
|
||||
|
||||
/* Make sure src will not be used for the redundant uncharging. */
|
||||
memset(src, 0, sizeof(struct bpf_map_memory));
|
||||
}
|
||||
|
||||
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = bpf_charge_memlock(map->memory.user, pages);
|
||||
if (ret)
|
||||
return ret;
|
||||
map->memory.pages += pages;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
|
||||
{
|
||||
bpf_uncharge_memlock(map->memory.user, pages);
|
||||
map->memory.pages -= pages;
|
||||
}
|
||||
|
||||
static int bpf_map_alloc_id(struct bpf_map *map)
|
||||
{
|
||||
int id;
|
||||
@ -456,17 +390,74 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
|
||||
__release(&map_idr_lock);
|
||||
}

#ifdef CONFIG_MEMCG_KMEM
static void bpf_map_save_memcg(struct bpf_map *map)
{
	map->memcg = get_mem_cgroup_from_mm(current->mm);
}

static void bpf_map_release_memcg(struct bpf_map *map)
{
	mem_cgroup_put(map->memcg);
}

void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
			   int node)
{
	struct mem_cgroup *old_memcg;
	void *ptr;

	old_memcg = set_active_memcg(map->memcg);
	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
	set_active_memcg(old_memcg);

	return ptr;
}

void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
	struct mem_cgroup *old_memcg;
	void *ptr;

	old_memcg = set_active_memcg(map->memcg);
	ptr = kzalloc(size, flags | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);

	return ptr;
}

void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
				    size_t align, gfp_t flags)
{
	struct mem_cgroup *old_memcg;
	void __percpu *ptr;

	old_memcg = set_active_memcg(map->memcg);
	ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);

	return ptr;
}

#else
static void bpf_map_save_memcg(struct bpf_map *map)
{
}

static void bpf_map_release_memcg(struct bpf_map *map)
{
}
#endif

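These wrappers are what the per-map allocations throughout the series are converted to: every allocation gets charged to the memory cgroup that created the map rather than to whichever task happens to trigger it. A hypothetical map implementation would use them roughly as sketched below; my_elem and my_map_alloc_elem are made-up names for illustration, not part of this series.

struct my_elem {
	void __percpu *pcpu;
	char value[];
};

static struct my_elem *my_map_alloc_elem(struct bpf_map *map, u32 value_size)
{
	struct my_elem *e;

	/* Charged to map->memcg even when called from an unrelated task. */
	e = bpf_map_kzalloc(map, sizeof(*e) + value_size,
			    GFP_ATOMIC | __GFP_NOWARN);
	if (!e)
		return NULL;

	e->pcpu = bpf_map_alloc_percpu(map, value_size, 8,
				       GFP_ATOMIC | __GFP_NOWARN);
	if (!e->pcpu) {
		kfree(e);
		return NULL;
	}
	return e;
}
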
/* called from workqueue */
|
||||
static void bpf_map_free_deferred(struct work_struct *work)
|
||||
{
|
||||
struct bpf_map *map = container_of(work, struct bpf_map, work);
|
||||
struct bpf_map_memory mem;
|
||||
|
||||
bpf_map_charge_move(&mem, &map->memory);
|
||||
security_bpf_map_free(map);
|
||||
bpf_map_release_memcg(map);
|
||||
/* implementation dependent freeing */
|
||||
map->ops->map_free(map);
|
||||
bpf_map_charge_finish(&mem);
|
||||
}
|
||||
|
||||
static void bpf_map_put_uref(struct bpf_map *map)
|
||||
@ -527,6 +518,19 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
/* Provides an approximation of the map's memory footprint.
|
||||
* Used only to provide a backward compatibility and display
|
||||
* a reasonable "memlock" info.
|
||||
*/
|
||||
static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
|
||||
{
|
||||
unsigned long size;
|
||||
|
||||
size = round_up(map->key_size + bpf_map_value_size(map), 8);
|
||||
|
||||
return round_up(map->max_entries * size, PAGE_SIZE);
|
||||
}
|
||||
|
||||
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
|
||||
{
|
||||
const struct bpf_map *map = filp->private_data;
|
||||
@ -545,7 +549,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
|
||||
"value_size:\t%u\n"
|
||||
"max_entries:\t%u\n"
|
||||
"map_flags:\t%#x\n"
|
||||
"memlock:\t%llu\n"
|
||||
"memlock:\t%lu\n"
|
||||
"map_id:\t%u\n"
|
||||
"frozen:\t%u\n",
|
||||
map->map_type,
|
||||
@ -553,7 +557,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
|
||||
map->value_size,
|
||||
map->max_entries,
|
||||
map->map_flags,
|
||||
map->memory.pages * 1ULL << PAGE_SHIFT,
|
||||
bpf_map_memory_footprint(map),
|
||||
map->id,
|
||||
READ_ONCE(map->frozen));
|
||||
if (type) {
|
||||
@ -796,7 +800,6 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
|
||||
static int map_create(union bpf_attr *attr)
|
||||
{
|
||||
int numa_node = bpf_map_attr_numa_node(attr);
|
||||
struct bpf_map_memory mem;
|
||||
struct bpf_map *map;
|
||||
int f_flags;
|
||||
int err;
|
||||
@ -875,6 +878,8 @@ static int map_create(union bpf_attr *attr)
|
||||
if (err)
|
||||
goto free_map_sec;
|
||||
|
||||
bpf_map_save_memcg(map);
|
||||
|
||||
err = bpf_map_new_fd(map, f_flags);
|
||||
if (err < 0) {
|
||||
/* failed to allocate fd.
|
||||
@ -893,9 +898,7 @@ free_map_sec:
|
||||
security_bpf_map_free(map);
|
||||
free_map:
|
||||
btf_put(map->btf);
|
||||
bpf_map_charge_move(&mem, &map->memory);
|
||||
map->ops->map_free(map);
|
||||
bpf_map_charge_finish(&mem);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -1629,51 +1632,6 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
|
||||
audit_log_end(ab);
|
||||
}
|
||||
|
||||
int __bpf_prog_charge(struct user_struct *user, u32 pages)
|
||||
{
|
||||
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
|
||||
unsigned long user_bufs;
|
||||
|
||||
if (user) {
|
||||
user_bufs = atomic_long_add_return(pages, &user->locked_vm);
|
||||
if (user_bufs > memlock_limit) {
|
||||
atomic_long_sub(pages, &user->locked_vm);
|
||||
return -EPERM;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
|
||||
{
|
||||
if (user)
|
||||
atomic_long_sub(pages, &user->locked_vm);
|
||||
}
|
||||
|
||||
static int bpf_prog_charge_memlock(struct bpf_prog *prog)
|
||||
{
|
||||
struct user_struct *user = get_current_user();
|
||||
int ret;
|
||||
|
||||
ret = __bpf_prog_charge(user, prog->pages);
|
||||
if (ret) {
|
||||
free_uid(user);
|
||||
return ret;
|
||||
}
|
||||
|
||||
prog->aux->user = user;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
|
||||
{
|
||||
struct user_struct *user = prog->aux->user;
|
||||
|
||||
__bpf_prog_uncharge(user, prog->pages);
|
||||
free_uid(user);
|
||||
}
|
||||
|
||||
static int bpf_prog_alloc_id(struct bpf_prog *prog)
|
||||
{
|
||||
int id;
|
||||
@ -1723,7 +1681,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
|
||||
|
||||
kvfree(aux->func_info);
|
||||
kfree(aux->func_info_aux);
|
||||
bpf_prog_uncharge_memlock(aux->prog);
|
||||
free_uid(aux->user);
|
||||
security_bpf_prog_free(aux);
|
||||
bpf_prog_free(aux->prog);
|
||||
}
|
||||
@ -1733,6 +1691,8 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
|
||||
bpf_prog_kallsyms_del_all(prog);
|
||||
btf_put(prog->aux->btf);
|
||||
bpf_prog_free_linfo(prog);
|
||||
if (prog->aux->attach_btf)
|
||||
btf_put(prog->aux->attach_btf);
|
||||
|
||||
if (deferred) {
|
||||
if (prog->aux->sleepable)
|
||||
@ -1966,12 +1926,16 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
|
||||
static int
|
||||
bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
|
||||
enum bpf_attach_type expected_attach_type,
|
||||
u32 btf_id, u32 prog_fd)
|
||||
struct btf *attach_btf, u32 btf_id,
|
||||
struct bpf_prog *dst_prog)
|
||||
{
|
||||
if (btf_id) {
|
||||
if (btf_id > BTF_MAX_TYPE)
|
||||
return -EINVAL;
|
||||
|
||||
if (!attach_btf && !dst_prog)
|
||||
return -EINVAL;
|
||||
|
||||
switch (prog_type) {
|
||||
case BPF_PROG_TYPE_TRACING:
|
||||
case BPF_PROG_TYPE_LSM:
|
||||
@ -1983,7 +1947,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
|
||||
}
|
||||
}
|
||||
|
||||
if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
|
||||
if (attach_btf && (!btf_id || dst_prog))
|
||||
return -EINVAL;
|
||||
|
||||
if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
|
||||
prog_type != BPF_PROG_TYPE_EXT)
|
||||
return -EINVAL;
|
||||
|
||||
@ -2100,7 +2067,8 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
|
||||
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
|
||||
{
|
||||
enum bpf_prog_type type = attr->prog_type;
|
||||
struct bpf_prog *prog;
|
||||
struct bpf_prog *prog, *dst_prog = NULL;
|
||||
struct btf *attach_btf = NULL;
|
||||
int err;
|
||||
char license[128];
|
||||
bool is_gpl;
|
||||
@ -2142,47 +2110,70 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
|
||||
if (is_perfmon_prog_type(type) && !perfmon_capable())
|
||||
return -EPERM;
|
||||
|
||||
/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
|
||||
* or btf, we need to check which one it is
|
||||
*/
|
||||
if (attr->attach_prog_fd) {
|
||||
dst_prog = bpf_prog_get(attr->attach_prog_fd);
|
||||
if (IS_ERR(dst_prog)) {
|
||||
dst_prog = NULL;
|
||||
attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
|
||||
if (IS_ERR(attach_btf))
|
||||
return -EINVAL;
|
||||
if (!btf_is_kernel(attach_btf)) {
|
||||
btf_put(attach_btf);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
} else if (attr->attach_btf_id) {
|
||||
/* fall back to vmlinux BTF, if BTF type ID is specified */
|
||||
attach_btf = bpf_get_btf_vmlinux();
|
||||
if (IS_ERR(attach_btf))
|
||||
return PTR_ERR(attach_btf);
|
||||
if (!attach_btf)
|
||||
return -EINVAL;
|
||||
btf_get(attach_btf);
|
||||
}
|
||||
|
||||
bpf_prog_load_fixup_attach_type(attr);
|
||||
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
|
||||
attr->attach_btf_id,
|
||||
attr->attach_prog_fd))
|
||||
attach_btf, attr->attach_btf_id,
|
||||
dst_prog)) {
|
||||
if (dst_prog)
|
||||
bpf_prog_put(dst_prog);
|
||||
if (attach_btf)
|
||||
btf_put(attach_btf);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* plain bpf_prog allocation */
|
||||
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
|
||||
if (!prog)
|
||||
if (!prog) {
|
||||
if (dst_prog)
|
||||
bpf_prog_put(dst_prog);
|
||||
if (attach_btf)
|
||||
btf_put(attach_btf);
|
||||
return -ENOMEM;
|
||||
|
||||
prog->expected_attach_type = attr->expected_attach_type;
|
||||
prog->aux->attach_btf_id = attr->attach_btf_id;
|
||||
if (attr->attach_prog_fd) {
|
||||
struct bpf_prog *dst_prog;
|
||||
|
||||
dst_prog = bpf_prog_get(attr->attach_prog_fd);
|
||||
if (IS_ERR(dst_prog)) {
|
||||
err = PTR_ERR(dst_prog);
|
||||
goto free_prog_nouncharge;
|
||||
}
|
||||
prog->aux->dst_prog = dst_prog;
|
||||
}
|
||||
|
||||
prog->expected_attach_type = attr->expected_attach_type;
|
||||
prog->aux->attach_btf = attach_btf;
|
||||
prog->aux->attach_btf_id = attr->attach_btf_id;
|
||||
prog->aux->dst_prog = dst_prog;
|
||||
prog->aux->offload_requested = !!attr->prog_ifindex;
|
||||
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
|
||||
|
||||
err = security_bpf_prog_alloc(prog->aux);
|
||||
if (err)
|
||||
goto free_prog_nouncharge;
|
||||
|
||||
err = bpf_prog_charge_memlock(prog);
|
||||
if (err)
|
||||
goto free_prog_sec;
|
||||
goto free_prog;
|
||||
|
||||
prog->aux->user = get_current_user();
|
||||
prog->len = attr->insn_cnt;
|
||||
|
||||
err = -EFAULT;
|
||||
if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
|
||||
bpf_prog_insn_size(prog)) != 0)
|
||||
goto free_prog;
|
||||
goto free_prog_sec;
|
||||
|
||||
prog->orig_prog = NULL;
|
||||
prog->jited = 0;
|
||||
@ -2193,19 +2184,19 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
|
||||
if (bpf_prog_is_dev_bound(prog->aux)) {
|
||||
err = bpf_prog_offload_init(prog, attr);
|
||||
if (err)
|
||||
goto free_prog;
|
||||
goto free_prog_sec;
|
||||
}
|
||||
|
||||
/* find program type: socket_filter vs tracing_filter */
|
||||
err = find_prog_type(type, prog);
|
||||
if (err < 0)
|
||||
goto free_prog;
|
||||
goto free_prog_sec;
|
||||
|
||||
prog->aux->load_time = ktime_get_boottime_ns();
|
||||
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
|
||||
sizeof(attr->prog_name));
|
||||
if (err < 0)
|
||||
goto free_prog;
|
||||
goto free_prog_sec;
|
||||
|
||||
/* run eBPF verifier */
|
||||
err = bpf_check(&prog, attr, uattr);
|
||||
@ -2250,11 +2241,12 @@ free_used_maps:
|
||||
*/
|
||||
__bpf_prog_put_noref(prog, prog->aux->func_cnt);
|
||||
return err;
|
||||
free_prog:
|
||||
bpf_prog_uncharge_memlock(prog);
|
||||
free_prog_sec:
|
||||
free_uid(prog->aux->user);
|
||||
security_bpf_prog_free(prog->aux);
|
||||
free_prog_nouncharge:
|
||||
free_prog:
|
||||
if (prog->aux->attach_btf)
|
||||
btf_put(prog->aux->attach_btf);
|
||||
bpf_prog_free(prog);
|
||||
return err;
|
||||
}
|
||||
@ -2612,7 +2604,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
|
||||
goto out_put_prog;
|
||||
}
|
||||
|
||||
key = bpf_trampoline_compute_key(tgt_prog, btf_id);
|
||||
key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
|
||||
}
|
||||
|
||||
link = kzalloc(sizeof(*link), GFP_USER);
|
||||
@ -3589,7 +3581,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
|
||||
}
|
||||
|
||||
if (prog->aux->btf)
|
||||
info.btf_id = btf_id(prog->aux->btf);
|
||||
info.btf_id = btf_obj_id(prog->aux->btf);
|
||||
|
||||
ulen = info.nr_func_info;
|
||||
info.nr_func_info = prog->aux->func_info_cnt;
|
||||
@ -3692,7 +3684,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
|
||||
memcpy(info.name, map->name, sizeof(map->name));
|
||||
|
||||
if (map->btf) {
|
||||
info.btf_id = btf_id(map->btf);
|
||||
info.btf_id = btf_obj_id(map->btf);
|
||||
info.btf_key_type_id = map->btf_key_type_id;
|
||||
info.btf_value_type_id = map->btf_value_type_id;
|
||||
}
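
The syscall changes above let BPF_PROG_LOAD name either another BPF program (attach_prog_fd) or a kernel/module BTF object (attach_btf_obj_fd plus attach_btf_id) as the attach target. A minimal sketch of the BPF side of the module path; the traced function name is borrowed from the series' test module and the argument list is abbreviated, so treat it as illustrative only. libbpf is assumed to resolve the target against the module's BTF and fill in the new attributes:

/* fentry_testmod.bpf.c -- illustrative sketch, not part of this commit */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

/* bpf_testmod_test_read() lives in a kernel module, so its BTF comes from
 * /sys/kernel/btf/<module> rather than from vmlinux BTF.
 */
SEC("fentry/bpf_testmod_test_read")
int BPF_PROG(on_testmod_read, struct file *file, struct kobject *kobj)
{
        bpf_printk("bpf_testmod_test_read() called");
        return 0;
}

At load time the loader would pass the fd of that module BTF in attach_btf_obj_fd and the function's type id in attach_btf_id, which is exactly what the attach_btf/dst_prog plumbing above consumes.
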
|
||||
|
@ -136,8 +136,7 @@ struct bpf_iter_seq_task_file_info {
|
||||
};
|
||||
|
||||
static struct file *
|
||||
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
|
||||
struct task_struct **task, struct files_struct **fstruct)
|
||||
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
|
||||
{
|
||||
struct pid_namespace *ns = info->common.ns;
|
||||
u32 curr_tid = info->tid, max_fds;
|
||||
@ -150,14 +149,17 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
|
||||
* Otherwise, it does not hold any reference.
|
||||
*/
|
||||
again:
|
||||
if (*task) {
|
||||
curr_task = *task;
|
||||
curr_files = *fstruct;
|
||||
if (info->task) {
|
||||
curr_task = info->task;
|
||||
curr_files = info->files;
|
||||
curr_fd = info->fd;
|
||||
} else {
|
||||
curr_task = task_seq_get_next(ns, &curr_tid, true);
|
||||
if (!curr_task)
|
||||
if (!curr_task) {
|
||||
info->task = NULL;
|
||||
info->files = NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
curr_files = get_files_struct(curr_task);
|
||||
if (!curr_files) {
|
||||
@ -167,9 +169,8 @@ again:
|
||||
goto again;
|
||||
}
|
||||
|
||||
/* set *fstruct, *task and info->tid */
|
||||
*fstruct = curr_files;
|
||||
*task = curr_task;
|
||||
info->files = curr_files;
|
||||
info->task = curr_task;
|
||||
if (curr_tid == info->tid) {
|
||||
curr_fd = info->fd;
|
||||
} else {
|
||||
@ -199,8 +200,8 @@ again:
|
||||
rcu_read_unlock();
|
||||
put_files_struct(curr_files);
|
||||
put_task_struct(curr_task);
|
||||
*task = NULL;
|
||||
*fstruct = NULL;
|
||||
info->task = NULL;
|
||||
info->files = NULL;
|
||||
info->fd = 0;
|
||||
curr_tid = ++(info->tid);
|
||||
goto again;
|
||||
@ -209,21 +210,13 @@ again:
|
||||
static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
{
|
||||
struct bpf_iter_seq_task_file_info *info = seq->private;
|
||||
struct files_struct *files = NULL;
|
||||
struct task_struct *task = NULL;
|
||||
struct file *file;
|
||||
|
||||
file = task_file_seq_get_next(info, &task, &files);
|
||||
if (!file) {
|
||||
info->files = NULL;
|
||||
info->task = NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (*pos == 0)
|
||||
info->task = NULL;
|
||||
info->files = NULL;
|
||||
file = task_file_seq_get_next(info);
|
||||
if (file && *pos == 0)
|
||||
++*pos;
|
||||
info->task = task;
|
||||
info->files = files;
|
||||
|
||||
return file;
|
||||
}
|
||||
@ -231,24 +224,11 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
{
|
||||
struct bpf_iter_seq_task_file_info *info = seq->private;
|
||||
struct files_struct *files = info->files;
|
||||
struct task_struct *task = info->task;
|
||||
struct file *file;
|
||||
|
||||
++*pos;
|
||||
++info->fd;
|
||||
fput((struct file *)v);
|
||||
file = task_file_seq_get_next(info, &task, &files);
|
||||
if (!file) {
|
||||
info->files = NULL;
|
||||
info->task = NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
info->task = task;
|
||||
info->files = files;
|
||||
|
||||
return file;
|
||||
return task_file_seq_get_next(info);
|
||||
}
|
||||
|
||||
struct bpf_iter__task_file {
|
||||
|
@ -238,7 +238,9 @@ struct bpf_call_arg_meta {
|
||||
u64 msize_max_value;
|
||||
int ref_obj_id;
|
||||
int func_id;
|
||||
struct btf *btf;
|
||||
u32 btf_id;
|
||||
struct btf *ret_btf;
|
||||
u32 ret_btf_id;
|
||||
};
|
||||
|
||||
@ -556,10 +558,9 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env,
|
||||
return cur->frame[reg->frameno];
|
||||
}
|
||||
|
||||
const char *kernel_type_name(u32 id)
|
||||
static const char *kernel_type_name(const struct btf* btf, u32 id)
|
||||
{
|
||||
return btf_name_by_offset(btf_vmlinux,
|
||||
btf_type_by_id(btf_vmlinux, id)->name_off);
|
||||
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
|
||||
}
|
||||
|
||||
static void print_verifier_state(struct bpf_verifier_env *env,
|
||||
@ -589,7 +590,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
|
||||
if (t == PTR_TO_BTF_ID ||
|
||||
t == PTR_TO_BTF_ID_OR_NULL ||
|
||||
t == PTR_TO_PERCPU_BTF_ID)
|
||||
verbose(env, "%s", kernel_type_name(reg->btf_id));
|
||||
verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
|
||||
verbose(env, "(id=%d", reg->id);
|
||||
if (reg_type_may_be_refcounted_or_null(t))
|
||||
verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
|
||||
@ -1383,7 +1384,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
|
||||
|
||||
static void mark_btf_ld_reg(struct bpf_verifier_env *env,
|
||||
struct bpf_reg_state *regs, u32 regno,
|
||||
enum bpf_reg_type reg_type, u32 btf_id)
|
||||
enum bpf_reg_type reg_type,
|
||||
struct btf *btf, u32 btf_id)
|
||||
{
|
||||
if (reg_type == SCALAR_VALUE) {
|
||||
mark_reg_unknown(env, regs, regno);
|
||||
@ -1391,6 +1393,7 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env,
|
||||
}
|
||||
mark_reg_known_zero(env, regs, regno);
|
||||
regs[regno].type = PTR_TO_BTF_ID;
|
||||
regs[regno].btf = btf;
|
||||
regs[regno].btf_id = btf_id;
|
||||
}
|
||||
|
||||
@ -2764,7 +2767,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
|
||||
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
|
||||
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
|
||||
enum bpf_access_type t, enum bpf_reg_type *reg_type,
|
||||
u32 *btf_id)
|
||||
struct btf **btf, u32 *btf_id)
|
||||
{
|
||||
struct bpf_insn_access_aux info = {
|
||||
.reg_type = *reg_type,
|
||||
@ -2782,10 +2785,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
|
||||
*/
|
||||
*reg_type = info.reg_type;
|
||||
|
||||
if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
|
||||
if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
|
||||
*btf = info.btf;
|
||||
*btf_id = info.btf_id;
|
||||
else
|
||||
} else {
|
||||
env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
|
||||
}
|
||||
/* remember the offset of last byte accessed in ctx */
|
||||
if (env->prog->aux->max_ctx_offset < off + size)
|
||||
env->prog->aux->max_ctx_offset = off + size;
|
||||
@ -3297,8 +3302,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
|
||||
int value_regno)
|
||||
{
|
||||
struct bpf_reg_state *reg = regs + regno;
|
||||
const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id);
|
||||
const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off);
|
||||
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
|
||||
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
|
||||
u32 btf_id;
|
||||
int ret;
|
||||
|
||||
@ -3319,23 +3324,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
|
||||
}
|
||||
|
||||
if (env->ops->btf_struct_access) {
|
||||
ret = env->ops->btf_struct_access(&env->log, t, off, size,
|
||||
atype, &btf_id);
|
||||
ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
|
||||
off, size, atype, &btf_id);
|
||||
} else {
|
||||
if (atype != BPF_READ) {
|
||||
verbose(env, "only read is supported\n");
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
ret = btf_struct_access(&env->log, t, off, size, atype,
|
||||
&btf_id);
|
||||
ret = btf_struct_access(&env->log, reg->btf, t, off, size,
|
||||
atype, &btf_id);
|
||||
}
|
||||
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (atype == BPF_READ && value_regno >= 0)
|
||||
mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
|
||||
mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -3385,12 +3390,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
|
||||
ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (value_regno >= 0)
|
||||
mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
|
||||
mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -3466,6 +3471,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
|
||||
mark_reg_unknown(env, regs, value_regno);
|
||||
} else if (reg->type == PTR_TO_CTX) {
|
||||
enum bpf_reg_type reg_type = SCALAR_VALUE;
|
||||
struct btf *btf = NULL;
|
||||
u32 btf_id = 0;
|
||||
|
||||
if (t == BPF_WRITE && value_regno >= 0 &&
|
||||
@ -3478,7 +3484,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf_id);
|
||||
err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, &btf_id);
|
||||
if (err)
|
||||
verbose_linfo(env, insn_idx, "; ");
|
||||
if (!err && t == BPF_READ && value_regno >= 0) {
|
||||
@ -3500,8 +3506,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
|
||||
*/
|
||||
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
|
||||
if (reg_type == PTR_TO_BTF_ID ||
|
||||
reg_type == PTR_TO_BTF_ID_OR_NULL)
|
||||
reg_type == PTR_TO_BTF_ID_OR_NULL) {
|
||||
regs[value_regno].btf = btf;
|
||||
regs[value_regno].btf_id = btf_id;
|
||||
}
|
||||
}
|
||||
regs[value_regno].type = reg_type;
|
||||
}
|
||||
@ -4118,11 +4126,11 @@ found:
|
||||
arg_btf_id = compatible->btf_id;
|
||||
}
|
||||
|
||||
if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id,
|
||||
*arg_btf_id)) {
|
||||
if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
|
||||
btf_vmlinux, *arg_btf_id)) {
|
||||
verbose(env, "R%d is of type %s but %s is expected\n",
|
||||
regno, kernel_type_name(reg->btf_id),
|
||||
kernel_type_name(*arg_btf_id));
|
||||
regno, kernel_type_name(reg->btf, reg->btf_id),
|
||||
kernel_type_name(btf_vmlinux, *arg_btf_id));
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
@ -4244,6 +4252,7 @@ skip_type_check:
|
||||
verbose(env, "Helper has invalid btf_id in R%d\n", regno);
|
||||
return -EACCES;
|
||||
}
|
||||
meta->ret_btf = reg->btf;
|
||||
meta->ret_btf_id = reg->btf_id;
|
||||
} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
|
||||
if (meta->func_id == BPF_FUNC_spin_lock) {
|
||||
@ -5190,16 +5199,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
|
||||
const struct btf_type *t;
|
||||
|
||||
mark_reg_known_zero(env, regs, BPF_REG_0);
|
||||
t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL);
|
||||
t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL);
|
||||
if (!btf_type_is_struct(t)) {
|
||||
u32 tsize;
|
||||
const struct btf_type *ret;
|
||||
const char *tname;
|
||||
|
||||
/* resolve the type size of ksym. */
|
||||
ret = btf_resolve_size(btf_vmlinux, t, &tsize);
|
||||
ret = btf_resolve_size(meta.ret_btf, t, &tsize);
|
||||
if (IS_ERR(ret)) {
|
||||
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
|
||||
tname = btf_name_by_offset(meta.ret_btf, t->name_off);
|
||||
verbose(env, "unable to resolve the size of type '%s': %ld\n",
|
||||
tname, PTR_ERR(ret));
|
||||
return -EINVAL;
|
||||
@ -5212,6 +5221,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
|
||||
regs[BPF_REG_0].type =
|
||||
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
|
||||
PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
|
||||
regs[BPF_REG_0].btf = meta.ret_btf;
|
||||
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
|
||||
}
|
||||
} else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
|
||||
@ -5228,6 +5238,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
|
||||
fn->ret_type, func_id_name(func_id), func_id);
|
||||
return -EINVAL;
|
||||
}
|
||||
/* current BPF helper definitions are only coming from
|
||||
* built-in code with type IDs from vmlinux BTF
|
||||
*/
|
||||
regs[BPF_REG_0].btf = btf_vmlinux;
|
||||
regs[BPF_REG_0].btf_id = ret_btf_id;
|
||||
} else {
|
||||
verbose(env, "unknown return type %d of func %s#%d\n",
|
||||
@ -5627,7 +5641,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
|
||||
if (reg_is_pkt_pointer(ptr_reg)) {
|
||||
dst_reg->id = ++env->id_gen;
|
||||
/* something was added to pkt_ptr, set range to zero */
|
||||
dst_reg->raw = 0;
|
||||
memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
|
||||
}
|
||||
break;
|
||||
case BPF_SUB:
|
||||
@ -5692,7 +5706,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
|
||||
dst_reg->id = ++env->id_gen;
|
||||
/* something was added to pkt_ptr, set range to zero */
|
||||
if (smin_val < 0)
|
||||
dst_reg->raw = 0;
|
||||
memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
|
||||
}
|
||||
break;
|
||||
case BPF_AND:
|
||||
@ -7744,6 +7758,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
|
||||
break;
|
||||
case PTR_TO_BTF_ID:
|
||||
case PTR_TO_PERCPU_BTF_ID:
|
||||
dst_reg->btf = aux->btf_var.btf;
|
||||
dst_reg->btf_id = aux->btf_var.btf_id;
|
||||
break;
|
||||
default:
|
||||
@ -8058,6 +8073,11 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx)
|
||||
env->insn_aux_data[idx].prune_point = true;
|
||||
}
|
||||
|
||||
enum {
|
||||
DONE_EXPLORING = 0,
|
||||
KEEP_EXPLORING = 1,
|
||||
};
|
||||
|
||||
/* t, w, e - match pseudo-code above:
|
||||
* t - index of current instruction
|
||||
* w - next instruction
|
||||
@ -8070,10 +8090,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
|
||||
int *insn_state = env->cfg.insn_state;
|
||||
|
||||
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
|
||||
return 0;
|
||||
return DONE_EXPLORING;
|
||||
|
||||
if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
|
||||
return 0;
|
||||
return DONE_EXPLORING;
|
||||
|
||||
if (w < 0 || w >= env->prog->len) {
|
||||
verbose_linfo(env, t, "%d: ", t);
|
||||
@ -8092,10 +8112,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
|
||||
if (env->cfg.cur_stack >= env->prog->len)
|
||||
return -E2BIG;
|
||||
insn_stack[env->cfg.cur_stack++] = w;
|
||||
return 1;
|
||||
return KEEP_EXPLORING;
|
||||
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
|
||||
if (loop_ok && env->bpf_capable)
|
||||
return 0;
|
||||
return DONE_EXPLORING;
|
||||
verbose_linfo(env, t, "%d: ", t);
|
||||
verbose_linfo(env, w, "%d: ", w);
|
||||
verbose(env, "back-edge from insn %d to %d\n", t, w);
|
||||
@ -8107,7 +8127,74 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
|
||||
verbose(env, "insn state internal bug\n");
|
||||
return -EFAULT;
|
||||
}
|
||||
return 0;
|
||||
return DONE_EXPLORING;
|
||||
}
|
||||
|
||||
/* Visits the instruction at index t and returns one of the following:
|
||||
* < 0 - an error occurred
|
||||
* DONE_EXPLORING - the instruction was fully explored
|
||||
* KEEP_EXPLORING - there is still work to be done before it is fully explored
|
||||
*/
|
||||
static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
|
||||
{
|
||||
struct bpf_insn *insns = env->prog->insnsi;
|
||||
int ret;
|
||||
|
||||
/* All non-branch instructions have a single fall-through edge. */
|
||||
if (BPF_CLASS(insns[t].code) != BPF_JMP &&
|
||||
BPF_CLASS(insns[t].code) != BPF_JMP32)
|
||||
return push_insn(t, t + 1, FALLTHROUGH, env, false);
|
||||
|
||||
switch (BPF_OP(insns[t].code)) {
|
||||
case BPF_EXIT:
|
||||
return DONE_EXPLORING;
|
||||
|
||||
case BPF_CALL:
|
||||
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (t + 1 < insn_cnt)
|
||||
init_explored_state(env, t + 1);
|
||||
if (insns[t].src_reg == BPF_PSEUDO_CALL) {
|
||||
init_explored_state(env, t);
|
||||
ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
|
||||
env, false);
|
||||
}
|
||||
return ret;
|
||||
|
||||
case BPF_JA:
|
||||
if (BPF_SRC(insns[t].code) != BPF_K)
|
||||
return -EINVAL;
|
||||
|
||||
/* unconditional jump with single edge */
|
||||
ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env,
|
||||
true);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* unconditional jmp is not a good pruning point,
|
||||
* but it's marked, since backtracking needs
|
||||
* to record jmp history in is_state_visited().
|
||||
*/
|
||||
init_explored_state(env, t + insns[t].off + 1);
|
||||
/* tell verifier to check for equivalent states
|
||||
* after every call and jump
|
||||
*/
|
||||
if (t + 1 < insn_cnt)
|
||||
init_explored_state(env, t + 1);
|
||||
|
||||
return ret;
|
||||
|
||||
default:
|
||||
/* conditional jump with two edges */
|
||||
init_explored_state(env, t);
|
||||
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
|
||||
}
|
||||
}
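
The DONE_EXPLORING/KEEP_EXPLORING split lets check_cfg() drive the depth-first search from one flat loop instead of the old goto-based state machine. A stripped-down, stand-alone sketch of the same pattern (plain C for illustration, not verifier code; the toy graph is made up):

#include <stdio.h>

enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1 };

#define N 5
static const int edge[N] = { 1, 2, 3, 4, -1 };  /* toy successor list */
static int state[N];                            /* 0 new, 1 discovered, 2 explored */
static int stack[N], top;

/* Visit node t; mirrors visit_insn()'s return convention. */
static int visit(int t)
{
        int w = edge[t];

        if (w >= 0 && state[w] == 0) {
                state[w] = 1;
                stack[top++] = w;
                return KEEP_EXPLORING;
        }
        return DONE_EXPLORING;
}

int main(void)
{
        int i;

        state[0] = 1;
        stack[top++] = 0;

        while (top > 0) {
                int t = stack[top - 1];

                switch (visit(t)) {
                case DONE_EXPLORING:
                        state[t] = 2;
                        top--;
                        break;
                case KEEP_EXPLORING:
                        break;
                }
        }

        for (i = 0; i < N; i++)
                printf("node %d: %s\n", i, state[i] == 2 ? "explored" : "unreachable");
        return 0;
}
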
|
||||
|
||||
/* non-recursive depth-first-search to detect loops in BPF program
|
||||
@ -8115,11 +8202,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
|
||||
*/
|
||||
static int check_cfg(struct bpf_verifier_env *env)
|
||||
{
|
||||
struct bpf_insn *insns = env->prog->insnsi;
|
||||
int insn_cnt = env->prog->len;
|
||||
int *insn_stack, *insn_state;
|
||||
int ret = 0;
|
||||
int i, t;
|
||||
int i;
|
||||
|
||||
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
|
||||
if (!insn_state)
|
||||
@ -8135,92 +8221,32 @@ static int check_cfg(struct bpf_verifier_env *env)
|
||||
insn_stack[0] = 0; /* 0 is the first instruction */
|
||||
env->cfg.cur_stack = 1;
|
||||
|
||||
peek_stack:
|
||||
if (env->cfg.cur_stack == 0)
|
||||
goto check_state;
|
||||
t = insn_stack[env->cfg.cur_stack - 1];
|
||||
while (env->cfg.cur_stack > 0) {
|
||||
int t = insn_stack[env->cfg.cur_stack - 1];
|
||||
|
||||
if (BPF_CLASS(insns[t].code) == BPF_JMP ||
|
||||
BPF_CLASS(insns[t].code) == BPF_JMP32) {
|
||||
u8 opcode = BPF_OP(insns[t].code);
|
||||
|
||||
if (opcode == BPF_EXIT) {
|
||||
goto mark_explored;
|
||||
} else if (opcode == BPF_CALL) {
|
||||
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
|
||||
if (ret == 1)
|
||||
goto peek_stack;
|
||||
else if (ret < 0)
|
||||
goto err_free;
|
||||
if (t + 1 < insn_cnt)
|
||||
init_explored_state(env, t + 1);
|
||||
if (insns[t].src_reg == BPF_PSEUDO_CALL) {
|
||||
init_explored_state(env, t);
|
||||
ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
|
||||
env, false);
|
||||
if (ret == 1)
|
||||
goto peek_stack;
|
||||
else if (ret < 0)
|
||||
goto err_free;
|
||||
ret = visit_insn(t, insn_cnt, env);
|
||||
switch (ret) {
|
||||
case DONE_EXPLORING:
|
||||
insn_state[t] = EXPLORED;
|
||||
env->cfg.cur_stack--;
|
||||
break;
|
||||
case KEEP_EXPLORING:
|
||||
break;
|
||||
default:
|
||||
if (ret > 0) {
|
||||
verbose(env, "visit_insn internal bug\n");
|
||||
ret = -EFAULT;
|
||||
}
|
||||
} else if (opcode == BPF_JA) {
|
||||
if (BPF_SRC(insns[t].code) != BPF_K) {
|
||||
ret = -EINVAL;
|
||||
goto err_free;
|
||||
}
|
||||
/* unconditional jump with single edge */
|
||||
ret = push_insn(t, t + insns[t].off + 1,
|
||||
FALLTHROUGH, env, true);
|
||||
if (ret == 1)
|
||||
goto peek_stack;
|
||||
else if (ret < 0)
|
||||
goto err_free;
|
||||
/* unconditional jmp is not a good pruning point,
|
||||
* but it's marked, since backtracking needs
|
||||
* to record jmp history in is_state_visited().
|
||||
*/
|
||||
init_explored_state(env, t + insns[t].off + 1);
|
||||
/* tell verifier to check for equivalent states
|
||||
* after every call and jump
|
||||
*/
|
||||
if (t + 1 < insn_cnt)
|
||||
init_explored_state(env, t + 1);
|
||||
} else {
|
||||
/* conditional jump with two edges */
|
||||
init_explored_state(env, t);
|
||||
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
|
||||
if (ret == 1)
|
||||
goto peek_stack;
|
||||
else if (ret < 0)
|
||||
goto err_free;
|
||||
|
||||
ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
|
||||
if (ret == 1)
|
||||
goto peek_stack;
|
||||
else if (ret < 0)
|
||||
goto err_free;
|
||||
}
|
||||
} else {
|
||||
/* all other non-branch instructions with single
|
||||
* fall-through edge
|
||||
*/
|
||||
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
|
||||
if (ret == 1)
|
||||
goto peek_stack;
|
||||
else if (ret < 0)
|
||||
goto err_free;
|
||||
}
|
||||
}
|
||||
|
||||
mark_explored:
|
||||
insn_state[t] = EXPLORED;
|
||||
if (env->cfg.cur_stack-- <= 0) {
|
||||
if (env->cfg.cur_stack < 0) {
|
||||
verbose(env, "pop stack internal bug\n");
|
||||
ret = -EFAULT;
|
||||
goto err_free;
|
||||
}
|
||||
goto peek_stack;
|
||||
|
||||
check_state:
|
||||
for (i = 0; i < insn_cnt; i++) {
|
||||
if (insn_state[i] != EXPLORED) {
|
||||
verbose(env, "unreachable insn %d\n", i);
|
||||
@ -9740,6 +9766,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
|
||||
t = btf_type_skip_modifiers(btf_vmlinux, type, NULL);
|
||||
if (percpu) {
|
||||
aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
|
||||
aux->btf_var.btf = btf_vmlinux;
|
||||
aux->btf_var.btf_id = type;
|
||||
} else if (!btf_type_is_struct(t)) {
|
||||
const struct btf_type *ret;
|
||||
@ -9758,6 +9785,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
|
||||
aux->btf_var.mem_size = tsize;
|
||||
} else {
|
||||
aux->btf_var.reg_type = PTR_TO_BTF_ID;
|
||||
aux->btf_var.btf = btf_vmlinux;
|
||||
aux->btf_var.btf_id = type;
|
||||
}
|
||||
return 0;
|
||||
@ -11610,7 +11638,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
|
||||
bpf_log(log, "Tracing programs must provide btf_id\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux;
|
||||
btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
|
||||
if (!btf) {
|
||||
bpf_log(log,
|
||||
"FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
|
||||
@ -11886,7 +11914,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
|
||||
return ret;
|
||||
}
|
||||
|
||||
key = bpf_trampoline_compute_key(tgt_prog, btf_id);
|
||||
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
|
||||
tr = bpf_trampoline_get(key, &tgt_info);
|
||||
if (!tr)
|
||||
return -ENOMEM;
|
||||
|
@ -404,9 +404,10 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
|
||||
|
||||
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
|
||||
/*
|
||||
* If memcg_kmem_charge_page() fails, page->mem_cgroup
|
||||
* pointer is NULL, and memcg_kmem_uncharge_page() in
|
||||
* free_thread_stack() will ignore this page.
|
||||
* If memcg_kmem_charge_page() fails, page's
|
||||
* memory cgroup pointer is NULL, and
|
||||
* memcg_kmem_uncharge_page() in free_thread_stack()
|
||||
* will ignore this page.
|
||||
*/
|
||||
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
|
||||
0);
|
||||
|
@ -3709,6 +3709,10 @@ static noinline int do_init_module(struct module *mod)
|
||||
mod->init_layout.ro_size = 0;
|
||||
mod->init_layout.ro_after_init_size = 0;
|
||||
mod->init_layout.text_size = 0;
|
||||
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
|
||||
/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
|
||||
mod->btf_data = NULL;
|
||||
#endif
|
||||
/*
|
||||
* We want to free module_init, but be aware that kallsyms may be
|
||||
* walking this with preempt disabled. In all the failure paths, we
|
||||
|
@ -1290,6 +1290,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
return &bpf_ktime_get_ns_proto;
|
||||
case BPF_FUNC_ktime_get_boot_ns:
|
||||
return &bpf_ktime_get_boot_ns_proto;
|
||||
case BPF_FUNC_ktime_get_coarse_ns:
|
||||
return &bpf_ktime_get_coarse_ns_proto;
|
||||
case BPF_FUNC_tail_call:
|
||||
return &bpf_tail_call_proto;
|
||||
case BPF_FUNC_get_current_pid_tgid:
|
||||
@@ -2068,10 +2070,12 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)

void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
        struct module *mod = __module_address((unsigned long)btp);
        struct module *mod;

        if (mod)
                module_put(mod);
        preempt_disable();
        mod = __module_address((unsigned long)btp);
        module_put(mod);
        preempt_enable();
}

static __always_inline
|
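
Besides the __module_address() fix, the bpf_trace.c hunk above exposes the new bpf_ktime_get_coarse_ns() helper to tracing programs. A minimal sketch of a consumer; the tracepoint and the use of bpf_printk() are illustrative, and a libbpf new enough to declare the helper is assumed:

/* coarse_ts.bpf.c -- illustrative sketch */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("tracepoint/syscalls/sys_enter_nanosleep")
int log_coarse_time(void *ctx)
{
        /* Coarse clock: cheaper than bpf_ktime_get_ns(), tick granularity. */
        __u64 now = bpf_ktime_get_coarse_ns();

        bpf_printk("nanosleep entered at ~%llu ns", now);
        return 0;
}
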
@ -182,8 +182,8 @@ hex_only:
|
||||
pr_warn("page dumped because: %s\n", reason);
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
if (!page_poisoned && page->mem_cgroup)
|
||||
pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
|
||||
if (!page_poisoned && page->memcg_data)
|
||||
pr_warn("pages's memcg:%lx\n", page->memcg_data);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -470,7 +470,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
|
||||
#ifdef CONFIG_MEMCG
|
||||
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
|
||||
{
|
||||
struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
|
||||
struct mem_cgroup *memcg = page_memcg(compound_head(page));
|
||||
struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
|
||||
|
||||
if (memcg)
|
||||
@ -2764,7 +2764,7 @@ void deferred_split_huge_page(struct page *page)
|
||||
{
|
||||
struct deferred_split *ds_queue = get_deferred_split_queue(page);
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
|
||||
struct mem_cgroup *memcg = page_memcg(compound_head(page));
|
||||
#endif
|
||||
unsigned long flags;
|
||||
|
||||
|
mm/memcontrol.c
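
The memcontrol.c hunks below replace direct page->mem_cgroup loads with the page_memcg()/page_memcg_check() accessors, which decode the unified page->memcg_data word. The helpers themselves live in memcontrol.h and are not part of this excerpt; roughly, and with the flag names assumed from the rest of the series, they behave like the following kernel-context sketch:

/* Simplified sketch of the accessor idea, not the exact kernel definitions. */
#define MEMCG_DATA_OBJCGS       (1UL << 0)      /* low bit: obj_cgroup vector, not a memcg */
#define MEMCG_DATA_KMEM         (1UL << 1)      /* page charged as kernel memory */
#define MEMCG_DATA_FLAGS_MASK   (MEMCG_DATA_OBJCGS | MEMCG_DATA_KMEM)

static inline struct mem_cgroup *page_memcg(struct page *page)
{
        /* Caller knows this is not a slab page. */
        return (struct mem_cgroup *)(page->memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}

static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
        unsigned long memcg_data = READ_ONCE(page->memcg_data);

        /* A set OBJCGS bit means the word points at an obj_cgroup vector. */
        if (memcg_data & MEMCG_DATA_OBJCGS)
                return NULL;
        return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
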
@ -533,7 +533,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
memcg = page->mem_cgroup;
|
||||
memcg = page_memcg(page);
|
||||
|
||||
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
||||
memcg = root_mem_cgroup;
|
||||
@ -560,16 +560,7 @@ ino_t page_cgroup_ino(struct page *page)
|
||||
unsigned long ino = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
memcg = page->mem_cgroup;
|
||||
|
||||
/*
|
||||
* The lowest bit set means that memcg isn't a valid
|
||||
* memcg pointer, but a obj_cgroups pointer.
|
||||
* In this case the page is shared and doesn't belong
|
||||
* to any specific memory cgroup.
|
||||
*/
|
||||
if ((unsigned long) memcg & 0x1UL)
|
||||
memcg = NULL;
|
||||
memcg = page_memcg_check(page);
|
||||
|
||||
while (memcg && !(memcg->css.flags & CSS_ONLINE))
|
||||
memcg = parent_mem_cgroup(memcg);
|
||||
@ -1055,7 +1046,7 @@ EXPORT_SYMBOL(get_mem_cgroup_from_mm);
|
||||
*/
|
||||
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
|
||||
{
|
||||
struct mem_cgroup *memcg = page->mem_cgroup;
|
||||
struct mem_cgroup *memcg = page_memcg(page);
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return NULL;
|
||||
@ -1354,7 +1345,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
|
||||
goto out;
|
||||
}
|
||||
|
||||
memcg = page->mem_cgroup;
|
||||
memcg = page_memcg(page);
|
||||
/*
|
||||
* Swapcache readahead pages are added to the LRU - and
|
||||
* possibly migrated - before they are charged.
|
||||
@ -2114,7 +2105,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
|
||||
}
|
||||
|
||||
/**
|
||||
* lock_page_memcg - lock a page->mem_cgroup binding
|
||||
* lock_page_memcg - lock a page and memcg binding
|
||||
* @page: the page
|
||||
*
|
||||
* This function protects unlocked LRU pages from being moved to
|
||||
@ -2146,7 +2137,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
|
||||
if (mem_cgroup_disabled())
|
||||
return NULL;
|
||||
again:
|
||||
memcg = head->mem_cgroup;
|
||||
memcg = page_memcg(head);
|
||||
if (unlikely(!memcg))
|
||||
return NULL;
|
||||
|
||||
@ -2154,7 +2145,7 @@ again:
|
||||
return memcg;
|
||||
|
||||
spin_lock_irqsave(&memcg->move_lock, flags);
|
||||
if (memcg != head->mem_cgroup) {
|
||||
if (memcg != page_memcg(head)) {
|
||||
spin_unlock_irqrestore(&memcg->move_lock, flags);
|
||||
goto again;
|
||||
}
|
||||
@ -2192,14 +2183,14 @@ void __unlock_page_memcg(struct mem_cgroup *memcg)
|
||||
}
|
||||
|
||||
/**
|
||||
* unlock_page_memcg - unlock a page->mem_cgroup binding
|
||||
* unlock_page_memcg - unlock a page and memcg binding
|
||||
* @page: the page
|
||||
*/
|
||||
void unlock_page_memcg(struct page *page)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
|
||||
__unlock_page_memcg(head->mem_cgroup);
|
||||
__unlock_page_memcg(page_memcg(head));
|
||||
}
|
||||
EXPORT_SYMBOL(unlock_page_memcg);
|
||||
|
||||
@ -2889,7 +2880,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
|
||||
|
||||
static void commit_charge(struct page *page, struct mem_cgroup *memcg)
|
||||
{
|
||||
VM_BUG_ON_PAGE(page->mem_cgroup, page);
|
||||
VM_BUG_ON_PAGE(page_memcg(page), page);
|
||||
/*
|
||||
* Any of the following ensures page->mem_cgroup stability:
|
||||
*
|
||||
@ -2898,7 +2889,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
|
||||
* - lock_page_memcg()
|
||||
* - exclusive reference
|
||||
*/
|
||||
page->mem_cgroup = memcg;
|
||||
page->memcg_data = (unsigned long)memcg;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
@ -2913,8 +2904,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
|
||||
if (!vec)
|
||||
return -ENOMEM;
|
||||
|
||||
if (cmpxchg(&page->obj_cgroups, NULL,
|
||||
(struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
|
||||
if (!set_page_objcgs(page, vec))
|
||||
kfree(vec);
|
||||
else
|
||||
kmemleak_not_leak(vec);
|
||||
@ -2925,6 +2915,12 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
|
||||
/*
|
||||
* Returns a pointer to the memory cgroup to which the kernel object is charged.
|
||||
*
|
||||
* A passed kernel object can be a slab object or a generic kernel page, so
|
||||
* different mechanisms for getting the memory cgroup pointer should be used.
|
||||
* In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
|
||||
* can not know for sure how the kernel object is implemented.
|
||||
* mem_cgroup_from_obj() can be safely used in such cases.
|
||||
*
|
||||
* The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
|
||||
* cgroup_mutex, etc.
|
||||
*/
|
||||
@ -2937,36 +2933,31 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
|
||||
|
||||
page = virt_to_head_page(p);
|
||||
|
||||
/*
|
||||
* If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
|
||||
* or a pointer to obj_cgroup vector. In the latter case the lowest
|
||||
* bit of the pointer is set.
|
||||
* The page->mem_cgroup pointer can be asynchronously changed
|
||||
* from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
|
||||
* from a valid memcg pointer to objcg vector or back.
|
||||
*/
|
||||
if (!page->mem_cgroup)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Slab objects are accounted individually, not per-page.
|
||||
* Memcg membership data for each individual object is saved in
|
||||
* the page->obj_cgroups.
|
||||
*/
|
||||
if (page_has_obj_cgroups(page)) {
|
||||
if (page_objcgs_check(page)) {
|
||||
struct obj_cgroup *objcg;
|
||||
unsigned int off;
|
||||
|
||||
off = obj_to_index(page->slab_cache, page, p);
|
||||
objcg = page_obj_cgroups(page)[off];
|
||||
objcg = page_objcgs(page)[off];
|
||||
if (objcg)
|
||||
return obj_cgroup_memcg(objcg);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* All other pages use page->mem_cgroup */
|
||||
return page->mem_cgroup;
|
||||
/*
|
||||
* page_memcg_check() is used here, because page_has_obj_cgroups()
|
||||
* check above could fail because the object cgroups vector wasn't set
|
||||
* at that moment, but it can be set concurrently.
|
||||
* page_memcg_check(page) will guarantee that a proper memory
|
||||
* cgroup pointer or NULL will be returned.
|
||||
*/
|
||||
return page_memcg_check(page);
|
||||
}
|
||||
|
||||
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
|
||||
@ -3104,8 +3095,8 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
|
||||
if (memcg && !mem_cgroup_is_root(memcg)) {
|
||||
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
|
||||
if (!ret) {
|
||||
page->mem_cgroup = memcg;
|
||||
__SetPageKmemcg(page);
|
||||
page->memcg_data = (unsigned long)memcg |
|
||||
MEMCG_DATA_KMEM;
|
||||
return 0;
|
||||
}
|
||||
css_put(&memcg->css);
|
||||
@ -3120,7 +3111,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
|
||||
*/
|
||||
void __memcg_kmem_uncharge_page(struct page *page, int order)
|
||||
{
|
||||
struct mem_cgroup *memcg = page->mem_cgroup;
|
||||
struct mem_cgroup *memcg = page_memcg(page);
|
||||
unsigned int nr_pages = 1 << order;
|
||||
|
||||
if (!memcg)
|
||||
@ -3128,12 +3119,8 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
|
||||
|
||||
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
|
||||
__memcg_kmem_uncharge(memcg, nr_pages);
|
||||
page->mem_cgroup = NULL;
|
||||
page->memcg_data = 0;
|
||||
css_put(&memcg->css);
|
||||
|
||||
/* slab pages do not have PageKmemcg flag set */
|
||||
if (PageKmemcg(page))
|
||||
__ClearPageKmemcg(page);
|
||||
}
|
||||
|
||||
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
|
||||
@ -3279,7 +3266,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
|
||||
*/
|
||||
void mem_cgroup_split_huge_fixup(struct page *head)
|
||||
{
|
||||
struct mem_cgroup *memcg = head->mem_cgroup;
|
||||
struct mem_cgroup *memcg = page_memcg(head);
|
||||
int i;
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
@ -3287,7 +3274,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
|
||||
|
||||
for (i = 1; i < HPAGE_PMD_NR; i++) {
|
||||
css_get(&memcg->css);
|
||||
head[i].mem_cgroup = memcg;
|
||||
head[i].memcg_data = (unsigned long)memcg;
|
||||
}
|
||||
}
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
@ -4669,7 +4656,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
|
||||
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
|
||||
struct bdi_writeback *wb)
|
||||
{
|
||||
struct mem_cgroup *memcg = page->mem_cgroup;
|
||||
struct mem_cgroup *memcg = page_memcg(page);
|
||||
struct memcg_cgwb_frn *frn;
|
||||
u64 now = get_jiffies_64();
|
||||
u64 oldest_at = now;
|
||||
@ -5646,14 +5633,14 @@ static int mem_cgroup_move_account(struct page *page,
|
||||
|
||||
/*
|
||||
* Prevent mem_cgroup_migrate() from looking at
|
||||
* page->mem_cgroup of its source page while we change it.
|
||||
* page's memory cgroup of its source page while we change it.
|
||||
*/
|
||||
ret = -EBUSY;
|
||||
if (!trylock_page(page))
|
||||
goto out;
|
||||
|
||||
ret = -EINVAL;
|
||||
if (page->mem_cgroup != from)
|
||||
if (page_memcg(page) != from)
|
||||
goto out_unlock;
|
||||
|
||||
pgdat = page_pgdat(page);
|
||||
@ -5708,13 +5695,13 @@ static int mem_cgroup_move_account(struct page *page,
|
||||
/*
|
||||
* All state has been migrated, let's switch to the new memcg.
|
||||
*
|
||||
* It is safe to change page->mem_cgroup here because the page
|
||||
* It is safe to change page's memcg here because the page
|
||||
* is referenced, charged, isolated, and locked: we can't race
|
||||
* with (un)charging, migration, LRU putback, or anything else
|
||||
* that would rely on a stable page->mem_cgroup.
|
||||
* that would rely on a stable page's memory cgroup.
|
||||
*
|
||||
* Note that lock_page_memcg is a memcg lock, not a page lock,
|
||||
* to save space. As soon as we switch page->mem_cgroup to a
|
||||
* to save space. As soon as we switch page's memory cgroup to a
|
||||
* new memcg that isn't locked, the above state can change
|
||||
* concurrently again. Make sure we're truly done with it.
|
||||
*/
|
||||
@ -5723,7 +5710,7 @@ static int mem_cgroup_move_account(struct page *page,
|
||||
css_get(&to->css);
|
||||
css_put(&from->css);
|
||||
|
||||
page->mem_cgroup = to;
|
||||
page->memcg_data = (unsigned long)to;
|
||||
|
||||
__unlock_page_memcg(from);
|
||||
|
||||
@ -5789,7 +5776,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
|
||||
* mem_cgroup_move_account() checks the page is valid or
|
||||
* not under LRU exclusion.
|
||||
*/
|
||||
if (page->mem_cgroup == mc.from) {
|
||||
if (page_memcg(page) == mc.from) {
|
||||
ret = MC_TARGET_PAGE;
|
||||
if (is_device_private_page(page))
|
||||
ret = MC_TARGET_DEVICE;
|
||||
@ -5833,7 +5820,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
|
||||
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
|
||||
if (!(mc.flags & MOVE_ANON))
|
||||
return ret;
|
||||
if (page->mem_cgroup == mc.from) {
|
||||
if (page_memcg(page) == mc.from) {
|
||||
ret = MC_TARGET_PAGE;
|
||||
if (target) {
|
||||
get_page(page);
|
||||
@ -6779,12 +6766,12 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
|
||||
/*
|
||||
* Every swap fault against a single page tries to charge the
|
||||
* page, bail as early as possible. shmem_unuse() encounters
|
||||
* already charged pages, too. page->mem_cgroup is protected
|
||||
* by the page lock, which serializes swap cache removal, which
|
||||
* in turn serializes uncharging.
|
||||
* already charged pages, too. page and memcg binding is
|
||||
* protected by the page lock, which serializes swap cache
|
||||
* removal, which in turn serializes uncharging.
|
||||
*/
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
if (compound_head(page)->mem_cgroup)
|
||||
if (page_memcg(compound_head(page)))
|
||||
goto out;
|
||||
|
||||
id = lookup_swap_cgroup_id(ent);
|
||||
@ -6868,21 +6855,21 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
|
||||
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
|
||||
if (!page->mem_cgroup)
|
||||
if (!page_memcg(page))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Nobody should be changing or seriously looking at
|
||||
* page->mem_cgroup at this point, we have fully
|
||||
* page_memcg(page) at this point, we have fully
|
||||
* exclusive access to the page.
|
||||
*/
|
||||
|
||||
if (ug->memcg != page->mem_cgroup) {
|
||||
if (ug->memcg != page_memcg(page)) {
|
||||
if (ug->memcg) {
|
||||
uncharge_batch(ug);
|
||||
uncharge_gather_clear(ug);
|
||||
}
|
||||
ug->memcg = page->mem_cgroup;
|
||||
ug->memcg = page_memcg(page);
|
||||
|
||||
/* pairs with css_put in uncharge_batch */
|
||||
css_get(&ug->memcg->css);
|
||||
@ -6891,15 +6878,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
|
||||
nr_pages = compound_nr(page);
|
||||
ug->nr_pages += nr_pages;
|
||||
|
||||
if (!PageKmemcg(page)) {
|
||||
ug->pgpgout++;
|
||||
} else {
|
||||
if (PageMemcgKmem(page))
|
||||
ug->nr_kmem += nr_pages;
|
||||
__ClearPageKmemcg(page);
|
||||
}
|
||||
else
|
||||
ug->pgpgout++;
|
||||
|
||||
ug->dummy_page = page;
|
||||
page->mem_cgroup = NULL;
|
||||
page->memcg_data = 0;
|
||||
css_put(&ug->memcg->css);
|
||||
}
|
||||
|
||||
@ -6942,7 +6927,7 @@ void mem_cgroup_uncharge(struct page *page)
|
||||
return;
|
||||
|
||||
/* Don't touch page->lru of any random page, pre-check: */
|
||||
if (!page->mem_cgroup)
|
||||
if (!page_memcg(page))
|
||||
return;
|
||||
|
||||
uncharge_gather_clear(&ug);
|
||||
@ -6992,11 +6977,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
|
||||
return;
|
||||
|
||||
/* Page cache replacement: new page already charged? */
|
||||
if (newpage->mem_cgroup)
|
||||
if (page_memcg(newpage))
|
||||
return;
|
||||
|
||||
/* Swapcache readahead pages can get replaced before being charged */
|
||||
memcg = oldpage->mem_cgroup;
|
||||
memcg = page_memcg(oldpage);
|
||||
if (!memcg)
|
||||
return;
|
||||
|
||||
@ -7191,7 +7176,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
|
||||
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
||||
return;
|
||||
|
||||
memcg = page->mem_cgroup;
|
||||
memcg = page_memcg(page);
|
||||
|
||||
/* Readahead page, never charged */
|
||||
if (!memcg)
|
||||
@ -7212,7 +7197,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
|
||||
VM_BUG_ON_PAGE(oldid, page);
|
||||
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
|
||||
|
||||
page->mem_cgroup = NULL;
|
||||
page->memcg_data = 0;
|
||||
|
||||
if (!mem_cgroup_is_root(memcg))
|
||||
page_counter_uncharge(&memcg->memory, nr_entries);
|
||||
@ -7255,7 +7240,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
|
||||
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
||||
return 0;
|
||||
|
||||
memcg = page->mem_cgroup;
|
||||
memcg = page_memcg(page);
|
||||
|
||||
/* Readahead page, never charged */
|
||||
if (!memcg)
|
||||
@ -7336,7 +7321,7 @@ bool mem_cgroup_swap_full(struct page *page)
|
||||
if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
||||
return false;
|
||||
|
||||
memcg = page->mem_cgroup;
|
||||
memcg = page_memcg(page);
|
||||
if (!memcg)
|
||||
return false;
|
||||
|
||||
|
@ -1092,7 +1092,7 @@ static inline bool page_expected_state(struct page *page,
|
||||
if (unlikely((unsigned long)page->mapping |
|
||||
page_ref_count(page) |
|
||||
#ifdef CONFIG_MEMCG
|
||||
(unsigned long)page->mem_cgroup |
|
||||
(unsigned long)page_memcg(page) |
|
||||
#endif
|
||||
(page->flags & check_flags)))
|
||||
return false;
|
||||
@ -1117,7 +1117,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
|
||||
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
|
||||
}
|
||||
#ifdef CONFIG_MEMCG
|
||||
if (unlikely(page->mem_cgroup))
|
||||
if (unlikely(page_memcg(page)))
|
||||
bad_reason = "page still charged to cgroup";
|
||||
#endif
|
||||
return bad_reason;
|
||||
@ -1214,7 +1214,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
* Do not let hwpoison pages hit pcplists/buddy
|
||||
* Untie memcg state and reset page's owner
|
||||
*/
|
||||
if (memcg_kmem_enabled() && PageKmemcg(page))
|
||||
if (memcg_kmem_enabled() && PageMemcgKmem(page))
|
||||
__memcg_kmem_uncharge_page(page, order);
|
||||
reset_page_owner(page, order);
|
||||
return false;
|
||||
@ -1244,7 +1244,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
}
|
||||
if (PageMappingFlags(page))
|
||||
page->mapping = NULL;
|
||||
if (memcg_kmem_enabled() && PageKmemcg(page))
|
||||
if (memcg_kmem_enabled() && PageMemcgKmem(page))
|
||||
__memcg_kmem_uncharge_page(page, order);
|
||||
if (check_free)
|
||||
bad += check_free_page(page);
|
||||
|
@ -291,12 +291,14 @@ static inline void count_swpout_vm_event(struct page *page)
|
||||
static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
if (!page->mem_cgroup)
|
||||
memcg = page_memcg(page);
|
||||
if (!memcg)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
|
||||
css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
|
||||
bio_associate_blkg_from_css(bio, css);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
mm/slab.h
@ -239,30 +239,13 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
|
||||
{
|
||||
/*
|
||||
* page->mem_cgroup and page->obj_cgroups are sharing the same
|
||||
* space. To distinguish between them in case we don't know for sure
|
||||
* that the page is a slab page (e.g. page_cgroup_ino()), let's
|
||||
* always set the lowest bit of obj_cgroups.
|
||||
*/
|
||||
return (struct obj_cgroup **)
|
||||
((unsigned long)page->obj_cgroups & ~0x1UL);
|
||||
}
|
||||
|
||||
static inline bool page_has_obj_cgroups(struct page *page)
|
||||
{
|
||||
return ((unsigned long)page->obj_cgroups & 0x1UL);
|
||||
}
|
||||
|
||||
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
|
||||
gfp_t gfp);
|
||||
|
||||
static inline void memcg_free_page_obj_cgroups(struct page *page)
|
||||
{
|
||||
kfree(page_obj_cgroups(page));
|
||||
page->obj_cgroups = NULL;
|
||||
kfree(page_objcgs(page));
|
||||
page->memcg_data = 0;
|
||||
}
|
||||
|
||||
static inline size_t obj_full_size(struct kmem_cache *s)
|
||||
@ -323,7 +306,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
|
||||
if (likely(p[i])) {
|
||||
page = virt_to_head_page(p[i]);
|
||||
|
||||
if (!page_has_obj_cgroups(page) &&
|
||||
if (!page_objcgs(page) &&
|
||||
memcg_alloc_page_obj_cgroups(page, s, flags)) {
|
||||
obj_cgroup_uncharge(objcg, obj_full_size(s));
|
||||
continue;
|
||||
@ -331,7 +314,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
|
||||
|
||||
off = obj_to_index(s, page, p[i]);
|
||||
obj_cgroup_get(objcg);
|
||||
page_obj_cgroups(page)[off] = objcg;
|
||||
page_objcgs(page)[off] = objcg;
|
||||
mod_objcg_state(objcg, page_pgdat(page),
|
||||
cache_vmstat_idx(s), obj_full_size(s));
|
||||
} else {
|
||||
@ -345,6 +328,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
|
||||
void **p, int objects)
|
||||
{
|
||||
struct kmem_cache *s;
|
||||
struct obj_cgroup **objcgs;
|
||||
struct obj_cgroup *objcg;
|
||||
struct page *page;
|
||||
unsigned int off;
|
||||
@ -358,7 +342,8 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
|
||||
continue;
|
||||
|
||||
page = virt_to_head_page(p[i]);
|
||||
if (!page_has_obj_cgroups(page))
|
||||
objcgs = page_objcgs(page);
|
||||
if (!objcgs)
|
||||
continue;
|
||||
|
||||
if (!s_orig)
|
||||
@ -367,11 +352,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
|
||||
s = s_orig;
|
||||
|
||||
off = obj_to_index(s, page, p[i]);
|
||||
objcg = page_obj_cgroups(page)[off];
|
||||
objcg = objcgs[off];
|
||||
if (!objcg)
|
||||
continue;
|
||||
|
||||
page_obj_cgroups(page)[off] = NULL;
|
||||
objcgs[off] = NULL;
|
||||
obj_cgroup_uncharge(objcg, obj_full_size(s));
|
||||
mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
|
||||
-obj_full_size(s));
|
||||
@ -380,11 +365,6 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
|
||||
}
|
||||
|
||||
#else /* CONFIG_MEMCG_KMEM */
|
||||
static inline bool page_has_obj_cgroups(struct page *page)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
|
||||
{
|
||||
return NULL;
|
||||
|
@@ -257,7 +257,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
        struct lruvec *lruvec;
        int memcgid;

        /* Page is fully exclusive and pins page->mem_cgroup */
        /* Page is fully exclusive and pins page's memory cgroup pointer */
        VM_BUG_ON_PAGE(PageLRU(page), page);
        VM_BUG_ON_PAGE(page_count(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
|
@@ -415,7 +415,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
           void *, value, u64, flags)
{
        if (!in_serving_softirq() && !in_task())
        if (in_irq() || in_nmi())
                return (unsigned long)NULL;

        return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
@@ -424,7 +424,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
           struct sock *, sk)
{
        if (!in_serving_softirq() && !in_task())
        if (in_irq() || in_nmi())
                return -EPERM;

        return ____bpf_sk_storage_delete(map, sk);
|
@ -6448,7 +6448,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
|
||||
|
||||
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
|
||||
|
||||
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
|
||||
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
|
||||
NAPIF_STATE_PREFER_BUSY_POLL);
|
||||
|
||||
/* If STATE_MISSED was set, leave STATE_SCHED set,
|
||||
* because we will call napi->poll() one more time.
|
||||
@ -6485,10 +6486,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
|
||||
|
||||
#if defined(CONFIG_NET_RX_BUSY_POLL)
|
||||
|
||||
#define BUSY_POLL_BUDGET 8
|
||||
|
||||
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
|
||||
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
|
||||
{
|
||||
if (!skip_schedule) {
|
||||
gro_normal_list(napi);
|
||||
__napi_schedule(napi);
|
||||
return;
|
||||
}
|
||||
|
||||
if (napi->gro_bitmask) {
|
||||
/* flush too old packets
|
||||
* If HZ < 1000, flush all packets.
|
||||
*/
|
||||
napi_gro_flush(napi, HZ >= 1000);
|
||||
}
|
||||
|
||||
gro_normal_list(napi);
|
||||
clear_bit(NAPI_STATE_SCHED, &napi->state);
|
||||
}
|
||||
|
||||
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
|
||||
u16 budget)
|
||||
{
|
||||
bool skip_schedule = false;
|
||||
unsigned long timeout;
|
||||
int rc;
|
||||
|
||||
/* Busy polling means there is a high chance device driver hard irq
|
||||
@ -6505,29 +6526,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
|
||||
|
||||
local_bh_disable();
|
||||
|
||||
if (prefer_busy_poll) {
|
||||
napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
|
||||
timeout = READ_ONCE(napi->dev->gro_flush_timeout);
|
||||
if (napi->defer_hard_irqs_count && timeout) {
|
||||
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
|
||||
skip_schedule = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* All we really want here is to re-enable device interrupts.
|
||||
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
|
||||
*/
|
||||
rc = napi->poll(napi, BUSY_POLL_BUDGET);
|
||||
rc = napi->poll(napi, budget);
|
||||
/* We can't gro_normal_list() here, because napi->poll() might have
|
||||
* rearmed the napi (napi_complete_done()) in which case it could
|
||||
* already be running on another CPU.
|
||||
*/
|
||||
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
|
||||
trace_napi_poll(napi, rc, budget);
|
||||
netpoll_poll_unlock(have_poll_lock);
|
||||
if (rc == BUSY_POLL_BUDGET) {
|
||||
/* As the whole budget was spent, we still own the napi so can
|
||||
* safely handle the rx_list.
|
||||
*/
|
||||
gro_normal_list(napi);
|
||||
		__napi_schedule(napi);
	}

	if (rc == budget)
		__busy_poll_stop(napi, skip_schedule);
	local_bh_enable();
}

void napi_busy_loop(unsigned int napi_id,
		    bool (*loop_end)(void *, unsigned long),
		    void *loop_end_arg)
		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
	int (*napi_poll)(struct napi_struct *napi, int budget);
@@ -6555,17 +6580,23 @@ restart:
		 * we avoid dirtying napi->state as much as we can.
		 */
		if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
			   NAPIF_STATE_IN_BUSY_POLL))
			   NAPIF_STATE_IN_BUSY_POLL)) {
			if (prefer_busy_poll)
				set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
			goto count;
		}
		if (cmpxchg(&napi->state, val,
			    val | NAPIF_STATE_IN_BUSY_POLL |
				  NAPIF_STATE_SCHED) != val)
				  NAPIF_STATE_SCHED) != val) {
			if (prefer_busy_poll)
				set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
			goto count;
		}
		have_poll_lock = netpoll_poll_lock(napi);
		napi_poll = napi->poll;
	}
	work = napi_poll(napi, BUSY_POLL_BUDGET);
	trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
	work = napi_poll(napi, budget);
	trace_napi_poll(napi, work, budget);
	gro_normal_list(napi);
count:
	if (work > 0)
@@ -6578,7 +6609,7 @@ count:

	if (unlikely(need_resched())) {
		if (napi_poll)
			busy_poll_stop(napi, have_poll_lock);
			busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
		preempt_enable();
		rcu_read_unlock();
		cond_resched();
@@ -6589,7 +6620,7 @@ count:
		cpu_relax();
	}
	if (napi_poll)
		busy_poll_stop(napi, have_poll_lock);
		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
	preempt_enable();
out:
	rcu_read_unlock();
@@ -6640,8 +6671,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
	 */
	if (!napi_disable_pending(napi) &&
	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
		__napi_schedule_irqoff(napi);
	}

	return HRTIMER_NORESTART;
}
@@ -6699,6 +6732,7 @@ void napi_disable(struct napi_struct *n)

	hrtimer_cancel(&n->timer);

	clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
	clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6771,6 +6805,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
		goto out_unlock;
	}

	/* The NAPI context has more processing work, but busy-polling
	 * is preferred. Exit early.
	 */
	if (napi_prefer_busy_poll(n)) {
		if (napi_complete_done(n, work)) {
			/* If timeout is not set, we need to make sure
			 * that the NAPI is re-scheduled.
			 */
			napi_schedule(n);
		}
		goto out_unlock;
	}

	if (n->gro_bitmask) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
@@ -9753,7 +9800,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
		rx[i].dev = dev;

		/* XDP RX-queue setup */
		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
		if (err < 0)
			goto err_rxq_info;
	}
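Editor's note: the watchdog and busy_poll_stop() changes above rely on the existing per-netdev interrupt-deferral knobs (gro_flush_timeout and napi_defer_hard_irqs) to keep hard interrupts off while an application busy-polls. A minimal userspace sketch of setting those knobs; the interface name "eth0" and the values are placeholders, not taken from the patch:

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	/* defer interrupts and flush GRO by timer instead (values illustrative) */
	write_knob("/sys/class/net/eth0/gro_flush_timeout", "200000");	/* nanoseconds */
	write_knob("/sys/class/net/eth0/napi_defer_hard_irqs", "2");
	return 0;
}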
net/core/filter.c
@@ -4910,6 +4910,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
			tp->notsent_lowat = val;
			sk->sk_write_space(sk);
			break;
		case TCP_WINDOW_CLAMP:
			ret = tcp_set_window_clamp(sk, val);
			break;
		default:
			ret = -EINVAL;
		}
@@ -6995,6 +6998,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_sk_storage_delete_proto;
	case BPF_FUNC_setsockopt:
		switch (prog->expected_attach_type) {
		case BPF_CGROUP_INET4_BIND:
		case BPF_CGROUP_INET6_BIND:
		case BPF_CGROUP_INET4_CONNECT:
		case BPF_CGROUP_INET6_CONNECT:
			return &bpf_sock_addr_setsockopt_proto;
@@ -7003,6 +7008,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		}
	case BPF_FUNC_getsockopt:
		switch (prog->expected_attach_type) {
		case BPF_CGROUP_INET4_BIND:
		case BPF_CGROUP_INET6_BIND:
		case BPF_CGROUP_INET4_CONNECT:
		case BPF_CGROUP_INET6_CONNECT:
			return &bpf_sock_addr_getsockopt_proto;
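Editor's note: together with the *_BIND_LOCK changes further down, the hunks above make bpf_setsockopt()/bpf_getsockopt() callable from the cgroup bind{4,6} hooks. An illustrative (not from the series' selftests) cgroup/bind4 program; the option constants are defined locally in case the toolchain headers are old, and the values are the standard uapi ones:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_SOCKET
#define SOL_SOCKET	1
#endif
#ifndef SO_KEEPALIVE
#define SO_KEEPALIVE	9
#endif

SEC("cgroup/bind4")
int set_keepalive_on_bind(struct bpf_sock_addr *ctx)
{
	int one = 1;

	/* The bind hooks now run with the socket lock held, so this is safe. */
	bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
	return 1;	/* let the bind() proceed */
}

char _license[] SEC("license") = "GPL";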
net/core/sock.c
@@ -1159,6 +1159,22 @@ set_sndbuf:
			sk->sk_ll_usec = val;
		}
		break;
	case SO_PREFER_BUSY_POLL:
		if (valbool && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
		break;
	case SO_BUSY_POLL_BUDGET:
		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else {
			if (val < 0 || val > U16_MAX)
				ret = -EINVAL;
			else
				WRITE_ONCE(sk->sk_busy_poll_budget, val);
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
@@ -1523,6 +1539,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
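Editor's note: a minimal userspace sketch of opting a socket into preferred busy polling with the options added above. The fallback numeric values are the asm-generic ones (some architectures use different numbers), and the timeout/budget values are illustrative:

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_BUSY_POLL
#define SO_BUSY_POLL		46
#endif
#ifndef SO_PREFER_BUSY_POLL
#define SO_PREFER_BUSY_POLL	69
#endif
#ifndef SO_BUSY_POLL_BUDGET
#define SO_BUSY_POLL_BUDGET	70
#endif

static int enable_preferred_busy_poll(int fd)
{
	int prefer = 1;		/* enabling requires CAP_NET_ADMIN, per the hunk above */
	int usecs = 200;	/* busy-poll timeout in microseconds */
	int budget = 64;	/* raising it above the current value requires CAP_NET_ADMIN */

	if (setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefer, sizeof(prefer)) ||
	    setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs)) ||
	    setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &budget, sizeof(budget))) {
		perror("setsockopt");
		return -1;
	}
	return 0;
}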
net/core/sock_map.c
@@ -27,8 +27,6 @@ struct bpf_stab {
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
	struct bpf_stab *stab;
	u64 cost;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);
@@ -39,29 +37,22 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	stab = kzalloc(sizeof(*stab), GFP_USER);
	stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&stab->map, attr);
	raw_spin_lock_init(&stab->lock);

	/* Make sure page count doesn't overflow. */
	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
	err = bpf_map_charge_init(&stab->map.memory, cost);
	if (err)
		goto free_stab;

	stab->sks = bpf_map_area_alloc(stab->map.max_entries *
				       sizeof(struct sock *),
				       stab->map.numa_node);
	if (stab->sks)
		return &stab->map;
	err = -ENOMEM;
	bpf_map_charge_finish(&stab->map.memory);
free_stab:
	kfree(stab);
	return ERR_PTR(err);
	if (!stab->sks) {
		kfree(stab);
		return ERR_PTR(-ENOMEM);
	}

	return &stab->map;
}

int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -975,8 +966,9 @@ static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab,
		}
	}

	new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
			   htab->map.numa_node);
	new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
				   GFP_ATOMIC | __GFP_NOWARN,
				   htab->map.numa_node);
	if (!new) {
		atomic_dec(&htab->count);
		return ERR_PTR(-ENOMEM);
@@ -1103,7 +1095,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
{
	struct bpf_shtab *htab;
	int i, err;
	u64 cost;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);
@@ -1116,7 +1107,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
	if (attr->key_size > MAX_BPF_STACK)
		return ERR_PTR(-E2BIG);

	htab = kzalloc(sizeof(*htab), GFP_USER);
	htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
	if (!htab)
		return ERR_PTR(-ENOMEM);

@@ -1131,21 +1122,10 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
		goto free_htab;
	}

	cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;
	if (cost >= U32_MAX - PAGE_SIZE) {
		err = -EINVAL;
		goto free_htab;
	}
	err = bpf_map_charge_init(&htab->map.memory, cost);
	if (err)
		goto free_htab;

	htab->buckets = bpf_map_area_alloc(htab->buckets_num *
					   sizeof(struct bpf_shtab_bucket),
					   htab->map.numa_node);
	if (!htab->buckets) {
		bpf_map_charge_finish(&htab->map.memory);
		err = -ENOMEM;
		goto free_htab;
	}
net/core/xdp.c
@@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)

/* Returns 0 on success, negative on failure */
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
		     struct net_device *dev, u32 queue_index)
		     struct net_device *dev, u32 queue_index, unsigned int napi_id)
{
	if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
		WARN(1, "Driver promised not to register this");
@@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
	xdp_rxq_info_init(xdp_rxq);
	xdp_rxq->dev = dev;
	xdp_rxq->queue_index = queue_index;
	xdp_rxq->napi_id = napi_id;

	xdp_rxq->reg_state = REG_STATE_REGISTERED;
	return 0;
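Editor's note: a hedged driver-side sketch of the new registration call. The struct my_rx_ring, its fields and my_ring_setup_xdp() are illustrative only; the driver hunks at the top of this pull (ena, bnxt, nicvf, ...) simply pass 0 or their queue's NAPI id here:

#include <linux/netdevice.h>
#include <net/xdp.h>

/* illustrative ring structure, not from any real driver */
struct my_rx_ring {
	struct xdp_rxq_info xdp_rxq;
	struct net_device *netdev;
	struct napi_struct napi;
	u32 queue_index;
};

static int my_ring_setup_xdp(struct my_rx_ring *ring)
{
	int err;

	/* Passing the queue's NAPI id lets busy-polling sockets locate and
	 * drive this queue's NAPI context; 0 keeps the old behaviour.
	 */
	err = xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
			       ring->queue_index, ring->napi.napi_id);
	if (err)
		return err;

	return xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
					  MEM_TYPE_PAGE_SHARED, NULL);
}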
net/ipv4/af_inet.c
@@ -450,7 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
	/* BPF prog is run before any checks are done so that if the prog
	 * changes context in a wrong way it will be caught.
	 */
	err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
	err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr);
	if (err)
		return err;
net/ipv4/bpf_tcp_ca.c
@@ -95,6 +95,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size,
}

static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
					const struct btf *btf,
					const struct btf_type *t, int off,
					int size, enum bpf_access_type atype,
					u32 *next_btf_id)
@@ -102,7 +103,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
	size_t end;

	if (atype == BPF_READ)
		return btf_struct_access(log, t, off, size, atype, next_btf_id);
		return btf_struct_access(log, btf, t, off, size, atype, next_btf_id);

	if (t != tcp_sock_type) {
		bpf_log(log, "only read is supported\n");
net/ipv4/tcp.c
@@ -3042,6 +3042,21 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
}
EXPORT_SYMBOL(tcp_sock_set_keepcnt);

int tcp_set_window_clamp(struct sock *sk, int val)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!val) {
		if (sk->sk_state != TCP_CLOSE)
			return -EINVAL;
		tp->window_clamp = 0;
	} else {
		tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
			SOCK_MIN_RCVBUF / 2 : val;
	}
	return 0;
}

/*
 *	Socket option code for TCP.
 */
@@ -3255,15 +3270,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
		break;

	case TCP_WINDOW_CLAMP:
		if (!val) {
			if (sk->sk_state != TCP_CLOSE) {
				err = -EINVAL;
				break;
			}
			tp->window_clamp = 0;
		} else
			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
						SOCK_MIN_RCVBUF / 2 : val;
		err = tcp_set_window_clamp(sk, val);
		break;

	case TCP_QUICKACK:
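Editor's note: the TCP_WINDOW_CLAMP case added to _bpf_setsockopt() earlier calls into the tcp_set_window_clamp() helper factored out here. An illustrative sockops program using the newly allowed option (constants defined locally with the standard uapi values; the clamp value is an example):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_TCP
#define SOL_TCP			6
#endif
#ifndef TCP_WINDOW_CLAMP
#define TCP_WINDOW_CLAMP	10
#endif

SEC("sockops")
int clamp_window(struct bpf_sock_ops *skops)
{
	int clamp = 16384;	/* example; the kernel enforces a floor of SOCK_MIN_RCVBUF / 2 */

	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
	    skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_TCP, TCP_WINDOW_CLAMP,
			       &clamp, sizeof(clamp));
	return 1;
}

char _license[] SEC("license") = "GPL";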
net/ipv6/af_inet6.c
@@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
	/* BPF prog is run before any checks are done so that if the prog
	 * changes context in a wrong way it will be caught.
	 */
	err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
	err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr);
	if (err)
		return err;
net/xdp/xsk.c
@@ -23,6 +23,7 @@
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/xdp.h>

#include "xsk_queue.h"
@@ -232,6 +233,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
	len = xdp->data_end - xdp->data;

	return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
@@ -332,6 +334,63 @@ out:
}
EXPORT_SYMBOL(xsk_tx_peek_desc);

static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
					u32 max_entries)
{
	u32 nb_pkts = 0;

	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
		nb_pkts++;

	xsk_tx_release(pool);
	return nb_pkts;
}

u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
				   u32 max_entries)
{
	struct xdp_sock *xs;
	u32 nb_pkts;

	rcu_read_lock();
	if (!list_is_singular(&pool->xsk_tx_list)) {
		/* Fallback to the non-batched version */
		rcu_read_unlock();
		return xsk_tx_peek_release_fallback(pool, descs, max_entries);
	}

	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
	if (!xs) {
		nb_pkts = 0;
		goto out;
	}

	nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
	if (!nb_pkts) {
		xs->tx->queue_empty_descs++;
		goto out;
	}

	/* This is the backpressure mechanism for the Tx path. Try to
	 * reserve space in the completion queue for all packets, but
	 * if there are fewer slots available, just process that many
	 * packets. This avoids having to implement any buffering in
	 * the Tx path.
	 */
	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
	if (!nb_pkts)
		goto out;

	xskq_cons_release_n(xs->tx, nb_pkts);
	__xskq_cons_release(xs->tx);
	xs->sk.sk_write_space(&xs->sk);

out:
	rcu_read_unlock();
	return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
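Editor's note: a hedged sketch of how a zero-copy driver might consume the batched Tx API above. my_xmit_zc() and the caller-provided descs array are illustrative; a real driver would write the returned addresses into its own hardware ring:

#include <net/xdp_sock_drv.h>

static bool my_xmit_zc(struct xsk_buff_pool *pool, struct xdp_desc *descs,
		       unsigned int budget)
{
	u32 nb_pkts, i;

	/* Pull up to `budget` Tx descriptors and reserve completion-queue
	 * space for them in a single call.
	 */
	nb_pkts = xsk_tx_peek_release_desc_batch(pool, descs, budget);
	for (i = 0; i < nb_pkts; i++) {
		dma_addr_t dma = xsk_buff_raw_get_dma(pool, descs[i].addr);

		xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len);
		/* ... post dma/len to the hardware Tx descriptor here ... */
	}
	return nb_pkts < budget;	/* true when the Tx ring was drained */
}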
static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
@@ -454,18 +513,65 @@ static int __xsk_sendmsg(struct sock *sk)
	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

static bool xsk_no_wakeup(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* Prefer busy-polling, skip the wakeup. */
	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
#else
	return false;
#endif
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	return __xsk_sendmsg(sk);
	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	pool = xs->pool;
	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return __xsk_sendmsg(sk);
	return 0;
}

static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
	bool need_wait = !(flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->rx))
		return -ENOBUFS;
	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
		return xsk_wakeup(xs, XDP_WAKEUP_RX);
	return 0;
}

static __poll_t xsk_poll(struct file *file, struct socket *sock,
@@ -542,7 +648,7 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		WARN_ON(xsk_map_inc(node->map));
		bpf_map_inc(&node->map->map);
		map = node->map;
		*map_entry = node->map_entry;
	}
@@ -572,7 +678,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		xsk_map_put(map);
		bpf_map_put(&map->map);
	}
}

@@ -1128,7 +1234,7 @@ static const struct proto_ops xsk_proto_ops = {
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.recvmsg	= xsk_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};
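Editor's note: with xsk_sendmsg()/xsk_recvmsg() now entering sk_busy_loop(), an AF_XDP application drives the NAPI context with plain non-blocking syscalls on its socket. A minimal sketch; xsk_fd stands for whatever descriptor the application's usual AF_XDP setup produced:

#include <sys/socket.h>

static void kick_napi(int xsk_fd)
{
	/* With busy polling preferred, these calls poll the driver's NAPI
	 * context directly instead of waiting for an interrupt-driven wakeup.
	 */
	recvfrom(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, NULL);
	sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
}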
net/xdp/xsk.h
@@ -41,8 +41,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk)

void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
			     struct xdp_sock **map_entry);
int xsk_map_inc(struct xsk_map *map);
void xsk_map_put(struct xsk_map *map);
void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
			u16 queue_id);
net/xdp/xsk_buff_pool.c
@@ -144,14 +144,13 @@ static int __xp_assign_dev(struct xsk_buff_pool *pool,
	if (err)
		return err;

	if (flags & XDP_USE_NEED_WAKEUP) {
	if (flags & XDP_USE_NEED_WAKEUP)
		pool->uses_need_wakeup = true;
		/* Tx needs to be explicitly woken up the first time.
		 * Also for supporting drivers that do not implement this
		 * feature. They will always have to call sendto().
		 */
		pool->cached_need_wakeup = XDP_WAKEUP_TX;
	}
	/* Tx needs to be explicitly woken up the first time. Also
	 * for supporting drivers that do not implement this
	 * feature. They will always have to call sendto() or poll().
	 */
	pool->cached_need_wakeup = XDP_WAKEUP_TX;

	dev_hold(netdev);
net/xdp/xsk_queue.h
@@ -18,9 +18,11 @@ struct xdp_ring {
	/* Hinder the adjacent cache prefetcher to prefetch the consumer
	 * pointer if the producer pointer is touched and vice versa.
	 */
	u32 pad ____cacheline_aligned_in_smp;
	u32 pad1 ____cacheline_aligned_in_smp;
	u32 consumer ____cacheline_aligned_in_smp;
	u32 pad2 ____cacheline_aligned_in_smp;
	u32 flags;
	u32 pad3 ____cacheline_aligned_in_smp;
};

/* Used for the RX and TX queues for packets */
@@ -197,6 +199,30 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
	return false;
}

static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q,
					    struct xdp_desc *descs,
					    struct xsk_buff_pool *pool, u32 max)
{
	u32 cached_cons = q->cached_cons, nb_entries = 0;

	while (cached_cons != q->cached_prod && nb_entries < max) {
		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
		u32 idx = cached_cons & q->ring_mask;

		descs[nb_entries] = ring->desc[idx];
		if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) {
			/* Skip the entry */
			cached_cons++;
			continue;
		}

		nb_entries++;
		cached_cons++;
	}

	return nb_entries;
}

/* Functions for consumers */

static inline void __xskq_cons_release(struct xsk_queue *q)
@@ -218,17 +244,22 @@ static inline void xskq_cons_get_entries(struct xsk_queue *q)
	__xskq_cons_peek(q);
}

static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max)
{
	u32 entries = q->cached_prod - q->cached_cons;

	if (entries >= cnt)
		return true;
	if (entries >= max)
		return max;

	__xskq_cons_peek(q);
	entries = q->cached_prod - q->cached_cons;

	return entries >= cnt;
	return entries >= max ? max : entries;
}

static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
{
	return xskq_cons_nb_entries(q, cnt) >= cnt ? true : false;
}

static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
@@ -247,16 +278,28 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
	return xskq_cons_read_desc(q, desc, pool);
}

static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs,
					    struct xsk_buff_pool *pool, u32 max)
{
	u32 entries = xskq_cons_nb_entries(q, max);

	return xskq_cons_read_desc_batch(q, descs, pool, entries);
}

/* To improve performance in the xskq_cons_release functions, only update local state here.
 * Reflect this to global state when we get new entries from the ring in
 * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop.
 */
static inline void xskq_cons_release(struct xsk_queue *q)
{
	/* To improve performance, only update local state here.
	 * Reflect this to global state when we get new entries
	 * from the ring in xskq_cons_get_entries() and whenever
	 * Rx or Tx processing are completed in the NAPI loop.
	 */
	q->cached_cons++;
}

static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
{
	q->cached_cons += cnt;
}

static inline bool xskq_cons_is_full(struct xsk_queue *q)
{
	/* No barriers needed since data is not accessed */
@@ -266,18 +309,23 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q)

/* Functions for producers */

static inline bool xskq_prod_is_full(struct xsk_queue *q)
static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
	u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);

	if (free_entries)
		return false;
	if (free_entries >= max)
		return max;

	/* Refresh the local tail pointer */
	q->cached_cons = READ_ONCE(q->ring->consumer);
	free_entries = q->nentries - (q->cached_prod - q->cached_cons);

	return !free_entries;
	return free_entries >= max ? max : free_entries;
}

static inline bool xskq_prod_is_full(struct xsk_queue *q)
{
	return xskq_prod_nb_free(q, 1) ? false : true;
}

static inline int xskq_prod_reserve(struct xsk_queue *q)
@@ -302,6 +350,23 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
	return 0;
}

static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
					       u32 max)
{
	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
	u32 nb_entries, i, cached_prod;

	nb_entries = xskq_prod_nb_free(q, max);

	/* A, matches D */
	cached_prod = q->cached_prod;
	for (i = 0; i < nb_entries; i++)
		ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr;
	q->cached_prod = cached_prod;

	return nb_entries;
}

static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
					 u64 addr, u32 len)
{
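Editor's note: the batching helpers above lean on the ring's free-running u32 producer/consumer indices. A standalone worked example (not kernel code) of the wraparound-safe arithmetic used by xskq_prod_nb_free():

#include <stdint.h>
#include <stdio.h>

static uint32_t nb_free(uint32_t nentries, uint32_t cached_prod,
			uint32_t cached_cons, uint32_t max)
{
	/* indices are free-running, so subtraction stays correct across wrap */
	uint32_t free_entries = nentries - (cached_prod - cached_cons);

	return free_entries >= max ? max : free_entries;
}

int main(void)
{
	/* 4096-entry ring whose indices have wrapped past UINT32_MAX:
	 * 16 entries are in use, so a batch of 64 still fits.
	 */
	printf("%u\n", nb_free(4096, 5u, UINT32_MAX - 10u, 64));	/* prints 64 */
	return 0;
}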
net/xdp/xskmap.c
@@ -11,32 +11,17 @@

#include "xsk.h"

int xsk_map_inc(struct xsk_map *map)
{
	bpf_map_inc(&map->map);
	return 0;
}

void xsk_map_put(struct xsk_map *map)
{
	bpf_map_put(&map->map);
}

static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
					       struct xdp_sock **map_entry)
{
	struct xsk_map_node *node;
	int err;

	node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
	node = bpf_map_kzalloc(&map->map, sizeof(*node),
			       GFP_ATOMIC | __GFP_NOWARN);
	if (!node)
		return ERR_PTR(-ENOMEM);

	err = xsk_map_inc(map);
	if (err) {
		kfree(node);
		return ERR_PTR(err);
	}
	bpf_map_inc(&map->map);

	node->map = map;
	node->map_entry = map_entry;
@@ -45,7 +30,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,

static void xsk_map_node_free(struct xsk_map_node *node)
{
	xsk_map_put(node->map);
	bpf_map_put(&node->map->map);
	kfree(node);
}

@@ -73,9 +58,8 @@ static void xsk_map_sock_delete(struct xdp_sock *xs,

static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
	struct bpf_map_memory mem;
	int err, numa_node;
	struct xsk_map *m;
	int numa_node;
	u64 size;

	if (!capable(CAP_NET_ADMIN))
@@ -89,18 +73,11 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
	numa_node = bpf_map_attr_numa_node(attr);
	size = struct_size(m, xsk_map, attr->max_entries);

	err = bpf_map_charge_init(&mem, size);
	if (err < 0)
		return ERR_PTR(err);

	m = bpf_map_area_alloc(size, numa_node);
	if (!m) {
		bpf_map_charge_finish(&mem);
	if (!m)
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&m->map, attr);
	bpf_map_charge_move(&m->map.memory, &mem);
	spin_lock_init(&m->lock);

	return &m->map;
samples/bpf/.gitignore
@@ -52,3 +52,6 @@ xdp_tx_iptunnel
xdpsock
xsk_fwd
testfile.img
hbm_out.log
iperf.*
*.out
samples/bpf/Makefile
@@ -48,6 +48,7 @@ tprogs-y += syscall_tp
tprogs-y += cpustat
tprogs-y += xdp_adjust_tail
tprogs-y += xdpsock
tprogs-y += xdpsock_ctrl_proc
tprogs-y += xsk_fwd
tprogs-y += xdp_fwd
tprogs-y += task_fd_query
@@ -73,16 +74,16 @@ tracex5-objs := tracex5_user.o $(TRACE_HELPERS)
tracex6-objs := tracex6_user.o
tracex7-objs := tracex7_user.o
test_probe_write_user-objs := test_probe_write_user_user.o
trace_output-objs := trace_output_user.o $(TRACE_HELPERS)
trace_output-objs := trace_output_user.o
lathist-objs := lathist_user.o
offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS)
spintest-objs := spintest_user.o $(TRACE_HELPERS)
map_perf_test-objs := map_perf_test_user.o
test_overhead-objs := bpf_load.o test_overhead_user.o
test_overhead-objs := test_overhead_user.o
test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
test_cgrp2_attach-objs := test_cgrp2_attach.o
test_cgrp2_sock-objs := test_cgrp2_sock.o
test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o
test_cgrp2_sock2-objs := test_cgrp2_sock2.o
xdp1-objs := xdp1_user.o
# reuse xdp1 source intentionally
xdp2-objs := xdp1_user.o
@@ -91,8 +92,8 @@ test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \
				       test_current_task_under_cgroup_user.o
trace_event-objs := trace_event_user.o $(TRACE_HELPERS)
sampleip-objs := sampleip_user.o $(TRACE_HELPERS)
tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o
lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o
tc_l2_redirect-objs := tc_l2_redirect_user.o
lwt_len_hist-objs := lwt_len_hist_user.o
xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o
test_map_in_map-objs := test_map_in_map_user.o
per_socket_stats_example-objs := cookie_uid_helper_example.o
@@ -105,12 +106,13 @@ syscall_tp-objs := syscall_tp_user.o
cpustat-objs := cpustat_user.o
xdp_adjust_tail-objs := xdp_adjust_tail_user.o
xdpsock-objs := xdpsock_user.o
xdpsock_ctrl_proc-objs := xdpsock_ctrl_proc.o
xsk_fwd-objs := xsk_fwd.o
xdp_fwd-objs := xdp_fwd_user.o
task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
task_fd_query-objs := task_fd_query_user.o $(TRACE_HELPERS)
xdp_sample_pkts-objs := xdp_sample_pkts_user.o
ibumad-objs := ibumad_user.o
hbm-objs := hbm.o $(CGROUP_HELPERS)

# Tell kbuild to always build the programs
always-y := $(tprogs-y)
@@ -197,14 +199,12 @@ TPROGS_CFLAGS += --sysroot=$(SYSROOT)
TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib
endif

TPROGCFLAGS_bpf_load.o += -Wno-unused-variable

TPROGS_LDLIBS += $(LIBBPF) -lelf -lz
TPROGLDLIBS_tracex4 += -lrt
TPROGLDLIBS_trace_output += -lrt
TPROGLDLIBS_map_perf_test += -lrt
TPROGLDLIBS_test_overhead += -lrt
TPROGLDLIBS_xdpsock += -pthread
TPROGLDLIBS_xdpsock += -pthread -lcap
TPROGLDLIBS_xsk_fwd += -pthread

# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
samples/bpf/bpf_load.c
@@ -1,667 +0,0 @@
[Entire file deleted: the samples' legacy libelf-based loader (load_bpf_file(), load_bpf_file_fixup_map(), load_and_attach(), write_kprobe_events(), populate_prog_array(), load_maps(), load_elf_maps_section(), parse_relo_and_apply() and the kprobe/tracepoint/perf_event attach glue) is removed, matching the Makefile hunks above that drop the last bpf_load.o users.]
samples/bpf/bpf_load.h
@@ -1,57 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BPF_LOAD_H
#define __BPF_LOAD_H

#include <bpf/bpf.h>

#define MAX_MAPS 32
#define MAX_PROGS 32

struct bpf_load_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
	unsigned int map_flags;
	unsigned int inner_map_idx;
	unsigned int numa_node;
};

struct bpf_map_data {
	int fd;
	char *name;
	size_t elf_offset;
	struct bpf_load_map_def def;
};

typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);

extern int prog_fd[MAX_PROGS];
extern int event_fd[MAX_PROGS];
extern char bpf_log_buf[BPF_LOG_BUF_SIZE];
extern int prog_cnt;

/* There is a one-to-one mapping between map_fd[] and map_data[].
 * The map_data[] just contains more rich info on the given map.
 */
extern int map_fd[MAX_MAPS];
extern struct bpf_map_data map_data[MAX_MAPS];
extern int map_data_count;

/* parses elf file compiled by llvm .c->.o
 * . parses 'maps' section and creates maps via BPF syscall
 * . parses 'license' section and passes it to syscall
 * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by
 *   storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD
 * . loads eBPF programs via BPF syscall
 *
 * One ELF file can contain multiple BPF programs which will be loaded
 * and their FDs stored stored in prog_fd array
 *
 * returns zero on success
 */
int load_bpf_file(char *path);
int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);

int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
#endif
Some files were not shown because too many files have changed in this diff.