mirror of
https://github.com/torvalds/linux.git
synced 2024-12-20 18:11:47 +00:00
6a5bcd84e8
Up to now several high speed NICs have custom mechanisms of recycling the allocated memory they use for their payloads. Our page_pool API already has recycling capabilities that are always used when we are running in 'XDP mode'. So let's tweak the API and the kernel network stack slightly and allow the recycling to happen even during the standard operation. The API doesn't take into account 'split page' policies used by those drivers currently, but can be extended once we have users for that. The idea is to be able to intercept the packet on skb_release_data(). If it's a buffer coming from our page_pool API recycle it back to the pool for further usage or just release the packet entirely. To achieve that we introduce a bit in struct sk_buff (pp_recycle:1) and a field in struct page (page->pp) to store the page_pool pointer. Storing the information in page->pp allows us to recycle both SKBs and their fragments. We could have skipped the skb bit entirely, since identical information can bederived from struct page. However, in an effort to affect the free path as less as possible, reading a single bit in the skb which is already in cache, is better that trying to derive identical information for the page stored data. The driver or page_pool has to take care of the sync operations on it's own during the buffer recycling since the buffer is, after opting-in to the recycling, never unmapped. Since the gain on the drivers depends on the architecture, we are not enabling recycling by default if the page_pool API is used on a driver. In order to enable recycling the driver must call skb_mark_for_recycle() to store the information we need for recycling in page->pp and enabling the recycling bit, or page_pool_store_mem_info() for a fragment. Co-developed-by: Jesper Dangaard Brouer <brouer@redhat.com> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> Co-developed-by: Matteo Croce <mcroce@microsoft.com> Signed-off-by: Matteo Croce <mcroce@microsoft.com> Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> Signed-off-by: David S. Miller <davem@davemloft.net>
264 lines
7.8 KiB
C
264 lines
7.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0
|
|
*
|
|
* page_pool.h
|
|
* Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
|
|
* Copyright (C) 2016 Red Hat, Inc.
|
|
*/
|
|
|
|
/**
|
|
* DOC: page_pool allocator
|
|
*
|
|
* This page_pool allocator is optimized for the XDP mode that
|
|
* uses one-frame-per-page, but have fallbacks that act like the
|
|
* regular page allocator APIs.
|
|
*
|
|
* Basic use involve replacing alloc_pages() calls with the
|
|
* page_pool_alloc_pages() call. Drivers should likely use
|
|
* page_pool_dev_alloc_pages() replacing dev_alloc_pages().
|
|
*
|
|
* API keeps track of in-flight pages, in-order to let API user know
|
|
* when it is safe to dealloactor page_pool object. Thus, API users
|
|
* must make sure to call page_pool_release_page() when a page is
|
|
* "leaving" the page_pool. Or call page_pool_put_page() where
|
|
* appropiate. For maintaining correct accounting.
|
|
*
|
|
* API user must only call page_pool_put_page() once on a page, as it
|
|
* will either recycle the page, or in case of elevated refcnt, it
|
|
* will release the DMA mapping and in-flight state accounting. We
|
|
* hope to lift this requirement in the future.
|
|
*/
|
|
#ifndef _NET_PAGE_POOL_H
|
|
#define _NET_PAGE_POOL_H
|
|
|
|
#include <linux/mm.h> /* Needed by ptr_ring */
|
|
#include <linux/ptr_ring.h>
|
|
#include <linux/dma-direction.h>
|
|
|
|
#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA
|
|
* map/unmap
|
|
*/
|
|
#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets
|
|
* from page_pool will be
|
|
* DMA-synced-for-device according to
|
|
* the length provided by the device
|
|
* driver.
|
|
* Please note DMA-sync-for-CPU is still
|
|
* device driver responsibility
|
|
*/
|
|
#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
|
|
|
|
/*
|
|
* Fast allocation side cache array/stack
|
|
*
|
|
* The cache size and refill watermark is related to the network
|
|
* use-case. The NAPI budget is 64 packets. After a NAPI poll the RX
|
|
* ring is usually refilled and the max consumed elements will be 64,
|
|
* thus a natural max size of objects needed in the cache.
|
|
*
|
|
* Keeping room for more objects, is due to XDP_DROP use-case. As
|
|
* XDP_DROP allows the opportunity to recycle objects directly into
|
|
* this array, as it shares the same softirq/NAPI protection. If
|
|
* cache is already full (or partly full) then the XDP_DROP recycles
|
|
* would have to take a slower code path.
|
|
*/
|
|
#define PP_ALLOC_CACHE_SIZE 128
|
|
#define PP_ALLOC_CACHE_REFILL 64
|
|
struct pp_alloc_cache {
|
|
u32 count;
|
|
struct page *cache[PP_ALLOC_CACHE_SIZE];
|
|
};
|
|
|
|
struct page_pool_params {
|
|
unsigned int flags;
|
|
unsigned int order;
|
|
unsigned int pool_size;
|
|
int nid; /* Numa node id to allocate from pages from */
|
|
struct device *dev; /* device, for DMA pre-mapping purposes */
|
|
enum dma_data_direction dma_dir; /* DMA mapping direction */
|
|
unsigned int max_len; /* max DMA sync memory size */
|
|
unsigned int offset; /* DMA addr offset */
|
|
};
|
|
|
|
struct page_pool {
|
|
struct page_pool_params p;
|
|
|
|
struct delayed_work release_dw;
|
|
void (*disconnect)(void *);
|
|
unsigned long defer_start;
|
|
unsigned long defer_warn;
|
|
|
|
u32 pages_state_hold_cnt;
|
|
|
|
/*
|
|
* Data structure for allocation side
|
|
*
|
|
* Drivers allocation side usually already perform some kind
|
|
* of resource protection. Piggyback on this protection, and
|
|
* require driver to protect allocation side.
|
|
*
|
|
* For NIC drivers this means, allocate a page_pool per
|
|
* RX-queue. As the RX-queue is already protected by
|
|
* Softirq/BH scheduling and napi_schedule. NAPI schedule
|
|
* guarantee that a single napi_struct will only be scheduled
|
|
* on a single CPU (see napi_schedule).
|
|
*/
|
|
struct pp_alloc_cache alloc ____cacheline_aligned_in_smp;
|
|
|
|
/* Data structure for storing recycled pages.
|
|
*
|
|
* Returning/freeing pages is more complicated synchronization
|
|
* wise, because free's can happen on remote CPUs, with no
|
|
* association with allocation resource.
|
|
*
|
|
* Use ptr_ring, as it separates consumer and producer
|
|
* effeciently, it a way that doesn't bounce cache-lines.
|
|
*
|
|
* TODO: Implement bulk return pages into this structure.
|
|
*/
|
|
struct ptr_ring ring;
|
|
|
|
atomic_t pages_state_release_cnt;
|
|
|
|
/* A page_pool is strictly tied to a single RX-queue being
|
|
* protected by NAPI, due to above pp_alloc_cache. This
|
|
* refcnt serves purpose is to simplify drivers error handling.
|
|
*/
|
|
refcount_t user_cnt;
|
|
|
|
u64 destroy_cnt;
|
|
};
|
|
|
|
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
|
|
|
|
static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
|
|
{
|
|
gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
|
|
|
|
return page_pool_alloc_pages(pool, gfp);
|
|
}
|
|
|
|
/* get the stored dma direction. A driver might decide to treat this locally and
|
|
* avoid the extra cache line from page_pool to determine the direction
|
|
*/
|
|
static
|
|
inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
|
|
{
|
|
return pool->p.dma_dir;
|
|
}
|
|
|
|
bool page_pool_return_skb_page(struct page *page);
|
|
|
|
struct page_pool *page_pool_create(const struct page_pool_params *params);
|
|
|
|
#ifdef CONFIG_PAGE_POOL
|
|
void page_pool_destroy(struct page_pool *pool);
|
|
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *));
|
|
void page_pool_release_page(struct page_pool *pool, struct page *page);
|
|
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
|
|
int count);
|
|
#else
|
|
static inline void page_pool_destroy(struct page_pool *pool)
|
|
{
|
|
}
|
|
|
|
static inline void page_pool_use_xdp_mem(struct page_pool *pool,
|
|
void (*disconnect)(void *))
|
|
{
|
|
}
|
|
static inline void page_pool_release_page(struct page_pool *pool,
|
|
struct page *page)
|
|
{
|
|
}
|
|
|
|
static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data,
|
|
int count)
|
|
{
|
|
}
|
|
#endif
|
|
|
|
void page_pool_put_page(struct page_pool *pool, struct page *page,
|
|
unsigned int dma_sync_size, bool allow_direct);
|
|
|
|
/* Same as above but will try to sync the entire area pool->max_len */
|
|
static inline void page_pool_put_full_page(struct page_pool *pool,
|
|
struct page *page, bool allow_direct)
|
|
{
|
|
/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
|
|
* allow registering MEM_TYPE_PAGE_POOL, but shield linker.
|
|
*/
|
|
#ifdef CONFIG_PAGE_POOL
|
|
page_pool_put_page(pool, page, -1, allow_direct);
|
|
#endif
|
|
}
|
|
|
|
/* Same as above but the caller must guarantee safe context. e.g NAPI */
|
|
static inline void page_pool_recycle_direct(struct page_pool *pool,
|
|
struct page *page)
|
|
{
|
|
page_pool_put_full_page(pool, page, true);
|
|
}
|
|
|
|
static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
|
|
{
|
|
dma_addr_t ret = page->dma_addr[0];
|
|
if (sizeof(dma_addr_t) > sizeof(unsigned long))
|
|
ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
|
|
return ret;
|
|
}
|
|
|
|
static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
|
|
{
|
|
page->dma_addr[0] = addr;
|
|
if (sizeof(dma_addr_t) > sizeof(unsigned long))
|
|
page->dma_addr[1] = upper_32_bits(addr);
|
|
}
|
|
|
|
static inline bool is_page_pool_compiled_in(void)
|
|
{
|
|
#ifdef CONFIG_PAGE_POOL
|
|
return true;
|
|
#else
|
|
return false;
|
|
#endif
|
|
}
|
|
|
|
static inline bool page_pool_put(struct page_pool *pool)
|
|
{
|
|
return refcount_dec_and_test(&pool->user_cnt);
|
|
}
|
|
|
|
/* Caller must provide appropriate safe context, e.g. NAPI. */
|
|
void page_pool_update_nid(struct page_pool *pool, int new_nid);
|
|
static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
|
|
{
|
|
if (unlikely(pool->p.nid != new_nid))
|
|
page_pool_update_nid(pool, new_nid);
|
|
}
|
|
|
|
static inline void page_pool_ring_lock(struct page_pool *pool)
|
|
__acquires(&pool->ring.producer_lock)
|
|
{
|
|
if (in_serving_softirq())
|
|
spin_lock(&pool->ring.producer_lock);
|
|
else
|
|
spin_lock_bh(&pool->ring.producer_lock);
|
|
}
|
|
|
|
static inline void page_pool_ring_unlock(struct page_pool *pool)
|
|
__releases(&pool->ring.producer_lock)
|
|
{
|
|
if (in_serving_softirq())
|
|
spin_unlock(&pool->ring.producer_lock);
|
|
else
|
|
spin_unlock_bh(&pool->ring.producer_lock);
|
|
}
|
|
|
|
/* Store mem_info on struct page and use it while recycling skb frags */
|
|
static inline
|
|
void page_pool_store_mem_info(struct page *page, struct page_pool *pp)
|
|
{
|
|
page->pp = pp;
|
|
}
|
|
|
|
#endif /* _NET_PAGE_POOL_H */
|