diff --git a/mm/internal.h b/mm/internal.h index 29e1e761f9eb..3e910000fda4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -370,5 +370,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#define ALLOC_FAIR 0x100 /* fair zone allocation */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73c25912c7c4..15d140755e71 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1239,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } local_irq_restore(flags); } -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; -} -#else -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return false; -} #endif /* @@ -1584,12 +1575,7 @@ again: get_pageblock_migratetype(page)); } - /* - * NOTE: GFP_THISNODE allocations do not partake in the kswapd - * aging protocol, so they can't be fair. - */ - if (!gfp_thisnode_allocation(gfp_flags)) - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1955,23 +1941,12 @@ zonelist_scan: * zone size to ensure fair page aging. The zone a * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. - * - * Try to stay in local zones in the fastpath. If - * that fails, the slowpath is entered, which will do - * another pass starting with the local zones, but - * ultimately fall back to remote zones that do not - * partake in the fairness round-robin cycle of this - * zonelist. - * - * NOTE: GFP_THISNODE allocations do not partake in - * the kswapd aging protocol, so they can't be fair. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - !gfp_thisnode_allocation(gfp_mask)) { - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) continue; + if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + continue; } /* * When allocating a page cache page for writing, we @@ -2409,7 +2384,29 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, +static void reset_alloc_batches(struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + /* + * Only reset the batches of zones that were actually + * considered in the fairness pass, we don't want to + * trash fairness information for zones that are not + * actually part of this zonelist's round-robin cycle. + */ + if (!zone_local(preferred_zone, zone)) + continue; + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + } +} + +static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, struct zone *preferred_zone) @@ -2417,22 +2414,8 @@ static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, struct zoneref *z; struct zone *zone; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wakeup_kswapd(zone, order, zone_idx(preferred_zone)); - /* - * Only reset the batches of zones that were actually - * considered in the fast path, we don't want to - * thrash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); - } + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order, zone_idx(preferred_zone)); } static inline int @@ -2523,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (gfp_thisnode_allocation(gfp_mask)) + if (IS_ENABLED(CONFIG_NUMA) && + (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; restart: - prepare_slowpath(gfp_mask, order, zonelist, - high_zoneidx, preferred_zone); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); /* * OK, we're below the kswapd watermark and have kicked background @@ -2712,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2753,11 +2737,28 @@ retry_cpuset: if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif +retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); if (unlikely(!page)) { + /* + * The first pass makes sure allocations are spread + * fairly within the local node. However, the local + * node might have free pages left after the fairness + * batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without + * fairness, and include remote zones now, before + * entering the slowpath and waking kswapd: prefer + * spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + reset_alloc_batches(zonelist, high_zoneidx, + preferred_zone); + alloc_flags &= ~ALLOC_FAIR; + goto retry; + } /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not