mirror of
https://github.com/torvalds/linux.git
synced 2024-11-25 13:41:51 +00:00
mm: compaction: Abort async compaction if locks are contended or taking too long
Jim Schutt reported a problem that pointed at compaction contending
heavily on locks. The workload is straight-forward and in his own words;
The systems in question have 24 SAS drives spread across 3 HBAs,
running 24 Ceph OSD instances, one per drive. FWIW these servers
are dual-socket Intel 5675 Xeons w/48 GB memory. I've got ~160
Ceph Linux clients doing dd simultaneously to a Ceph file system
backed by 12 of these servers.
Early in the test everything looks fine
procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
r b swpd free buff cache si so bi bo in cs us sy id wa st
31 15 0 287216 576 38606628 0 0 2 1158 2 14 1 3 95 0 0
27 15 0 225288 576 38583384 0 0 18 2222016 203357 134876 11 56 17 15 0
28 17 0 219256 576 38544736 0 0 11 2305932 203141 146296 11 49 23 17 0
6 18 0 215596 576 38552872 0 0 7 2363207 215264 166502 12 45 22 20 0
22 18 0 226984 576 38596404 0 0 3 2445741 223114 179527 12 43 23 22 0
and then it goes to pot
procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu-------
r b swpd free buff cache si so bi bo in cs us sy id wa st
163 8 0 464308 576 36791368 0 0 11 22210 866 536 3 13 79 4 0
207 14 0 917752 576 36181928 0 0 712 1345376 134598 47367 7 90 1 2 0
123 12 0 685516 576 36296148 0 0 429 1386615 158494 60077 8 84 5 3 0
123 12 0 598572 576 36333728 0 0 1107 1233281 147542 62351 7 84 5 4 0
622 7 0 660768 576 36118264 0 0 557 1345548 151394 59353 7 85 4 3 0
223 11 0 283960 576 36463868 0 0 46 1107160 121846 33006 6 93 1 1 0
Note that system CPU usage is very high blocks being written out has
dropped by 42%. He analysed this with perf and found
perf record -g -a sleep 10
perf report --sort symbol --call-graph fractal,5
34.63% [k] _raw_spin_lock_irqsave
|
|--97.30%-- isolate_freepages
| compaction_alloc
| unmap_and_move
| migrate_pages
| compact_zone
| compact_zone_order
| try_to_compact_pages
| __alloc_pages_direct_compact
| __alloc_pages_slowpath
| __alloc_pages_nodemask
| alloc_pages_vma
| do_huge_pmd_anonymous_page
| handle_mm_fault
| do_page_fault
| page_fault
| |
| |--87.39%-- skb_copy_datagram_iovec
| | tcp_recvmsg
| | inet_recvmsg
| | sock_recvmsg
| | sys_recvfrom
| | system_call
| | __recv
| | |
| | --100.00%-- (nil)
| |
| --12.61%-- memcpy
--2.70%-- [...]
There was other data but primarily it is all showing that compaction is
contended heavily on the zone->lock and zone->lru_lock.
commit [b2eef8c0
: mm: compaction: minimise the time IRQs are disabled
while isolating pages for migration] noted that it was possible for
migration to hold the lru_lock for an excessive amount of time. Very
broadly speaking this patch expands the concept.
This patch introduces compact_checklock_irqsave() to check if a lock
is contended or the process needs to be scheduled. If either condition
is true then async compaction is aborted and the caller is informed.
The page allocator will fail a THP allocation if compaction failed due
to contention. This patch also introduces compact_trylock_irqsave()
which will acquire the lock only if it is not contended and the process
does not need to schedule.
Reported-by: Jim Schutt <jaschut@sandia.gov>
Tested-by: Jim Schutt <jaschut@sandia.gov>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
de74f1cc3b
commit
c67fe3752a
@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
|
||||
extern int fragmentation_index(struct zone *zone, unsigned int order);
|
||||
extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
||||
int order, gfp_t gfp_mask, nodemask_t *mask,
|
||||
bool sync);
|
||||
bool sync, bool *contended);
|
||||
extern int compact_pgdat(pg_data_t *pgdat, int order);
|
||||
extern unsigned long compaction_suitable(struct zone *zone, int order);
|
||||
|
||||
@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order)
|
||||
#else
|
||||
static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
||||
int order, gfp_t gfp_mask, nodemask_t *nodemask,
|
||||
bool sync)
|
||||
bool sync, bool *contended)
|
||||
{
|
||||
return COMPACT_CONTINUE;
|
||||
}
|
||||
|
100
mm/compaction.c
100
mm/compaction.c
@ -50,6 +50,47 @@ static inline bool migrate_async_suitable(int migratetype)
|
||||
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compaction requires the taking of some coarse locks that are potentially
|
||||
* very heavily contended. Check if the process needs to be scheduled or
|
||||
* if the lock is contended. For async compaction, back out in the event
|
||||
* if contention is severe. For sync compaction, schedule.
|
||||
*
|
||||
* Returns true if the lock is held.
|
||||
* Returns false if the lock is released and compaction should abort
|
||||
*/
|
||||
static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
|
||||
bool locked, struct compact_control *cc)
|
||||
{
|
||||
if (need_resched() || spin_is_contended(lock)) {
|
||||
if (locked) {
|
||||
spin_unlock_irqrestore(lock, *flags);
|
||||
locked = false;
|
||||
}
|
||||
|
||||
/* async aborts if taking too long or contended */
|
||||
if (!cc->sync) {
|
||||
if (cc->contended)
|
||||
*cc->contended = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
cond_resched();
|
||||
if (fatal_signal_pending(current))
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!locked)
|
||||
spin_lock_irqsave(lock, *flags);
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool compact_trylock_irqsave(spinlock_t *lock,
|
||||
unsigned long *flags, struct compact_control *cc)
|
||||
{
|
||||
return compact_checklock_irqsave(lock, flags, false, cc);
|
||||
}
|
||||
|
||||
/*
|
||||
* Isolate free pages onto a private freelist. Caller must hold zone->lock.
|
||||
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free
|
||||
@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
|
||||
}
|
||||
|
||||
/* Update the number of anon and file isolated pages in the zone */
|
||||
static void acct_isolated(struct zone *zone, struct compact_control *cc)
|
||||
static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned int count[2] = { 0, };
|
||||
@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
|
||||
list_for_each_entry(page, &cc->migratepages, lru)
|
||||
count[!!page_is_file_cache(page)]++;
|
||||
|
||||
__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
|
||||
__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
|
||||
/* If locked we can use the interrupt unsafe versions */
|
||||
if (locked) {
|
||||
__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
|
||||
__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
|
||||
} else {
|
||||
mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
|
||||
mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
|
||||
}
|
||||
}
|
||||
|
||||
/* Similar to reclaim, but different enough that they don't share logic */
|
||||
@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
|
||||
struct list_head *migratelist = &cc->migratepages;
|
||||
isolate_mode_t mode = 0;
|
||||
struct lruvec *lruvec;
|
||||
unsigned long flags;
|
||||
bool locked;
|
||||
|
||||
/*
|
||||
* Ensure that there are not too many pages isolated from the LRU
|
||||
@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
|
||||
|
||||
/* Time to isolate some pages for migration */
|
||||
cond_resched();
|
||||
spin_lock_irq(&zone->lru_lock);
|
||||
spin_lock_irqsave(&zone->lru_lock, flags);
|
||||
locked = true;
|
||||
for (; low_pfn < end_pfn; low_pfn++) {
|
||||
struct page *page;
|
||||
bool locked = true;
|
||||
|
||||
/* give a chance to irqs before checking need_resched() */
|
||||
if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
||||
locked = false;
|
||||
}
|
||||
if (need_resched() || spin_is_contended(&zone->lru_lock)) {
|
||||
if (locked)
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
cond_resched();
|
||||
spin_lock_irq(&zone->lru_lock);
|
||||
if (fatal_signal_pending(current))
|
||||
break;
|
||||
} else if (!locked)
|
||||
spin_lock_irq(&zone->lru_lock);
|
||||
|
||||
/* Check if it is ok to still hold the lock */
|
||||
locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
|
||||
locked, cc);
|
||||
if (!locked)
|
||||
break;
|
||||
|
||||
/*
|
||||
* migrate_pfn does not necessarily start aligned to a
|
||||
@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
|
||||
}
|
||||
}
|
||||
|
||||
acct_isolated(zone, cc);
|
||||
acct_isolated(zone, locked, cc);
|
||||
|
||||
spin_unlock_irq(&zone->lru_lock);
|
||||
if (locked)
|
||||
spin_unlock_irqrestore(&zone->lru_lock, flags);
|
||||
|
||||
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
|
||||
|
||||
@ -461,7 +508,16 @@ static void isolate_freepages(struct zone *zone,
|
||||
* are disabled
|
||||
*/
|
||||
isolated = 0;
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
|
||||
/*
|
||||
* The zone lock must be held to isolate freepages. This
|
||||
* unfortunately this is a very coarse lock and can be
|
||||
* heavily contended if there are parallel allocations
|
||||
* or parallel compactions. For async compaction do not
|
||||
* spin on the lock
|
||||
*/
|
||||
if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
|
||||
break;
|
||||
if (suitable_migration_target(page)) {
|
||||
end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
|
||||
isolated = isolate_freepages_block(pfn, end_pfn,
|
||||
@ -773,7 +829,7 @@ out:
|
||||
|
||||
static unsigned long compact_zone_order(struct zone *zone,
|
||||
int order, gfp_t gfp_mask,
|
||||
bool sync)
|
||||
bool sync, bool *contended)
|
||||
{
|
||||
struct compact_control cc = {
|
||||
.nr_freepages = 0,
|
||||
@ -782,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
|
||||
.migratetype = allocflags_to_migratetype(gfp_mask),
|
||||
.zone = zone,
|
||||
.sync = sync,
|
||||
.contended = contended,
|
||||
};
|
||||
INIT_LIST_HEAD(&cc.freepages);
|
||||
INIT_LIST_HEAD(&cc.migratepages);
|
||||
@ -803,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
|
||||
*/
|
||||
unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
||||
int order, gfp_t gfp_mask, nodemask_t *nodemask,
|
||||
bool sync)
|
||||
bool sync, bool *contended)
|
||||
{
|
||||
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
|
||||
int may_enter_fs = gfp_mask & __GFP_FS;
|
||||
@ -827,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
||||
nodemask) {
|
||||
int status;
|
||||
|
||||
status = compact_zone_order(zone, order, gfp_mask, sync);
|
||||
status = compact_zone_order(zone, order, gfp_mask, sync,
|
||||
contended);
|
||||
rc = max(status, rc);
|
||||
|
||||
/* If a normal allocation would succeed, stop compacting */
|
||||
|
@ -130,6 +130,7 @@ struct compact_control {
|
||||
int order; /* order a direct compactor needs */
|
||||
int migratetype; /* MOVABLE, RECLAIMABLE etc */
|
||||
struct zone *zone;
|
||||
bool *contended; /* True if a lock was contended */
|
||||
};
|
||||
|
||||
unsigned long
|
||||
|
@ -2102,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
struct zonelist *zonelist, enum zone_type high_zoneidx,
|
||||
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
|
||||
int migratetype, bool sync_migration,
|
||||
bool *deferred_compaction,
|
||||
bool *contended_compaction, bool *deferred_compaction,
|
||||
unsigned long *did_some_progress)
|
||||
{
|
||||
struct page *page;
|
||||
@ -2117,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
|
||||
current->flags |= PF_MEMALLOC;
|
||||
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
|
||||
nodemask, sync_migration);
|
||||
nodemask, sync_migration,
|
||||
contended_compaction);
|
||||
current->flags &= ~PF_MEMALLOC;
|
||||
if (*did_some_progress != COMPACT_SKIPPED) {
|
||||
|
||||
@ -2163,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
struct zonelist *zonelist, enum zone_type high_zoneidx,
|
||||
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
|
||||
int migratetype, bool sync_migration,
|
||||
bool *deferred_compaction,
|
||||
bool *contended_compaction, bool *deferred_compaction,
|
||||
unsigned long *did_some_progress)
|
||||
{
|
||||
return NULL;
|
||||
@ -2336,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||
unsigned long did_some_progress;
|
||||
bool sync_migration = false;
|
||||
bool deferred_compaction = false;
|
||||
bool contended_compaction = false;
|
||||
|
||||
/*
|
||||
* In the slowpath, we sanity check order to avoid ever trying to
|
||||
@ -2425,6 +2427,7 @@ rebalance:
|
||||
nodemask,
|
||||
alloc_flags, preferred_zone,
|
||||
migratetype, sync_migration,
|
||||
&contended_compaction,
|
||||
&deferred_compaction,
|
||||
&did_some_progress);
|
||||
if (page)
|
||||
@ -2434,10 +2437,11 @@ rebalance:
|
||||
/*
|
||||
* If compaction is deferred for high-order allocations, it is because
|
||||
* sync compaction recently failed. In this is the case and the caller
|
||||
* has requested the system not be heavily disrupted, fail the
|
||||
* allocation now instead of entering direct reclaim
|
||||
* requested a movable allocation that does not heavily disrupt the
|
||||
* system then fail the allocation instead of entering direct reclaim.
|
||||
*/
|
||||
if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
|
||||
if ((deferred_compaction || contended_compaction) &&
|
||||
(gfp_mask & __GFP_NO_KSWAPD))
|
||||
goto nopage;
|
||||
|
||||
/* Try direct reclaim and then allocating */
|
||||
@ -2508,6 +2512,7 @@ rebalance:
|
||||
nodemask,
|
||||
alloc_flags, preferred_zone,
|
||||
migratetype, sync_migration,
|
||||
&contended_compaction,
|
||||
&deferred_compaction,
|
||||
&did_some_progress);
|
||||
if (page)
|
||||
|
Loading…
Reference in New Issue
Block a user