From 9b209e557d698f24317d2619d8628b98b6728e4f Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 26 May 2022 19:15:30 -0400 Subject: [PATCH 01/25] mm: page-isolation: skip isolated pageblock in start_isolate_page_range() start_isolate_page_range() first isolates the first and the last pageblocks in the range and ensure pages across range boundaries are split during isolation. But it missed the case when the range is <= a pageblock and the first and the last pageblocks are the same one, so the second isolate_single_pageblock() will always fail. To fix it, skip the pageblock isolation in second isolate_single_pageblock(). Link: https://lkml.kernel.org/r/20220526231531.2404977-1-zi.yan@sent.com Fixes: 88ee134320b8 ("mm: fix a potential infinite loop in start_isolate_page_range()") Signed-off-by: Zi Yan Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Link: https://lore.kernel.org/linux-mm/ac65adc0-a7e4-cdfe-a0d8-757195b86293@samsung.com/ Reported-by: Michael Walle Tested-by: Michael Walle Link: https://lore.kernel.org/linux-mm/8ca048ca8b547e0dd1c95387ee05c23d@walle.cc/ Cc: Christophe Leroy Cc: David Hildenbrand Cc: Doug Berger Cc: Eric Ren Cc: Mel Gorman Cc: Mike Rapoport Cc: Oscar Salvador Cc: Qian Cai Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_isolation.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c643c8420809..fbd820b21292 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -300,7 +300,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. */ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before) + gfp_t gfp_flags, bool isolate_before, bool skip_isolation) { unsigned char saved_mt; unsigned long start_pfn; @@ -327,11 +327,16 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); - ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, - isolate_pageblock, isolate_pageblock + pageblock_nr_pages); - if (ret) - return ret; + if (skip_isolation) + VM_BUG_ON(!is_migrate_isolate(saved_mt)); + else { + ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, + isolate_pageblock, isolate_pageblock + pageblock_nr_pages); + + if (ret) + return ret; + } /* * Bail out early when the to-be-isolated pageblock does not form @@ -463,7 +468,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, return 0; failed: /* restore the original migratetype */ - unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); + if (!skip_isolation) + unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); return -EBUSY; } @@ -522,14 +528,18 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); int ret; + bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false); + ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation); if (ret) return ret; + if (isolate_start == isolate_end - pageblock_nr_pages) + skip_isolation = true; + /* isolate [isolate_end - 
pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true); + ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); return ret; From 86d28b0709279ccc636ef9ba267b7f3bcef79a4b Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 26 May 2022 19:15:31 -0400 Subject: [PATCH 02/25] mm: split free page with properly free memory accounting and without race In isolate_single_pageblock(), free pages are checked without holding zone lock, but they can go away in split_free_page() when zone lock is held. Check the free page and its order again in split_free_page() when zone lock is held. Recheck the page if the free page is gone under zone lock. In addition, in split_free_page(), the free page was deleted from the page list without changing free page accounting. Add the missing free page accounting code. Fix the type of order parameter in split_free_page(). Link: https://lore.kernel.org/lkml/20220525103621.987185e2ca0079f7b97b856d@linux-foundation.org/ Link: https://lkml.kernel.org/r/20220526231531.2404977-2-zi.yan@sent.com Fixes: b2c9e2fbba32 ("mm: make alloc_contig_range work at pageblock granularity") Signed-off-by: Zi Yan Reported-by: Doug Berger Link: https://lore.kernel.org/linux-mm/c3932a6f-77fe-29f7-0c29-fe6b1c67ab7b@gmail.com/ Cc: David Hildenbrand Cc: Qian Cai Cc: Vlastimil Babka Cc: Mel Gorman Cc: Eric Ren Cc: Mike Rapoport Cc: Oscar Salvador Cc: Christophe Leroy Cc: Marek Szyprowski Cc: Michael Walle Signed-off-by: Andrew Morton --- mm/internal.h | 4 ++-- mm/page_alloc.c | 24 ++++++++++++++++++++---- mm/page_isolation.c | 10 +++++++--- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 64e61b032dac..c0f8fbe0445b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -374,8 +374,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); -void split_free_page(struct page *free_page, - int order, unsigned long split_pfn_offset); +int split_free_page(struct page *free_page, + unsigned int order, unsigned long split_pfn_offset); #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bc93a82e51e6..6f6e4649ac21 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1100,30 +1100,44 @@ done_merging: * @order: the order of the page * @split_pfn_offset: split offset within the page * + * Return -ENOENT if the free page is changed, otherwise 0 + * * It is used when the free page crosses two pageblocks with different migratetypes * at split_pfn_offset within the page. The split free page will be put into * separate migratetype lists afterwards. Otherwise, the function achieves * nothing. 
 */
-void split_free_page(struct page *free_page,
-               int order, unsigned long split_pfn_offset)
+int split_free_page(struct page *free_page,
+               unsigned int order, unsigned long split_pfn_offset)
 {
        struct zone *zone = page_zone(free_page);
        unsigned long free_page_pfn = page_to_pfn(free_page);
        unsigned long pfn;
        unsigned long flags;
        int free_page_order;
+       int mt;
+       int ret = 0;
        if (split_pfn_offset == 0)
-               return;
+               return ret;
        spin_lock_irqsave(&zone->lock, flags);
+
+       if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       mt = get_pageblock_migratetype(free_page);
+       if (likely(!is_migrate_isolate(mt)))
+               __mod_zone_freepage_state(zone, -(1UL << order), mt);
+
        del_page_from_free_list(free_page, zone, order);
        for (pfn = free_page_pfn;
             pfn < free_page_pfn + (1UL << order);) {
                int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
-               free_page_order = min_t(int,
+               free_page_order = min_t(unsigned int,
                                        pfn ? __ffs(pfn) : order,
                                        __fls(split_pfn_offset));
                __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
@@ -1134,7 +1148,9 @@ void split_free_page(struct page *free_page,
                if (split_pfn_offset == 0)
                        split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
        }
+out:
        spin_unlock_irqrestore(&zone->lock, flags);
+       return ret;
 }
 /*
  * A bad page could be due to a number of fields. Instead of multiple branches,

diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index fbd820b21292..6021f8444b5a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -371,9 +371,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
                if (PageBuddy(page)) {
                        int order = buddy_order(page);
-                       if (pfn + (1UL << order) > boundary_pfn)
-                               split_free_page(page, order, boundary_pfn - pfn);
-                       pfn += (1UL << order);
+                       if (pfn + (1UL << order) > boundary_pfn) {
+                               /* free page changed before split, check it again */
+                               if (split_free_page(page, order, boundary_pfn - pfn))
+                                       continue;
+                       }
+
+                       pfn += 1UL << order;
                        continue;
                }
                /*

From 4c6bdb36408fbe4697eb9f1d95a3ddc1d2348448 Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Fri, 29 Apr 2022 14:40:43 +0800
Subject: [PATCH 03/25] mm/z3fold: fix scheduling while atomic

Patch series "A few fixup patches for z3fold".

This series contains a few fixup patches to fix scheduling while atomic,
fix possible null pointer dereferencing, fix various race conditions and
so on.  More details can be found in the respective changelogs.

This patch (of 9):

z3fold's page_lock is always held when calling alloc_slots.  So gfp should
be GFP_ATOMIC to avoid a "scheduling while atomic" bug.

Link: https://lkml.kernel.org/r/20220429064051.61552-1-linmiaohe@huawei.com
Link: https://lkml.kernel.org/r/20220429064051.61552-2-linmiaohe@huawei.com
Fixes: fc5488651c7d ("z3fold: simplify freeing slots")
Signed-off-by: Miaohe Lin
Reviewed-by: Vitaly Wool
Signed-off-by: Andrew Morton
---
 mm/z3fold.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/z3fold.c b/mm/z3fold.c
index 83b5a3514427..c2260f5a5885 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -941,8 +941,7 @@ lookup:
        }
        if (zhdr && !zhdr->slots)
-               zhdr->slots = alloc_slots(pool,
-                               can_sleep ? GFP_NOIO : GFP_ATOMIC);
+               zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
        return zhdr;
 }

From 7c61c35bbd6a888416e5a6de8ff8782a70d013d4 Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Fri, 29 Apr 2022 14:40:43 +0800
Subject: [PATCH 04/25] mm/z3fold: fix possible null pointer dereferencing

alloc_slots could fail to allocate memory under heavy memory pressure.
So we should check zhdr->slots against NULL to avoid future null pointer dereferencing. Link: https://lkml.kernel.org/r/20220429064051.61552-3-linmiaohe@huawei.com Fixes: fc5488651c7d ("z3fold: simplify freeing slots") Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index c2260f5a5885..5d8c21f2bc59 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -940,9 +940,19 @@ lookup: } } - if (zhdr && !zhdr->slots) + if (zhdr && !zhdr->slots) { zhdr->slots = alloc_slots(pool, GFP_ATOMIC); + if (!zhdr->slots) + goto out_fail; + } return zhdr; + +out_fail: + if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + add_to_unbuddied(pool, zhdr); + z3fold_page_unlock(zhdr); + } + return NULL; } /* From df6f0f1d0cf091947bb621cfdada4c82c1f05c4b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Apr 2022 14:40:43 +0800 Subject: [PATCH 05/25] mm/z3fold: remove buggy use of stale list for allocation Currently if z3fold couldn't find an unbuddied page it would first try to pull a page off the stale list. But this approach is problematic. If init z3fold page fails later, the page should be freed via free_z3fold_page to clean up the relevant resource instead of using __free_page directly. And if page is successfully reused, it will BUG_ON later in __SetPageMovable because it's already non-lru movable page, i.e. PAGE_MAPPING_MOVABLE is already set in page->mapping. In order to fix all of these issues, we can simply remove the buggy use of stale list for allocation because can_sleep should always be false and we never really hit the reusing code path now. Link: https://lkml.kernel.org/r/20220429064051.61552-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 5d8c21f2bc59..4e6814c5694f 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1102,28 +1102,7 @@ retry: bud = FIRST; } - page = NULL; - if (can_sleep) { - spin_lock(&pool->stale_lock); - zhdr = list_first_entry_or_null(&pool->stale, - struct z3fold_header, buddy); - /* - * Before allocating a page, let's see if we can take one from - * the stale pages list. cancel_work_sync() can sleep so we - * limit this case to the contexts where we can sleep - */ - if (zhdr) { - list_del(&zhdr->buddy); - spin_unlock(&pool->stale_lock); - cancel_work_sync(&zhdr->work); - page = virt_to_page(zhdr); - } else { - spin_unlock(&pool->stale_lock); - } - } - if (!page) - page = alloc_page(gfp); - + page = alloc_page(gfp); if (!page) return -ENOMEM; From 2c0f351434785626beb5cb49962b4e873459fd38 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Apr 2022 14:40:43 +0800 Subject: [PATCH 06/25] mm/z3fold: throw warning on failure of trylock_page in z3fold_alloc If trylock_page fails, the page won't be non-lru movable page. When this page is freed via free_z3fold_page, it will trigger bug on PageMovable check in __ClearPageMovable. Throw warning on failure of trylock_page to guard against such rare case just as what zsmalloc does. 
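The invariant being asserted is simple: a page that was just allocated is
owned exclusively by the allocating thread, so a trylock on it must succeed,
and if it ever fails the code should complain loudly rather than silently
skip the movable setup. A minimal userspace sketch of that pattern follows;
the object type and helper names are invented for illustration and pthreads
stands in for the page lock, so this is not z3fold code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Invented stand-in for a freshly allocated page. */
struct obj {
        pthread_mutex_t lock;
        int movable;
};

static struct obj *obj_alloc(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        if (o)
                pthread_mutex_init(&o->lock, NULL);
        return o;
}

int main(void)
{
        struct obj *o = obj_alloc();

        if (!o)
                return 1;
        /*
         * Nobody else can see 'o' yet, so this trylock cannot fail.
         * Warn if it does (analogous to WARN_ON(!trylock_page(page)))
         * instead of silently skipping the setup step.
         */
        if (pthread_mutex_trylock(&o->lock) != 0) {
                fprintf(stderr, "WARN: trylock failed on a private object\n");
        } else {
                o->movable = 1;        /* analogous to __SetPageMovable() */
                pthread_mutex_unlock(&o->lock);
        }
        printf("movable=%d\n", o->movable);
        pthread_mutex_destroy(&o->lock);
        free(o);
        return 0;
}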
Link: https://lkml.kernel.org/r/20220429064051.61552-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 4e6814c5694f..b3b4e65c107f 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1122,10 +1122,9 @@ retry: __SetPageMovable(page, pool->inode->i_mapping); unlock_page(page); } else { - if (trylock_page(page)) { - __SetPageMovable(page, pool->inode->i_mapping); - unlock_page(page); - } + WARN_ON(!trylock_page(page)); + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); } z3fold_page_lock(zhdr); From f4bad643c1d602f7154cf0d8eeb509136a55dccb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Apr 2022 14:40:43 +0800 Subject: [PATCH 07/25] revert "mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc" Revert commit f1549cb5ab2b ("mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc"). z3fold can't support GFP_HIGHMEM page now. page_address is used directly at all places. Moreover, z3fold_header is on per cpu unbuddied list which could be accessed anytime. So we should remove the support of GFP_HIGHMEM allocation for z3fold. Link: https://lkml.kernel.org/r/20220429064051.61552-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index b3b4e65c107f..5f5d5f1556be 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -212,10 +212,8 @@ static int size_to_chunks(size_t size) static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, gfp_t gfp) { - struct z3fold_buddy_slots *slots; - - slots = kmem_cache_zalloc(pool->c_handle, - (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); + struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle, + gfp); if (slots) { /* It will be freed separately in free_handle(). */ @@ -1075,7 +1073,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, enum buddy bud; bool can_sleep = gfpflags_allow_blocking(gfp); - if (!size) + if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE) From 6cf9a34967ed544ca4c0949e9928dc78fca57ef3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Apr 2022 14:40:43 +0800 Subject: [PATCH 08/25] mm/z3fold: put z3fold page back into unbuddied list when reclaim or migration fails When doing z3fold page reclaim or migration, the page is removed from unbuddied list. If reclaim or migration succeeds, it's fine as page is released. But in case it fails, the page is not put back into unbuddied list now. The page will be leaked until next compaction work, reclaim or migration is done. 
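The underlying rule is a general one: an object taken off its lookup list
for an operation that can fail must be put back on the failure path, or it
stays unreachable until some other path stumbles on it. A small
self-contained C sketch of that rule; the list and the reclaim stub are
invented for illustration and carry nothing z3fold-specific.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Invented node standing in for a header on the unbuddied list. */
struct node {
        struct node *next;
        int id;
};

static struct node *head;

static struct node *list_pop(void)
{
        struct node *n = head;

        if (n)
                head = n->next;
        return n;
}

static void list_push(struct node *n)
{
        n->next = head;
        head = n;
}

/* Stand-in for reclaim/migration that may fail. */
static bool try_reclaim(struct node *n)
{
        return n->id % 2 == 0;        /* pretend odd ids are busy */
}

int main(void)
{
        struct node *n;
        int remaining = 0;

        for (int i = 0; i < 4; i++) {
                n = calloc(1, sizeof(*n));
                n->id = i;
                list_push(n);
        }

        n = list_pop();                /* removed from the lookup list */
        if (try_reclaim(n)) {
                free(n);               /* success: the object goes away */
        } else {
                /*
                 * Failure path: put the object back where future lookups
                 * can find it; forgetting this is the leak the patch fixes.
                 */
                list_push(n);
        }

        for (struct node *p = head; p; p = p->next)
                remaining++;
        printf("nodes still findable: %d\n", remaining);

        while ((n = list_pop()))
                free(n);
        return 0;
}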
Link: https://lkml.kernel.org/r/20220429064051.61552-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/z3fold.c b/mm/z3fold.c index 5f5d5f1556be..a1c150fc8def 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1422,6 +1422,8 @@ next: spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); + if (list_empty(&zhdr->buddy)) + add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); clear_bit(PAGE_CLAIMED, &page->private); } @@ -1638,6 +1640,8 @@ static void z3fold_page_putback(struct page *page) spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); + if (list_empty(&zhdr->buddy)) + add_to_unbuddied(pool, zhdr); clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } From 4a1c3839108afcfec02f4d62d6862b2451b442ab Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Apr 2022 14:40:43 +0800 Subject: [PATCH 09/25] mm/z3fold: always clear PAGE_CLAIMED under z3fold page lock Think about the below race window: CPU1 CPU2 z3fold_reclaim_page z3fold_free test_and_set_bit PAGE_CLAIMED failed to reclaim page z3fold_page_lock(zhdr); add back to the lru list; z3fold_page_unlock(zhdr); get_z3fold_header page_claimed=test_and_set_bit PAGE_CLAIMED clear_bit(PAGE_CLAIMED, &page->private); if (!page_claimed) /* it's false true */ free_handle is not called free_handle won't be called in this case. So z3fold_buddy_slots will leak. Fix it by always clear PAGE_CLAIMED under z3fold page lock. Link: https://lkml.kernel.org/r/20220429064051.61552-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index a1c150fc8def..4a3cd2ff15b0 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1221,8 +1221,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) return; } if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { - put_z3fold_header(zhdr); clear_bit(PAGE_CLAIMED, &page->private); + put_z3fold_header(zhdr); return; } if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { @@ -1424,8 +1424,8 @@ next: spin_unlock(&pool->lock); if (list_empty(&zhdr->buddy)) add_to_unbuddied(pool, zhdr); - z3fold_page_unlock(zhdr); clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); } /* We started off locked to we need to lock the pool back */ @@ -1577,8 +1577,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa if (!z3fold_page_trylock(zhdr)) return -EAGAIN; if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { - z3fold_page_unlock(zhdr); clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); return -EBUSY; } if (work_pending(&zhdr->work)) { From 04094226d6ce8c0cb590891e13872109aa6722f1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Apr 2022 14:40:43 +0800 Subject: [PATCH 10/25] mm/z3fold: fix z3fold_reclaim_page races with z3fold_free Think about the below scenario: CPU1 CPU2 z3fold_reclaim_page z3fold_free spin_lock(&pool->lock) get_z3fold_header -- hold page_lock kref_get_unless_zero kref_put--zhdr->refcount can be 1 now !z3fold_page_trylock kref_put -- zhdr->refcount is 0 now release_z3fold_page WARN_ON(!list_empty(&zhdr->buddy)); -- we're on buddy now! spin_lock(&pool->lock); -- deadlock here! 
z3fold_reclaim_page might race with z3fold_free and will lead to pool lock deadlock and zhdr buddy non-empty warning. To fix this, defer getting the refcount until page_lock is held just like what __z3fold_alloc does. Note this has the side effect that we won't break the reclaim if we meet a soon to be released z3fold page now. Link: https://lkml.kernel.org/r/20220429064051.61552-9-linmiaohe@huawei.com Fixes: dcf5aedb24f8 ("z3fold: stricter locking and more careful reclaim") Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 4a3cd2ff15b0..a7769befd74e 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -519,13 +519,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) atomic64_dec(&pool->pages_nr); } -static void release_z3fold_page(struct kref *ref) -{ - struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, - refcount); - __release_z3fold_page(zhdr, false); -} - static void release_z3fold_page_locked(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, @@ -1317,12 +1310,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) break; } - if (kref_get_unless_zero(&zhdr->refcount) == 0) { - zhdr = NULL; - break; - } if (!z3fold_page_trylock(zhdr)) { - kref_put(&zhdr->refcount, release_z3fold_page); zhdr = NULL; continue; /* can't evict at this point */ } @@ -1333,14 +1321,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) */ if (zhdr->foreign_handles || test_and_set_bit(PAGE_CLAIMED, &page->private)) { - if (!kref_put(&zhdr->refcount, - release_z3fold_page_locked)) - z3fold_page_unlock(zhdr); + z3fold_page_unlock(zhdr); zhdr = NULL; continue; /* can't evict such page */ } list_del_init(&zhdr->buddy); zhdr->cpu = -1; + /* See comment in __z3fold_alloc. */ + kref_get(&zhdr->refcount); break; } From 943fb61dd66f475c25b1ef5dddb647070f2e89a1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Apr 2022 14:40:43 +0800 Subject: [PATCH 11/25] mm/z3fold: fix z3fold_page_migrate races with z3fold_map Think about the below scenario: CPU1 CPU2 z3fold_page_migrate z3fold_map z3fold_page_trylock ... z3fold_page_unlock /* slots still points to old zhdr*/ get_z3fold_header get slots from handle get old zhdr from slots z3fold_page_trylock return *old* zhdr encode_handle(new_zhdr, FIRST|LAST|MIDDLE) put_page(page) /* zhdr is freed! */ but zhdr is still used by caller! z3fold_map can map freed z3fold page and lead to use-after-free bug. To fix it, we add PAGE_MIGRATED to indicate z3fold page is migrated and soon to be released. So get_z3fold_header won't return such page. 
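The fix follows a common pattern: a pointer obtained from a racy lookup must
be re-validated after the lock is taken, and a flag set by the migration
side tells the lookup side to back off. A compact userspace sketch of that
pattern; the struct and function names are invented and pthreads stands in
for the z3fold page lock, so this only illustrates the shape of the fix.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Invented header standing in for a z3fold page header. */
struct hdr {
        pthread_mutex_t lock;
        bool migrated;        /* plays the role of the new PAGE_MIGRATED bit */
};

/*
 * Resolve a racy reference to a header, refusing to hand out one whose
 * contents were already copied to a new page.  The key point is that the
 * 'migrated' flag is only trusted after the lock is held.
 */
static struct hdr *get_header(struct hdr *candidate)
{
        if (pthread_mutex_trylock(&candidate->lock) != 0)
                return NULL;        /* contended: caller retries the lookup */

        if (candidate->migrated) {
                /* Contents live elsewhere now; this copy is about to die. */
                pthread_mutex_unlock(&candidate->lock);
                return NULL;        /* caller re-derives the pointer */
        }
        return candidate;           /* locked and still the live copy */
}

int main(void)
{
        struct hdr h = { .lock = PTHREAD_MUTEX_INITIALIZER, .migrated = false };
        struct hdr *p;

        p = get_header(&h);
        printf("before migration: %s\n", p ? "usable" : "rejected");
        if (p)
                pthread_mutex_unlock(&p->lock);

        h.migrated = true;           /* pretend migration completed */
        p = get_header(&h);
        printf("after migration: %s\n", p ? "usable" : "rejected");
        return 0;
}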
Link: https://lkml.kernel.org/r/20220429064051.61552-10-linmiaohe@huawei.com Fixes: 1f862989b04a ("mm/z3fold.c: support page migration") Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index a7769befd74e..f41f8b0d9e9a 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -181,6 +181,7 @@ enum z3fold_page_flags { NEEDS_COMPACTING, PAGE_STALE, PAGE_CLAIMED, /* by either reclaim or free */ + PAGE_MIGRATED, /* page is migrated and soon to be released */ }; /* @@ -270,8 +271,13 @@ static inline struct z3fold_header *get_z3fold_header(unsigned long handle) zhdr = (struct z3fold_header *)(addr & PAGE_MASK); locked = z3fold_page_trylock(zhdr); read_unlock(&slots->lock); - if (locked) - break; + if (locked) { + struct page *page = virt_to_page(zhdr); + + if (!test_bit(PAGE_MIGRATED, &page->private)) + break; + z3fold_page_unlock(zhdr); + } cpu_relax(); } while (true); } else { @@ -389,6 +395,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); clear_bit(PAGE_CLAIMED, &page->private); + clear_bit(PAGE_MIGRATED, &page->private); if (headless) return zhdr; @@ -1576,7 +1583,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa new_zhdr = page_address(newpage); memcpy(new_zhdr, zhdr, PAGE_SIZE); newpage->private = page->private; - page->private = 0; + set_bit(PAGE_MIGRATED, &page->private); z3fold_page_unlock(zhdr); spin_lock_init(&new_zhdr->page_lock); INIT_WORK(&new_zhdr->work, compact_page_work); @@ -1606,7 +1613,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); - clear_bit(PAGE_CLAIMED, &page->private); + /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */ + page->private = 0; put_page(page); return 0; } From ff3b72a5d614702ec119f107bddd99cdad622b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Wed, 18 May 2022 18:18:55 +0200 Subject: [PATCH 12/25] selftests: memcg: fix compilation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "memcontrol selftests fixups", v2. Flushing the patches to make memcontrol selftests check the events behavior we had consensus about (test_memcg_low fails). (test_memcg_reclaim, test_memcg_swap_max fail for me now but it's present even before the refactoring.) The two bigger changes are: - adjustment of the protected values to make tests succeed with the given tolerance, - both test_memcg_low and test_memcg_min check protection of memory in populated cgroups (actually as per Documentation/admin-guide/cgroup-v2.rst memory.min should not apply to empty cgroups, which is not the case currently. Therefore I unified tests with the populated case in order to to bring more broken tests). This patch (of 5): This fixes mis-applied changes from commit 72b1e03aa725 ("cgroup: account for memory_localevents in test_memcg_oom_group_leaf_events()"). 
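Condensed into one predicate, the behaviour the fixed test checks is: with
the default hierarchical events the parent must have counted the child's
OOM kill, and with memory_localevents it must not have. A tiny sketch of
that check; the helper name is ours, not taken from the selftest.

#include <stdbool.h>
#include <stdio.h>

/*
 * Expected memory.events semantics: by default a child's oom_kill is also
 * counted in the parent; with memory_localevents the parent reports zero.
 */
static bool parent_oom_count_ok(bool has_localevents, long parent_oom_events)
{
        if (has_localevents)
                return parent_oom_events == 0;
        return parent_oom_events > 0;
}

int main(void)
{
        printf("default, counted:     %d\n", parent_oom_count_ok(false, 1));
        printf("default, not counted: %d\n", parent_oom_count_ok(false, 0));
        printf("localevents, zero:    %d\n", parent_oom_count_ok(true, 0));
        printf("localevents, counted: %d\n", parent_oom_count_ok(true, 1));
        return 0;
}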
Link: https://lkml.kernel.org/r/20220518161859.21565-1-mkoutny@suse.com Link: https://lkml.kernel.org/r/20220518161859.21565-2-mkoutny@suse.com Signed-off-by: Michal Koutný Reviewed-by: David Vernet Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Richard Palethorpe Cc: Shakeel Butt Signed-off-by: Andrew Morton --- .../selftests/cgroup/test_memcontrol.c | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 44a974ec472c..9b8213479b8b 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -1241,7 +1241,16 @@ static int test_memcg_oom_group_leaf_events(const char *root) if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0) goto cleanup; - if (cg_read_key_long(parent, "memory.events", "oom_kill ") <= 0) + parent_oom_events = cg_read_key_long( + parent, "memory.events", "oom_kill "); + /* + * If memory_localevents is not enabled (the default), the parent should + * count OOM events in its children groups. Otherwise, it should not + * have observed any events. + */ + if (has_localevents && parent_oom_events != 0) + goto cleanup; + else if (!has_localevents && parent_oom_events <= 0) goto cleanup; ret = KSFT_PASS; @@ -1349,20 +1358,14 @@ static int test_memcg_oom_group_score_events(const char *root) if (!cg_run(memcg, alloc_anon, (void *)MB(100))) goto cleanup; - parent_oom_events = cg_read_key_long( - parent, "memory.events", "oom_kill "); - /* - * If memory_localevents is not enabled (the default), the parent should - * count OOM events in its children groups. Otherwise, it should not - * have observed any events. - */ - if ((has_localevents && parent_oom_events == 0) || - parent_oom_events > 0) - ret = KSFT_PASS; + if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3) + goto cleanup; if (kill(safe_pid, SIGKILL)) goto cleanup; + ret = KSFT_PASS; + cleanup: if (memcg) cg_destroy(memcg); From 1d09069f5313f7c35655dc6d405896f748ddc53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Wed, 18 May 2022 18:18:56 +0200 Subject: [PATCH 13/25] selftests: memcg: expect no low events in unprotected sibling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is effectively a revert of commit cdc69458a5f3 ("cgroup: account for memory_recursiveprot in test_memcg_low()"). The case test_memcg_low will fail with memory_recursiveprot until resolved in reclaim code. However, this patch preserves the existing helpers and variables for later uses. Link: https://lkml.kernel.org/r/20220518161859.21565-3-mkoutny@suse.com Signed-off-by: Michal Koutný Reviewed-by: David Vernet Cc: Johannes Weiner Cc: Michal Hocko Cc: Richard Palethorpe Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 9b8213479b8b..7514bf7c0c3e 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -528,7 +528,7 @@ static int test_memcg_low(const char *root) } for (i = 0; i < ARRAY_SIZE(children); i++) { - int no_low_events_index = has_recursiveprot ? 
2 : 1; + int no_low_events_index = 1; oom = cg_read_key_long(children[i], "memory.events", "oom "); low = cg_read_key_long(children[i], "memory.events", "low "); From f10b6e9a8e6621f6db2acfbf722a6331f3afaa84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Wed, 18 May 2022 18:18:57 +0200 Subject: [PATCH 14/25] selftests: memcg: adjust expected reclaim values of protected cgroups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The numbers are not easy to derive in a closed form (certainly mere protections ratios do not apply), therefore use a simulation to obtain expected numbers. Link: https://lkml.kernel.org/r/20220518161859.21565-4-mkoutny@suse.com Signed-off-by: Michal Koutný Acked-by: Roman Gushchin Cc: David Vernet Cc: Johannes Weiner Cc: Michal Hocko Cc: Richard Palethorpe Cc: Shakeel Butt Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + .../selftests/cgroup/memcg_protection.m | 89 +++++++++++++++++++ .../selftests/cgroup/test_memcontrol.c | 29 +++--- 3 files changed, 107 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/cgroup/memcg_protection.m diff --git a/MAINTAINERS b/MAINTAINERS index d8c18f80bcf3..36efbe46f9eb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5029,6 +5029,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/memcontrol.c F: mm/swap_cgroup.c +F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_kmem.c F: tools/testing/selftests/cgroup/test_memcontrol.c diff --git a/tools/testing/selftests/cgroup/memcg_protection.m b/tools/testing/selftests/cgroup/memcg_protection.m new file mode 100644 index 000000000000..051daa3477b6 --- /dev/null +++ b/tools/testing/selftests/cgroup/memcg_protection.m @@ -0,0 +1,89 @@ +% SPDX-License-Identifier: GPL-2.0 +% +% run as: octave-cli memcg_protection.m +% +% This script simulates reclaim protection behavior on a single level of memcg +% hierarchy to illustrate how overcommitted protection spreads among siblings +% (as it depends also on their current consumption). +% +% Simulation assumes siblings consumed the initial amount of memory (w/out +% reclaim) and then the reclaim starts, all memory is reclaimable, i.e. treated +% same. It simulates only non-low reclaim and assumes all memory.min = 0. 
+% +% Input configurations +% -------------------- +% E number parent effective protection +% n vector nominal protection of siblings set at the given level (memory.low) +% c vector current consumption -,,- (memory.current) + +% example from testcase (values in GB) +E = 50 / 1024; +n = [75 25 0 500 ] / 1024; +c = [50 50 50 0] / 1024; + +% Reclaim parameters +% ------------------ + +% Minimal reclaim amount (GB) +cluster = 32*4 / 2**20; + +% Reclaim coefficient (think as 0.5^sc->priority) +alpha = .1 + +% Simulation parameters +% --------------------- +epsilon = 1e-7; +timeout = 1000; + +% Simulation loop +% --------------- + +ch = []; +eh = []; +rh = []; + +for t = 1:timeout + % low_usage + u = min(c, n); + siblings = sum(u); + + % effective_protection() + protected = min(n, c); % start with nominal + e = protected * min(1, E / siblings); % normalize overcommit + + % recursive protection + unclaimed = max(0, E - siblings); + parent_overuse = sum(c) - siblings; + if (unclaimed > 0 && parent_overuse > 0) + overuse = max(0, c - protected); + e += unclaimed * (overuse / parent_overuse); + endif + + % get_scan_count() + r = alpha * c; % assume all memory is in a single LRU list + + % commit 1bc63fb1272b ("mm, memcg: make scan aggression always exclude protection") + sz = max(e, c); + r .*= (1 - (e+epsilon) ./ (sz+epsilon)); + + % uncomment to debug prints + % e, c, r + + % nothing to reclaim, reached equilibrium + if max(r) < epsilon + break; + endif + + % SWAP_CLUSTER_MAX roundup + r = max(r, (r > epsilon) .* cluster); + % XXX here I do parallel reclaim of all siblings + % in reality reclaim is serialized and each sibling recalculates own residual + c = max(c - r, 0); + + ch = [ch ; c]; + eh = [eh ; e]; + rh = [rh ; r]; +endfor + +t +c, e diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 7514bf7c0c3e..955eee05d60e 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -248,7 +248,7 @@ static int cg_test_proc_killed(const char *cgroup) /* * First, this test creates the following hierarchy: * A memory.min = 50M, memory.max = 200M - * A/B memory.min = 50M, memory.current = 50M + * A/B memory.min = 50M * A/B/C memory.min = 75M, memory.current = 50M * A/B/D memory.min = 25M, memory.current = 50M * A/B/E memory.min = 0, memory.current = 50M @@ -259,10 +259,13 @@ static int cg_test_proc_killed(const char *cgroup) * Then it creates A/G and creates a significant * memory pressure in it. * + * Then it checks actual memory usages and expects that: * A/B memory.current ~= 50M - * A/B/C memory.current ~= 33M - * A/B/D memory.current ~= 17M - * A/B/F memory.current ~= 0 + * A/B/C memory.current ~= 29M + * A/B/D memory.current ~= 21M + * A/B/E memory.current ~= 0 + * A/B/F memory.current = 0 + * (for origin of the numbers, see model in memcg_protection.m.) 
* * After that it tries to allocate more than there is * unprotected memory in A available, and checks @@ -365,10 +368,10 @@ static int test_memcg_min(const char *root) for (i = 0; i < ARRAY_SIZE(children); i++) c[i] = cg_read_long(children[i], "memory.current"); - if (!values_close(c[0], MB(33), 10)) + if (!values_close(c[0], MB(29), 10)) goto cleanup; - if (!values_close(c[1], MB(17), 10)) + if (!values_close(c[1], MB(21), 10)) goto cleanup; if (c[3] != 0) @@ -405,7 +408,7 @@ cleanup: /* * First, this test creates the following hierarchy: * A memory.low = 50M, memory.max = 200M - * A/B memory.low = 50M, memory.current = 50M + * A/B memory.low = 50M * A/B/C memory.low = 75M, memory.current = 50M * A/B/D memory.low = 25M, memory.current = 50M * A/B/E memory.low = 0, memory.current = 50M @@ -417,9 +420,11 @@ cleanup: * * Then it checks actual memory usages and expects that: * A/B memory.current ~= 50M - * A/B/ memory.current ~= 33M - * A/B/D memory.current ~= 17M - * A/B/F memory.current ~= 0 + * A/B/C memory.current ~= 29M + * A/B/D memory.current ~= 21M + * A/B/E memory.current ~= 0 + * A/B/F memory.current = 0 + * (for origin of the numbers, see model in memcg_protection.m.) * * After that it tries to allocate more than there is * unprotected memory in A available, @@ -512,10 +517,10 @@ static int test_memcg_low(const char *root) for (i = 0; i < ARRAY_SIZE(children); i++) c[i] = cg_read_long(children[i], "memory.current"); - if (!values_close(c[0], MB(33), 10)) + if (!values_close(c[0], MB(29), 10)) goto cleanup; - if (!values_close(c[1], MB(17), 10)) + if (!values_close(c[1], MB(21), 10)) goto cleanup; if (c[3] != 0) From 6a35919005d4146aee14339a6cd52286465e5023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Wed, 18 May 2022 18:18:58 +0200 Subject: [PATCH 15/25] selftests: memcg: remove protection from top level memcg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reclaim is triggered by memory limit in a subtree, therefore the testcase does not need configured protection against external reclaim. Also, correct respective comments. Link: https://lkml.kernel.org/r/20220518161859.21565-5-mkoutny@suse.com Signed-off-by: Michal Koutný Acked-by: Roman Gushchin Cc: David Vernet Cc: Johannes Weiner Cc: Michal Hocko Cc: Richard Palethorpe Cc: Shakeel Butt Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 955eee05d60e..aa70999ace80 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -247,7 +247,7 @@ static int cg_test_proc_killed(const char *cgroup) /* * First, this test creates the following hierarchy: - * A memory.min = 50M, memory.max = 200M + * A memory.min = 0, memory.max = 200M * A/B memory.min = 50M * A/B/C memory.min = 75M, memory.current = 50M * A/B/D memory.min = 25M, memory.current = 50M @@ -257,7 +257,7 @@ static int cg_test_proc_killed(const char *cgroup) * Usages are pagecache, but the test keeps a running * process in every leaf cgroup. * Then it creates A/G and creates a significant - * memory pressure in it. + * memory pressure in A. 
* * Then it checks actual memory usages and expects that: * A/B memory.current ~= 50M @@ -338,8 +338,6 @@ static int test_memcg_min(const char *root) (void *)(long)fd); } - if (cg_write(parent[0], "memory.min", "50M")) - goto cleanup; if (cg_write(parent[1], "memory.min", "50M")) goto cleanup; if (cg_write(children[0], "memory.min", "75M")) @@ -407,7 +405,7 @@ cleanup: /* * First, this test creates the following hierarchy: - * A memory.low = 50M, memory.max = 200M + * A memory.low = 0, memory.max = 200M * A/B memory.low = 50M * A/B/C memory.low = 75M, memory.current = 50M * A/B/D memory.low = 25M, memory.current = 50M @@ -495,8 +493,6 @@ static int test_memcg_low(const char *root) goto cleanup; } - if (cg_write(parent[0], "memory.low", "50M")) - goto cleanup; if (cg_write(parent[1], "memory.low", "50M")) goto cleanup; if (cg_write(children[0], "memory.low", "75M")) From f079a020ba95b568329806aa13c62e6103cade3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Wed, 18 May 2022 18:18:59 +0200 Subject: [PATCH 16/25] selftests: memcg: factor out common parts of memory.{low,min} tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memory protection test setup and runtime is almost equal for memory.low and memory.min cases. It makes modification of the common parts prone to mistakes, since the protections are similar not only in setup but also in principle, factor the common part out. Past exceptions between the tests: - missing memory.min is fine (kept), - test_memcg_low protected orphaned pagecache (adapted like test_memcg_min and we keep the processes of protected memory running). The evaluation in two tests is different (OOM of allocator vs low events of protégés), this is kept different. Link: https://lkml.kernel.org/r/20220518161859.21565-6-mkoutny@suse.com Signed-off-by: Michal Koutný Acked-by: Roman Gushchin CC: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Richard Palethorpe Cc: David Vernet Signed-off-by: Andrew Morton --- .../selftests/cgroup/test_memcontrol.c | 207 ++++-------------- 1 file changed, 40 insertions(+), 167 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index aa70999ace80..8833359556f3 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -190,13 +190,6 @@ cleanup: return ret; } -static int alloc_pagecache_50M(const char *cgroup, void *arg) -{ - int fd = (long)arg; - - return alloc_pagecache(fd, MB(50)); -} - static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg) { int fd = (long)arg; @@ -254,7 +247,9 @@ static int cg_test_proc_killed(const char *cgroup) * A/B/E memory.min = 0, memory.current = 50M * A/B/F memory.min = 500M, memory.current = 0 * - * Usages are pagecache, but the test keeps a running + * (or memory.low if we test soft protection) + * + * Usages are pagecache and the test keeps a running * process in every leaf cgroup. * Then it creates A/G and creates a significant * memory pressure in A. @@ -268,15 +263,16 @@ static int cg_test_proc_killed(const char *cgroup) * (for origin of the numbers, see model in memcg_protection.m.) * * After that it tries to allocate more than there is - * unprotected memory in A available, and checks - * checks that memory.min protects pagecache even - * in this case. 
+ * unprotected memory in A available, and checks that: + * a) memory.min protects pagecache even in this case, + * b) memory.low allows reclaiming page cache with low events. */ -static int test_memcg_min(const char *root) +static int test_memcg_protection(const char *root, bool min) { - int ret = KSFT_FAIL; + int ret = KSFT_FAIL, rc; char *parent[3] = {NULL}; char *children[4] = {NULL}; + const char *attribute = min ? "memory.min" : "memory.low"; long c[4]; int i, attempts; int fd; @@ -300,8 +296,10 @@ static int test_memcg_min(const char *root) if (cg_create(parent[0])) goto cleanup; - if (cg_read_long(parent[0], "memory.min")) { - ret = KSFT_SKIP; + if (cg_read_long(parent[0], attribute)) { + /* No memory.min on older kernels is fine */ + if (min) + ret = KSFT_SKIP; goto cleanup; } @@ -338,15 +336,15 @@ static int test_memcg_min(const char *root) (void *)(long)fd); } - if (cg_write(parent[1], "memory.min", "50M")) + if (cg_write(parent[1], attribute, "50M")) goto cleanup; - if (cg_write(children[0], "memory.min", "75M")) + if (cg_write(children[0], attribute, "75M")) goto cleanup; - if (cg_write(children[1], "memory.min", "25M")) + if (cg_write(children[1], attribute, "25M")) goto cleanup; - if (cg_write(children[2], "memory.min", "0")) + if (cg_write(children[2], attribute, "0")) goto cleanup; - if (cg_write(children[3], "memory.min", "500M")) + if (cg_write(children[3], attribute, "500M")) goto cleanup; attempts = 0; @@ -375,161 +373,26 @@ static int test_memcg_min(const char *root) if (c[3] != 0) goto cleanup; - if (!cg_run(parent[2], alloc_anon, (void *)MB(170))) + rc = cg_run(parent[2], alloc_anon, (void *)MB(170)); + if (min && !rc) goto cleanup; - - if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) - goto cleanup; - - ret = KSFT_PASS; - -cleanup: - for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) { - if (!children[i]) - continue; - - cg_destroy(children[i]); - free(children[i]); - } - - for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) { - if (!parent[i]) - continue; - - cg_destroy(parent[i]); - free(parent[i]); - } - close(fd); - return ret; -} - -/* - * First, this test creates the following hierarchy: - * A memory.low = 0, memory.max = 200M - * A/B memory.low = 50M - * A/B/C memory.low = 75M, memory.current = 50M - * A/B/D memory.low = 25M, memory.current = 50M - * A/B/E memory.low = 0, memory.current = 50M - * A/B/F memory.low = 500M, memory.current = 0 - * - * Usages are pagecache. - * Then it creates A/G an creates a significant - * memory pressure in it. - * - * Then it checks actual memory usages and expects that: - * A/B memory.current ~= 50M - * A/B/C memory.current ~= 29M - * A/B/D memory.current ~= 21M - * A/B/E memory.current ~= 0 - * A/B/F memory.current = 0 - * (for origin of the numbers, see model in memcg_protection.m.) - * - * After that it tries to allocate more than there is - * unprotected memory in A available, - * and checks low and oom events in memory.events. 
- */ -static int test_memcg_low(const char *root) -{ - int ret = KSFT_FAIL; - char *parent[3] = {NULL}; - char *children[4] = {NULL}; - long low, oom; - long c[4]; - int i; - int fd; - - fd = get_temp_fd(); - if (fd < 0) - goto cleanup; - - parent[0] = cg_name(root, "memcg_test_0"); - if (!parent[0]) - goto cleanup; - - parent[1] = cg_name(parent[0], "memcg_test_1"); - if (!parent[1]) - goto cleanup; - - parent[2] = cg_name(parent[0], "memcg_test_2"); - if (!parent[2]) - goto cleanup; - - if (cg_create(parent[0])) - goto cleanup; - - if (cg_read_long(parent[0], "memory.low")) - goto cleanup; - - if (cg_write(parent[0], "cgroup.subtree_control", "+memory")) - goto cleanup; - - if (cg_write(parent[0], "memory.max", "200M")) - goto cleanup; - - if (cg_write(parent[0], "memory.swap.max", "0")) - goto cleanup; - - if (cg_create(parent[1])) - goto cleanup; - - if (cg_write(parent[1], "cgroup.subtree_control", "+memory")) - goto cleanup; - - if (cg_create(parent[2])) - goto cleanup; - - for (i = 0; i < ARRAY_SIZE(children); i++) { - children[i] = cg_name_indexed(parent[1], "child_memcg", i); - if (!children[i]) - goto cleanup; - - if (cg_create(children[i])) - goto cleanup; - - if (i > 2) - continue; - - if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd)) - goto cleanup; - } - - if (cg_write(parent[1], "memory.low", "50M")) - goto cleanup; - if (cg_write(children[0], "memory.low", "75M")) - goto cleanup; - if (cg_write(children[1], "memory.low", "25M")) - goto cleanup; - if (cg_write(children[2], "memory.low", "0")) - goto cleanup; - if (cg_write(children[3], "memory.low", "500M")) - goto cleanup; - - if (cg_run(parent[2], alloc_anon, (void *)MB(148))) - goto cleanup; - - if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) - goto cleanup; - - for (i = 0; i < ARRAY_SIZE(children); i++) - c[i] = cg_read_long(children[i], "memory.current"); - - if (!values_close(c[0], MB(29), 10)) - goto cleanup; - - if (!values_close(c[1], MB(21), 10)) - goto cleanup; - - if (c[3] != 0) - goto cleanup; - - if (cg_run(parent[2], alloc_anon, (void *)MB(166))) { + else if (!min && rc) { fprintf(stderr, "memory.low prevents from allocating anon memory\n"); goto cleanup; } + if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) + goto cleanup; + + if (min) { + ret = KSFT_PASS; + goto cleanup; + } + for (i = 0; i < ARRAY_SIZE(children); i++) { int no_low_events_index = 1; + long low, oom; oom = cg_read_key_long(children[i], "memory.events", "oom "); low = cg_read_key_long(children[i], "memory.events", "low "); @@ -565,6 +428,16 @@ cleanup: return ret; } +static int test_memcg_min(const char *root) +{ + return test_memcg_protection(root, true); +} + +static int test_memcg_low(const char *root) +{ + return test_memcg_protection(root, false); +} + static int alloc_pagecache_max_30M(const char *cgroup, void *arg) { size_t size = MB(50); From 9f186f9e5fa9ebdaef909fd45f825a6ce281f13c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 20:50:26 +0800 Subject: [PATCH 17/25] mm/swapfile: unuse_pte can map random data if swap read fails Patch series "A few fixup patches for mm", v4. This series contains a few patches to avoid mapping random data if swap read fails and fix lost swap bits in unuse_pte. Also we free hwpoison and swapin error entry in madvise_free_pte_range and so on. More details can be found in the respective changelogs. 
This patch (of 5): There is a bug in unuse_pte(): when swap page happens to be unreadable, page filled with random data is mapped into user address space. In case of error, a special swap entry indicating swap read fails is set to the page table. So the swapcache page can be freed and the user won't end up with a permanently mounted swap because a sector is bad. And if the page is accessed later, the user process will be killed so that corrupted data is never consumed. On the other hand, if the page is never accessed, the user won't even notice it. Link: https://lkml.kernel.org/r/20220519125030.21486-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220519125030.21486-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: David Howells Cc: NeilBrown Cc: Alistair Popple Cc: Suren Baghdasaryan Cc: Peter Xu Cc: Ralph Campbell Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 ++++++- include/linux/swapops.h | 10 ++++++++++ mm/memory.c | 5 ++++- mm/swapfile.c | 11 +++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index f3ae17b43f20..0c0fed1b348f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,6 +55,10 @@ static inline int current_is_kswapd(void) * actions on faults. */ +#define SWP_SWAPIN_ERROR_NUM 1 +#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ + SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ + SWP_PTE_MARKER_NUM) /* * PTE markers are used to persist information onto PTEs that are mapped with * file-backed memories. As its name "PTE" hints, it should only be applied to @@ -120,7 +124,8 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM) + SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ + SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM) /* * Magic header for a swap area. 
The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index fe220df499f1..f24775b41880 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -108,6 +108,16 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) return xa_mk_value(entry.val); } +static inline swp_entry_t make_swapin_error_entry(struct page *page) +{ + return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page)); +} + +static inline int is_swapin_error_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_SWAPIN_ERROR; +} + #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { diff --git a/mm/memory.c b/mm/memory.c index 2bf5bca39567..54d106e0c999 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1487,7 +1487,8 @@ again: /* Only drop the uffd-wp marker if explicitly requested */ if (!zap_drop_file_uffd_wp(details)) continue; - } else if (is_hwpoison_entry(entry)) { + } else if (is_hwpoison_entry(entry) || + is_swapin_error_entry(entry)) { if (!should_zap_cows(details)) continue; } else { @@ -3727,6 +3728,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; + } else if (is_swapin_error_entry(entry)) { + ret = VM_FAULT_SIGBUS; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { diff --git a/mm/swapfile.c b/mm/swapfile.c index a0eb690d9926..b86d1cc8d00b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1788,6 +1788,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } + if (unlikely(!PageUptodate(page))) { + pte_t pteval; + + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); + pteval = swp_entry_to_pte(make_swapin_error_entry(page)); + set_pte_at(vma->vm_mm, addr, pte, pteval); + swap_free(entry); + ret = 0; + goto out; + } + /* See do_swap_page() */ BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); BUG_ON(PageAnon(page) && PageAnonExclusive(page)); From 14a762dd1977cf811516fd97b0262b747cac88f7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 20:50:27 +0800 Subject: [PATCH 18/25] mm/swapfile: fix lost swap bits in unuse_pte() This is observed by code review only but not any real report. When we turn off swapping we could have lost the bits stored in the swap ptes. The new rmap-exclusive bit is fine since that turned into a page flag, but not for soft-dirty and uffd-wp. Add them. 
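The bug class is easy to state: a PTE has two encodings (swap and present),
each keeping auxiliary bits such as soft-dirty and uffd-wp in its own
format, so converting one encoding to the other has to copy those bits
explicitly. A toy sketch of that conversion follows; all bit positions are
invented for illustration and are not the real PTE layout.

#include <stdint.h>
#include <stdio.h>

/* Made-up flag layouts for the swap form and the present form of a PTE. */
#define SWP_SOFT_DIRTY  (1u << 0)
#define SWP_UFFD_WP     (1u << 1)

#define PTE_SOFT_DIRTY  (1u << 4)
#define PTE_UFFD_WP     (1u << 5)
#define PTE_PRESENT     (1u << 7)

static uint32_t mk_present_pte(uint32_t pfn_bits, uint32_t swap_pte)
{
        uint32_t new_pte = pfn_bits | PTE_PRESENT;

        /* Carry the per-PTE metadata over, as the patch now does. */
        if (swap_pte & SWP_SOFT_DIRTY)
                new_pte |= PTE_SOFT_DIRTY;
        if (swap_pte & SWP_UFFD_WP)
                new_pte |= PTE_UFFD_WP;
        return new_pte;
}

int main(void)
{
        uint32_t swap_pte = SWP_SOFT_DIRTY | SWP_UFFD_WP;
        uint32_t pte = mk_present_pte(0x1000u << 8, swap_pte);

        printf("present pte: %#x (soft-dirty=%d, uffd-wp=%d)\n", pte,
               !!(pte & PTE_SOFT_DIRTY), !!(pte & PTE_UFFD_WP));
        return 0;
}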
Link: https://lkml.kernel.org/r/20220519125030.21486-3-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin
Suggested-by: Peter Xu
Reviewed-by: David Hildenbrand
Cc: Alistair Popple
Cc: David Howells
Cc: Hugh Dickins
Cc: Matthew Wilcox (Oracle)
Cc: Naoya Horiguchi
Cc: NeilBrown
Cc: Ralph Campbell
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/swapfile.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index b86d1cc8d00b..e45874fb2ec7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1774,7 +1774,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 {
        struct page *swapcache;
        spinlock_t *ptl;
-       pte_t *pte;
+       pte_t *pte, new_pte;
        int ret = 1;
        swapcache = page;
@@ -1823,8 +1823,12 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                page_add_new_anon_rmap(page, vma, addr);
                lru_cache_add_inactive_or_unevictable(page, vma);
        }
-       set_pte_at(vma->vm_mm, addr, pte,
-                  pte_mkold(mk_pte(page, vma->vm_page_prot)));
+       new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
+       if (pte_swp_soft_dirty(*pte))
+               new_pte = pte_mksoft_dirty(new_pte);
+       if (pte_swp_uffd_wp(*pte))
+               new_pte = pte_mkuffd_wp(new_pte);
+       set_pte_at(vma->vm_mm, addr, pte, new_pte);
        swap_free(entry);
 out:
        pte_unmap_unlock(pte, ptl);

From 7b49514fa1dbe05813f099179ebe3d982f45e87e Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Thu, 19 May 2022 20:50:28 +0800
Subject: [PATCH 19/25] mm/madvise: free hwpoison and swapin error entry in madvise_free_pte_range

Once the MADV_FREE operation has succeeded, callers can expect they might
get zero-fill pages if accessing the memory again.  Therefore it should be
safe to delete the hwpoison entry and swapin error entry.  There is no
reason to kill the process if it has called MADV_FREE on the range.

Link: https://lkml.kernel.org/r/20220519125030.21486-4-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin
Suggested-by: Alistair Popple
Acked-by: David Hildenbrand
Reviewed-by: Naoya Horiguchi
Cc: David Howells
Cc: Hugh Dickins
Cc: Matthew Wilcox (Oracle)
Cc: NeilBrown
Cc: Peter Xu
Cc: Ralph Campbell
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/madvise.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 4d6592488b51..5f4537511532 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -624,11 +624,14 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                        swp_entry_t entry;
                        entry = pte_to_swp_entry(ptent);
-                       if (non_swap_entry(entry))
-                               continue;
-                       nr_swap--;
-                       free_swap_and_cache(entry);
-                       pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+                       if (!non_swap_entry(entry)) {
+                               nr_swap--;
+                               free_swap_and_cache(entry);
+                               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+                       } else if (is_hwpoison_entry(entry) ||
+                                  is_swapin_error_entry(entry)) {
+                               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+                       }
                        continue;
                }

From 6cec2b95dadf77cdc1256fae1c5dfd4a2b467e61 Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Thu, 19 May 2022 20:50:29 +0800
Subject: [PATCH 20/25] mm/shmem: fix infinite loop when swap in shmem error at swapoff time

When swapping in a shmem page fails at swapoff time, there would be an
infinite loop in the while loop in shmem_unuse_inode().  That is because
the swapin error is deliberately ignored now and thus info->swapped will
never reach 0, so we can't escape the loop in shmem_unuse().

In order to fix the issue, a swapin_error entry is stored in the mapping
when a swapin error occurs.
So the swapcache page can be freed and the user won't end up with a permanently mounted swap because a sector is bad. If the page is accessed later, the user process will be killed so that corrupted data is never consumed. On the other hand, if the page is never accessed, the user won't even notice it. Link: https://lkml.kernel.org/r/20220519125030.21486-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reported-by: Naoya Horiguchi Reviewed-by: Naoya Horiguchi Cc: Alistair Popple Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Ralph Campbell Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/shmem.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index b8169beff226..54149ce679be 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1174,6 +1174,10 @@ static int shmem_find_swap_entries(struct address_space *mapping, continue; entry = radix_to_swp_entry(folio); + /* + * swapin error entries can be found in the mapping. But they're + * deliberately ignored here as we've done everything we can do. + */ if (swp_type(entry) != type) continue; @@ -1671,6 +1675,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, return error; } +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, + struct folio *folio, swp_entry_t swap) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swapin_error; + void *old; + + swapin_error = make_swapin_error_entry(&folio->page); + old = xa_cmpxchg_irq(&mapping->i_pages, index, + swp_to_radix_entry(swap), + swp_to_radix_entry(swapin_error), 0); + if (old != swp_to_radix_entry(swap)) + return; + + folio_wait_writeback(folio); + delete_from_swap_cache(&folio->page); + spin_lock_irq(&info->lock); + /* + * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't + * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in + * shmem_evict_inode. + */ + info->alloced--; + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + swap_free(swap); +} + /* * Swap in the page pointed to by *pagep. * Caller has to make sure that *pagep contains a valid swapped page. @@ -1694,6 +1728,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = radix_to_swp_entry(*foliop); *foliop = NULL; + if (is_swapin_error_entry(swap)) + return -EIO; + /* Look it up and read it in.. */ page = lookup_swap_cache(swap, NULL, 0); if (!page) { @@ -1761,6 +1798,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; + if (error == -EIO) + shmem_set_folio_swapin_error(inode, index, folio, swap); unlock: if (folio) { folio_unlock(folio); From ba6851b45d2d5b07436d8fc43451bad354dc4884 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 20:50:30 +0800 Subject: [PATCH 21/25] mm: filter out swapin error entry in shmem mapping There might be swapin error entries in shmem mapping. Filter them out to avoid "Bad swap file entry" complaint. 
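The general shape of the fix: entries found in the mapping are tagged
values, and only some tags denote real swap locations, so the tag has to be
checked before the value is handed to the swap code. A toy sketch of that
filtering follows; the entry types and layout are invented for illustration
and are not the kernel's swp_entry_t encoding.

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model of entries found in a shmem mapping: most encode a swap
 * location, but some encode special markers (such as the new swapin error
 * entry) that must not be fed to the swap code.
 */
enum entry_type { TYPE_SWAPFILE_0, TYPE_SWAPFILE_1, TYPE_SWAPIN_ERROR };

struct entry {
        enum entry_type type;
        unsigned long offset;
};

/* Analogous to non_swap_entry(): anything past the real swapfiles is special. */
static bool entry_is_non_swap(struct entry e)
{
        return e.type >= TYPE_SWAPIN_ERROR;
}

int main(void)
{
        struct entry mapping[] = {
                { TYPE_SWAPFILE_0, 12 },
                { TYPE_SWAPIN_ERROR, 7 },      /* a failed swapin left this behind */
                { TYPE_SWAPFILE_1, 99 },
        };

        for (unsigned int i = 0; i < sizeof(mapping) / sizeof(mapping[0]); i++) {
                if (entry_is_non_swap(mapping[i])) {
                        /* Skip it instead of complaining "Bad swap file entry". */
                        continue;
                }
                printf("would read swap slot: type=%d offset=%lu\n",
                       mapping[i].type, mapping[i].offset);
        }
        return 0;
}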
Link: https://lkml.kernel.org/r/20220519125030.21486-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Naoya Horiguchi Cc: Alistair Popple Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Ralph Campbell Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/madvise.c | 5 ++++- mm/swap_state.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 5f4537511532..d7b4f2602949 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -248,10 +248,13 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, if (!xa_is_value(page)) continue; + swap = radix_to_swp_entry(page); + /* There might be swapin error entries in shmem mapping. */ + if (non_swap_entry(swap)) + continue; xas_pause(&xas); rcu_read_unlock(); - swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, NULL, 0, false, &splug); if (page) diff --git a/mm/swap_state.c b/mm/swap_state.c index b9e4ed2e90bf..778d57d2d92d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -410,6 +410,9 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) return NULL; swp = radix_to_swp_entry(page); + /* There might be swapin error entries in shmem mapping. */ + if (non_swap_entry(swp)) + return NULL; /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) From 1c563432588dbffa71e67ca6e37c826f9fa86e04 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 24 May 2022 10:15:25 -0700 Subject: [PATCH 22/25] mm: fix is_pinnable_page against a cma page Pages in the CMA area can have MIGRATE_ISOLATE as well as MIGRATE_CMA, so the current is_pinnable_page() can miss CMA pages whose migration type is MIGRATE_ISOLATE. Such pages end up pinned longterm by the pin_user_pages() API, so CMA allocations keep failing until the pin is released. The race looks like this: CPU 0 runs cma_alloc() -> alloc_contig_range(), while CPU 1 (task B) enters pin_user_pages_fast(FOLL_LONGTERM); CPU 0 changes the pageblock to MIGRATE_ISOLATE; CPU 1 continues through internal_get_user_pages_fast -> lockless_pages_from_mm -> gup_pte_range -> try_grab_folio, where is_pinnable_page() returns true, so the page is pinned successfully; CPU 0 then hits a page migration failure because of the pinned page, and the CMA allocation only succeeds 30 seconds later, after CPU 1 finally calls unpin_user_page(). The CMA allocation path protects against the migration type change race using zone->lock, but all the GUP path needs to know is whether the page is in the CMA area, not its exact migration type. Thus, we don't need zone->lock; it is enough to check whether the migration type is either MIGRATE_ISOLATE or MIGRATE_CMA. Adding the MIGRATE_ISOLATE check to is_pinnable_page() could cause pinning to be rejected for a page in a MIGRATE_ISOLATE pageblock even though it is in neither a CMA area nor a movable zone, if the page is temporarily unmovable. However, such a migration failure caused by an unexpected temporary refcount is a general issue that is not specific to MIGRATE_ISOLATE, and MIGRATE_ISOLATE is just as transient a state as those other temporarily elevated refcount problems. Link: https://lkml.kernel.org/r/20220524171525.976723-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: John Hubbard Acked-by: Paul E.
McKenney Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 +++++++-- mm/page_alloc.c | 8 ++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index de32c0383387..93a46ff33dc2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1594,8 +1594,13 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) { - return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) || - is_zero_pfn(page_to_pfn(page)); +#ifdef CONFIG_CMA + int mt = get_pageblock_migratetype(page); + + if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) + return false; +#endif + return !(is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page))); } #else static inline bool is_pinnable_page(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6f6e4649ac21..56b3a5d67325 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -482,8 +482,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page, bitidx = pfn_to_bitidx(page, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; + /* + * This races, without locks, with set_pfnblock_flags_mask(). Ensure + * a consistent read of the memory array, so that results, even though + * racy, are not corrupted. + */ + word = READ_ONCE(bitmap[word_bitidx]); return (word >> bitidx) & mask; } From fbf4df0699926cf620b2f722ddc213826e248962 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 25 May 2022 20:08:04 +0800 Subject: [PATCH 23/25] mm: kasan: fix input of vmalloc_to_page() When print virtual mapping info for vmalloc address, it should pass the addr not page, fix it. Link: https://lkml.kernel.org/r/20220525120804.38155-1-wangkefeng.wang@huawei.com Fixes: c056a364e954 ("kasan: print virtual mapping info in reports") Signed-off-by: Kefeng Wang Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/report.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 199d77cce21a..b341a191651d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -347,7 +347,7 @@ static void print_address_description(void *addr, u8 tag) va->addr, va->addr + va->size, va->caller); pr_err("\n"); - page = vmalloc_to_page(page); + page = vmalloc_to_page(addr); } } From 0710d0122abc93adcb9a70a78f1625c491f6ad91 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 25 May 2022 13:25:59 +0200 Subject: [PATCH 24/25] mm: Kconfig: reorganize misplaced mm options After commits 7b42f1041c98 ("mm: Kconfig: move swap and slab config options to the MM section") and 519bcb797907 ("mm: Kconfig: group swap, slab, hotplug and thp options into submenus") we now have nicely organized mm related config options. I have noticed some that were still misplaced, so this moves them from various places into the new structure: VM_EVENT_COUNTERS, COMPAT_BRK, MMAP_ALLOW_UNINITIALIZED to mm/Kconfig and general MM section. SLUB_STATS to mm/Kconfig and the slab submenu. DEBUG_SLAB, SLUB_DEBUG, SLUB_DEBUG_ON to mm/Kconfig.debug and the Kernel hacking / Memory Debugging submenu. 
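Returning briefly to the __get_pfnblock_flags_mask() hunk in [PATCH 22/25] above: the pageblock bitmap word can change concurrently while this reader holds no lock, so the load is annotated to obtain one single, untorn read. A generic sketch of that pattern with a hypothetical shared word and lock (not the kernel's pageblock bitmap code):

	#include <linux/compiler.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(flags_lock);	/* hypothetical, stands in for the writer's locking */
	static unsigned long flags_word;	/* hypothetical shared word */

	static void update_flags(unsigned long new)
	{
		spin_lock(&flags_lock);
		WRITE_ONCE(flags_word, new);	/* one plain store, no tearing */
		spin_unlock(&flags_lock);
	}

	static unsigned long read_flags(void)
	{
		/*
		 * Lockless reader: READ_ONCE() forces a single load, so the
		 * result may be stale, but it can never be a mix of old and
		 * new bits caused by compiler re-reads or torn accesses.
		 */
		return READ_ONCE(flags_word);
	}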
Link: https://lkml.kernel.org/r/20220525112559.1139-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- init/Kconfig | 53 -------------------------------------------- lib/Kconfig.debug | 34 ---------------------------- mm/Kconfig | 56 +++++++++++++++++++++++++++++++++++++++++++++++ mm/Kconfig.debug | 31 ++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 87 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 1d2ecd227f4b..0cff02738290 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1828,59 +1828,6 @@ config DEBUG_PERF_USE_VMALLOC endmenu -config VM_EVENT_COUNTERS - default y - bool "Enable VM event counters for /proc/vmstat" if EXPERT - help - VM event counters are needed for event counts to be shown. - This option allows the disabling of the VM event counters - on EXPERT systems. /proc/vmstat will only show page counts - if VM event counters are disabled. - -config SLUB_DEBUG - default y - bool "Enable SLUB debugging support" if EXPERT - depends on SLUB && SYSFS - help - SLUB has extensive debug support features. Disabling these can - result in significant savings in code size. This also disables - SLUB sysfs support. /sys/slab will not exist and there will be - no support for cache validation etc. - -config COMPAT_BRK - bool "Disable heap randomization" - default y - help - Randomizing heap placement makes heap exploits harder, but it - also breaks ancient binaries (including anything libc5 based). - This option changes the bootup default to heap randomization - disabled, and can be overridden at runtime by setting - /proc/sys/kernel/randomize_va_space to 2. - - On non-ancient distros (post-2000 ones) N is usually a safe choice. - -config MMAP_ALLOW_UNINITIALIZED - bool "Allow mmapped anonymous memory to be uninitialized" - depends on EXPERT && !MMU - default n - help - Normally, and according to the Linux spec, anonymous memory obtained - from mmap() has its contents cleared before it is passed to - userspace. Enabling this config option allows you to request that - mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus - providing a huge performance boost. If this option is not enabled, - then the flag will be ignored. - - This is taken advantage of by uClibc's malloc(), and also by - ELF-FDPIC binfmt's brk and stack allocator. - - Because of the obvious security issues, this option should only be - enabled on embedded devices where you control what is run in - userspace. Since that isn't generally a problem on no-MMU systems, - it is normally safe to say Y here. - - See Documentation/admin-guide/mm/nommu-mmap.rst for more information. - config SYSTEM_DATA_VERIFICATION def_bool n select SYSTEM_TRUSTED_KEYRING diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 075cd25363ac..25763dcc7c4b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -698,40 +698,6 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT help Debug objects boot parameter default value -config DEBUG_SLAB - bool "Debug slab memory allocations" - depends on DEBUG_KERNEL && SLAB - help - Say Y here to have the kernel do limited verification on memory - allocation as well as poisoning memory on free to catch use of freed - memory. This can make kmalloc/kfree-intensive workloads much slower. - -config SLUB_DEBUG_ON - bool "SLUB debugging on by default" - depends on SLUB && SLUB_DEBUG - default n - help - Boot with debugging on by default. SLUB boots by default with - the runtime debug capabilities switched off. 
Enabling this is - equivalent to specifying the "slub_debug" parameter on boot. - There is no support for more fine grained debug control like - possible with slub_debug=xxx. SLUB debugging may be switched - off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying - "slub_debug=-". - -config SLUB_STATS - default n - bool "Enable SLUB performance statistics" - depends on SLUB && SYSFS - help - SLUB statistics are useful to debug SLUBs allocation behavior in - order find ways to optimize the allocator. This should never be - enabled for production use since keeping statistics slows down - the allocator by a few percentage points. The slabinfo command - supports the determination of the most active slabs to figure - out which slabs are relevant to a particular load. - Try running: slabinfo -DA - config HAVE_DEBUG_KMEMLEAK bool diff --git a/mm/Kconfig b/mm/Kconfig index 905c205e14f3..169e64192e48 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -270,6 +270,19 @@ config SLAB_FREELIST_HARDENED sanity-checking than others. This option is most effective with CONFIG_SLUB. +config SLUB_STATS + default n + bool "Enable SLUB performance statistics" + depends on SLUB && SYSFS + help + SLUB statistics are useful to debug SLUBs allocation behavior in + order find ways to optimize the allocator. This should never be + enabled for production use since keeping statistics slows down + the allocator by a few percentage points. The slabinfo command + supports the determination of the most active slabs to figure + out which slabs are relevant to a particular load. + Try running: slabinfo -DA + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP @@ -307,6 +320,40 @@ config SHUFFLE_PAGE_ALLOCATOR Say Y if unsure. +config COMPAT_BRK + bool "Disable heap randomization" + default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). + This option changes the bootup default to heap randomization + disabled, and can be overridden at runtime by setting + /proc/sys/kernel/randomize_va_space to 2. + + On non-ancient distros (post-2000 ones) N is usually a safe choice. + +config MMAP_ALLOW_UNINITIALIZED + bool "Allow mmapped anonymous memory to be uninitialized" + depends on EXPERT && !MMU + default n + help + Normally, and according to the Linux spec, anonymous memory obtained + from mmap() has its contents cleared before it is passed to + userspace. Enabling this config option allows you to request that + mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus + providing a huge performance boost. If this option is not enabled, + then the flag will be ignored. + + This is taken advantage of by uClibc's malloc(), and also by + ELF-FDPIC binfmt's brk and stack allocator. + + Because of the obvious security issues, this option should only be + enabled on embedded devices where you control what is run in + userspace. Since that isn't generally a problem on no-MMU systems, + it is normally safe to say Y here. + + See Documentation/admin-guide/mm/nommu-mmap.rst for more information. + config SELECT_MEMORY_MODEL def_bool y depends on ARCH_SELECT_MEMORY_MODEL @@ -964,6 +1011,15 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool +config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EXPERT + help + VM event counters are needed for event counts to be shown. + This option allows the disabling of the VM event counters + on EXPERT systems. 
/proc/vmstat will only show page counts + if VM event counters are disabled. + config PERCPU_STATS bool "Collect percpu memory statistics" help diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5bd5bb097252..197eb287bf82 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -45,6 +45,37 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT Enable debug page memory allocations by default? This value can be overridden by debug_pagealloc=off|on. +config DEBUG_SLAB + bool "Debug slab memory allocations" + depends on DEBUG_KERNEL && SLAB + help + Say Y here to have the kernel do limited verification on memory + allocation as well as poisoning memory on free to catch use of freed + memory. This can make kmalloc/kfree-intensive workloads much slower. + +config SLUB_DEBUG + default y + bool "Enable SLUB debugging support" if EXPERT + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. Disabling these can + result in significant savings in code size. This also disables + SLUB sysfs support. /sys/slab will not exist and there will be + no support for cache validation etc. + +config SLUB_DEBUG_ON + bool "SLUB debugging on by default" + depends on SLUB && SLUB_DEBUG + default n + help + Boot with debugging on by default. SLUB boots by default with + the runtime debug capabilities switched off. Enabling this is + equivalent to specifying the "slub_debug" parameter on boot. + There is no support for more fine grained debug control like + possible with slub_debug=xxx. SLUB debugging may be switched + off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying + "slub_debug=-". + config PAGE_OWNER bool "Track page owner" depends on DEBUG_KERNEL && STACKTRACE_SUPPORT From fa020a2b87d24016723fff4a4237deb612478a32 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 25 May 2022 15:17:09 -0700 Subject: [PATCH 25/25] mm/shmem.c: suppress shift warning mm/shmem.c:1948 shmem_getpage_gfp() warn: should '(((1) << 12) / 512) << folio_order(folio)' be a 64 bit type? On i386, so an unsigned long is 32-bit, but i_blocks is a 64-bit blkcnt_t. Reported-by: kernel test robot Reported-by: Jessica Clarke Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 54149ce679be..c24f684022fd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1945,7 +1945,7 @@ alloc_nohuge: spin_lock_irq(&info->lock); info->alloced += folio_nr_pages(folio); - inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio); + inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); alloced = true;
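One closing note on the (blkcnt_t) cast in [PATCH 25/25]: the shift in BLOCKS_PER_PAGE << folio_order(folio) is evaluated in the 32-bit type of BLOCKS_PER_PAGE on i386 and only then widened for the 64-bit i_blocks addition, which is what the checker flags; the cast widens the operand before the shift. A small userspace-style sketch of that pattern (the order value is exaggerated so the truncation is visible on a 32-bit build; real folio orders are far smaller, which is why this is a warning fix rather than an observed bug):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned long blocks_per_page = 4096 / 512;	/* stands in for BLOCKS_PER_PAGE: 8 */
		unsigned int order = 30;			/* exaggerated folio order */
		uint64_t i_blocks;				/* stands in for the 64-bit blkcnt_t */

		/* Shift evaluated in unsigned long: on i386 (32-bit long) the high bits are lost. */
		i_blocks = blocks_per_page << order;
		printf("without cast: %llu\n", (unsigned long long)i_blocks);

		/* Widen first, as the patch does with (blkcnt_t), then shift in 64 bits. */
		i_blocks = (uint64_t)blocks_per_page << order;
		printf("with cast:    %llu\n", (unsigned long long)i_blocks);

		return 0;
	}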