From 04a42e72d77a93a166b79c34b7bc862f55a53967 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Dec 2022 22:17:57 -0800 Subject: [PATCH 001/505] mm: move folio_set_compound_order() to mm/internal.h folio_set_compound_order() is moved to an mm-internal location so external folio users cannot misuse this function. Change the name of the function to folio_set_order() and use WARN_ON_ONCE() rather than BUG_ON. Also, handle the case if a non-large folio is passed and add clarifying comments to the function. Link: https://lore.kernel.org/lkml/20221207223731.32784-1-sidhartha.kumar@oracle.com/T/ Link: https://lkml.kernel.org/r/20221215061757.223440-1-sidhartha.kumar@oracle.com Fixes: 9fd330582b2f ("mm: add folio dtor and order setter functions") Signed-off-by: Sidhartha Kumar Suggested-by: Mike Kravetz Suggested-by: Muchun Song Suggested-by: Matthew Wilcox Suggested-by: John Hubbard Reviewed-by: John Hubbard Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ---------------- mm/hugetlb.c | 6 +++--- mm/internal.h | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f857163ac89..253b2d7489e6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1019,22 +1019,6 @@ static inline void set_compound_order(struct page *page, unsigned int order) #endif } -/* - * folio_set_compound_order is generally passed a non-zero order to - * initialize a large folio. However, hugetlb code abuses this by - * passing in zero when 'dissolving' a large folio. - */ -static inline void folio_set_compound_order(struct folio *folio, - unsigned int order) -{ - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - - folio->_folio_order = order; -#ifdef CONFIG_64BIT - folio->_folio_nr_pages = order ? 1U << order : 0; -#endif -} - /* Returns the number of pages in this potentially compound page. */ static inline unsigned long compound_nr(struct page *page) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bdbfeb6fb393..cfd47a66ded0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1492,7 +1492,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, set_page_refcounted(p); } - folio_set_compound_order(folio, 0); + folio_set_order(folio, 0); __folio_clear_head(folio); } @@ -1956,7 +1956,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, __folio_clear_reserved(folio); __folio_set_head(folio); /* we rely on prep_new_hugetlb_folio to set the destructor */ - folio_set_compound_order(folio, order); + folio_set_order(folio, order); for (i = 0; i < nr_pages; i++) { p = folio_page(folio, i); @@ -2020,7 +2020,7 @@ out_error: p = folio_page(folio, j); __ClearPageReserved(p); } - folio_set_compound_order(folio, 0); + folio_set_order(folio, 0); __folio_clear_head(folio); return false; } diff --git a/mm/internal.h b/mm/internal.h index bcf75a8b032d..1d6f4e168510 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -378,6 +378,25 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, int split_free_page(struct page *free_page, unsigned int order, unsigned long split_pfn_offset); +/* + * This will have no effect, other than possibly generating a warning, if the + * caller passes in a non-large folio. 
+ */ +static inline void folio_set_order(struct folio *folio, unsigned int order) +{ + if (WARN_ON_ONCE(!folio_test_large(folio))) + return; + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + /* + * When hugetlb dissolves a folio, we need to clear the tail + * page, rather than setting nr_pages to 1. + */ + folio->_folio_nr_pages = order ? 1U << order : 0; +#endif +} + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* From f1eb1bacfba9019823b2fce42383f010cd561fa6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 14 Dec 2022 15:15:33 -0500 Subject: [PATCH 002/505] mm/uffd: always wr-protect pte in pte|pmd_mkuffd_wp() This patch is a cleanup to always wr-protect pte/pmd in mkuffd_wp paths. The reasons I still think this patch is worthwhile, are: (1) It is a cleanup already; diffstat tells. (2) It just feels natural after I thought about this, if the pte is uffd protected, let's remove the write bit no matter what it was. (2) Since x86 is the only arch that supports uffd-wp, it also redefines pte|pmd_mkuffd_wp() in that it should always contain removals of write bits. It means any future arch that want to implement uffd-wp should naturally follow this rule too. It's good to make it a default, even if with vm_page_prot changes on VM_UFFD_WP. (3) It covers more than vm_page_prot. So no chance of any potential future "accident" (like pte_mkdirty() sparc64 or loongarch, even though it just got its pte_mkdirty fixed <1 month ago). It'll be fairly clear when reading the code too that we don't worry anything before a pte_mkuffd_wp() on uncertainty of the write bit. We may call pte_wrprotect() one more time in some paths (e.g. thp split), but that should be fully local bitop instruction so the overhead should be negligible. Although this patch should logically also fix all the known issues on uffd-wp too recently on page migration (not for numa hint recovery - that may need another explcit pte_wrprotect), but this is not the plan for that fix. So no fixes, and stable doesn't need this. 
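As a minimal illustration of the invariant this cleanup establishes (not part of the patch itself, just the before/after calling convention that the diff below enables):

  /* Before: callers had to pair the two operations by hand. */
  pte = pte_wrprotect(pte_mkuffd_wp(pte));

  /* After: pte_mkuffd_wp() clears the write bit itself. */
  pte = pte_mkuffd_wp(pte);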
Link: https://lkml.kernel.org/r/20221214201533.1774616-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Ives van Hoorne Cc: Mike Kravetz Cc: Nadav Amit Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 24 ++++++++++++------------ include/asm-generic/hugetlb.h | 16 ++++++++-------- mm/huge_memory.c | 8 +++----- mm/hugetlb.c | 4 ++-- mm/memory.c | 8 +++----- mm/mprotect.c | 6 ++---- mm/userfaultfd.c | 18 ++---------------- 7 files changed, 32 insertions(+), 52 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0564edd24ffb..1c843395a8b3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -289,6 +289,11 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +static inline pte_t pte_wrprotect(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_RW); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { @@ -313,7 +318,7 @@ static inline int pte_uffd_wp(pte_t pte) static inline pte_t pte_mkuffd_wp(pte_t pte) { - return pte_set_flags(pte, _PAGE_UFFD_WP); + return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); } static inline pte_t pte_clear_uffd_wp(pte_t pte) @@ -332,11 +337,6 @@ static inline pte_t pte_mkold(pte_t pte) return pte_clear_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_wrprotect(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_RW); -} - static inline pte_t pte_mkexec(pte_t pte) { return pte_clear_flags(pte, _PAGE_NX); @@ -401,6 +401,11 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) return native_make_pmd(v & ~clear); } +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pmd_uffd_wp(pmd_t pmd) { @@ -409,7 +414,7 @@ static inline int pmd_uffd_wp(pmd_t pmd) static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_UFFD_WP); + return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP)); } static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) @@ -428,11 +433,6 @@ static inline pmd_t pmd_mkclean(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_DIRTY); } -static inline pmd_t pmd_wrprotect(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_RW); -} - static inline pmd_t pmd_mkdirty(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index a57d667addd2..d7f6335d3999 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -25,6 +25,13 @@ static inline pte_t huge_pte_mkwrite(pte_t pte) return pte_mkwrite(pte); } +#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT +static inline pte_t huge_pte_wrprotect(pte_t pte) +{ + return pte_wrprotect(pte); +} +#endif + static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); @@ -37,7 +44,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { - return pte_mkuffd_wp(pte); + return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) @@ -104,13 +111,6 @@ static inline int huge_pte_none_mostly(pte_t pte) return huge_pte_none(pte) || is_pte_marker(pte); } -#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} -#endif - #ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int 
prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index abe6cfd92ffa..867f02e6061d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1920,17 +1920,15 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); - if (uffd_wp) { - entry = pmd_wrprotect(entry); + if (uffd_wp) entry = pmd_mkuffd_wp(entry); - } else if (uffd_wp_resolve) { + else if (uffd_wp_resolve) /* * Leave the write bit to be handled by PF interrupt * handler, then things like COW could be properly * handled. */ entry = pmd_clear_uffd_wp(entry); - } /* See change_pte_range(). */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && @@ -3275,7 +3273,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (is_writable_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) - pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); + pmde = pmd_mkuffd_wp(pmde); if (!is_migration_entry_young(entry)) pmde = pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cfd47a66ded0..92b3fd01a652 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5919,7 +5919,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * if populated. */ if (unlikely(pte_marker_uffd_wp(old_pte))) - new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte)); + new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); @@ -6728,7 +6728,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (uffd_wp) - pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte)); + pte = huge_pte_mkuffd_wp(pte); else if (uffd_wp_resolve) pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); diff --git a/mm/memory.c b/mm/memory.c index 3e836fecd035..ca490596b36f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -878,7 +878,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, *src_pte)) /* Uffd-wp needs to be delivered to dest pte as well */ - pte = pte_wrprotect(pte_mkuffd_wp(pte)); + pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -3950,10 +3950,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - if (pte_swp_uffd_wp(vmf->orig_pte)) { + if (pte_swp_uffd_wp(vmf->orig_pte)) pte = pte_mkuffd_wp(pte); - pte = pte_wrprotect(pte); - } vmf->orig_pte = pte; /* ksm created a completely new copy */ @@ -4296,7 +4294,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (unlikely(uffd_wp)) - entry = pte_mkuffd_wp(pte_wrprotect(entry)); + entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter(vma->vm_mm, MM_ANONPAGES); diff --git a/mm/mprotect.c b/mm/mprotect.c index 61cf60015a8b..bf8fa0af5a15 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -177,12 +177,10 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = ptep_modify_prot_start(vma, addr, 
pte); ptent = pte_modify(oldpte, newprot); - if (uffd_wp) { - ptent = pte_wrprotect(ptent); + if (uffd_wp) ptent = pte_mkuffd_wp(ptent); - } else if (uffd_wp_resolve) { + else if (uffd_wp_resolve) ptent = pte_clear_uffd_wp(ptent); - } /* * In some writable, shared mappings, we might want diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0499907b6f1a..f8d31b82aceb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -74,24 +74,10 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - - /* - * Always mark a PTE as write-protected when needed, regardless of - * VM_WRITE, which the user might change. - */ - if (wp_copy) { - _dst_pte = pte_mkuffd_wp(_dst_pte); - writable = false; - } - if (writable) _dst_pte = pte_mkwrite(_dst_pte); - else - /* - * We need this to make sure write bit removed; as mk_pte() - * could return a pte with write bit set. - */ - _dst_pte = pte_wrprotect(_dst_pte); + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); From 6fd7353829cafc4067aad9eea0dc95da67e7df16 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Thu, 15 Dec 2022 00:12:01 +0000 Subject: [PATCH 003/505] mm/memfd: add F_SEAL_EXEC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/memfd: introduce MFD_NOEXEC_SEAL and MFD_EXEC", v8. Since Linux introduced the memfd feature, memfd have always had their execute bit set, and the memfd_create() syscall doesn't allow setting it differently. However, in a secure by default system, such as ChromeOS, (where all executables should come from the rootfs, which is protected by Verified boot), this executable nature of memfd opens a door for NoExec bypass and enables “confused deputy attack”. E.g, in VRP bug [1]: cros_vm process created a memfd to share the content with an external process, however the memfd is overwritten and used for executing arbitrary code and root escalation. [2] lists more VRP in this kind. On the other hand, executable memfd has its legit use, runc uses memfd’s seal and executable feature to copy the contents of the binary then execute them, for such system, we need a solution to differentiate runc's use of executable memfds and an attacker's [3]. To address those above, this set of patches add following: 1> Let memfd_create() set X bit at creation time. 2> Let memfd to be sealed for modifying X bit. 3> A new pid namespace sysctl: vm.memfd_noexec to control the behavior of X bit.For example, if a container has vm.memfd_noexec=2, then memfd_create() without MFD_NOEXEC_SEAL will be rejected. 4> A new security hook in memfd_create(). This make it possible to a new LSM, which rejects or allows executable memfd based on its security policy. This patch (of 5): The new F_SEAL_EXEC flag will prevent modification of the exec bits: written as traditional octal mask, 0111, or as named flags, S_IXUSR | S_IXGRP | S_IXOTH. Any chmod(2) or similar call that attempts to modify any of these bits after the seal is applied will fail with errno EPERM. This will preserve the execute bits as they are at the time of sealing, so the memfd will become either permanently executable or permanently un-executable. 
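A minimal userspace sketch of the intended flow (illustrative only, assuming a kernel with this series applied; F_SEAL_EXEC is defined locally to match the new uapi value added below):

  #define _GNU_SOURCE
  #include <assert.h>
  #include <errno.h>
  #include <fcntl.h>
  #include <sys/mman.h>
  #include <sys/stat.h>

  #ifndef F_SEAL_EXEC
  #define F_SEAL_EXEC 0x0020
  #endif

  int main(void)
  {
          int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);

          assert(fd >= 0);
          /* Mode changes are still allowed before the seal is applied. */
          assert(fchmod(fd, 0644) == 0);
          assert(fcntl(fd, F_ADD_SEALS, F_SEAL_EXEC) == 0);
          /* Any attempt to change the exec bits now fails with EPERM. */
          assert(fchmod(fd, 0755) == -1 && errno == EPERM);
          return 0;
  }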
Link: https://lkml.kernel.org/r/20221215001205.51969-1-jeffxu@google.com Link: https://lkml.kernel.org/r/20221215001205.51969-2-jeffxu@google.com Signed-off-by: Daniel Verkamp Co-developed-by: Jeff Xu Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Shuah Khan Cc: David Herrmann Cc: kernel test robot Signed-off-by: Andrew Morton --- include/uapi/linux/fcntl.h | 1 + mm/memfd.c | 2 ++ mm/shmem.c | 6 ++++++ 3 files changed, 9 insertions(+) diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 2f86b2ad6d7e..e8c07da58c9f 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -43,6 +43,7 @@ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ #define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ +#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/mm/memfd.c b/mm/memfd.c index 08f5f8304746..4ebeab94aa74 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -147,6 +147,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) } #define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_EXEC | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE | \ @@ -175,6 +176,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals) * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file + * SEAL_EXEC: Prevent modification of the exec bits in the file mode * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file diff --git a/mm/shmem.c b/mm/shmem.c index 0005ab2c29af..d3f0c94f6836 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1093,6 +1093,12 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (error) return error; + if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { + if ((inode->i_mode ^ attr->ia_mode) & 0111) { + return -EPERM; + } + } + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; From 32d118ad50a5afecb74358bcefc5cb6ea6ccfc2b Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Thu, 15 Dec 2022 00:12:02 +0000 Subject: [PATCH 004/505] selftests/memfd: add tests for F_SEAL_EXEC Basic tests to ensure that user/group/other execute bits cannot be changed after applying F_SEAL_EXEC to a memfd. 
Link: https://lkml.kernel.org/r/20221215001205.51969-3-jeffxu@google.com Signed-off-by: Daniel Verkamp Co-developed-by: Jeff Xu Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: kernel test robot Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 123 ++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 94df2692e6e4..f18a15a1f275 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -28,12 +28,38 @@ #define MFD_DEF_SIZE 8192 #define STACK_SIZE 65536 +#define F_SEAL_EXEC 0x0020 + /* * Default is not to test hugetlbfs */ static size_t mfd_def_size = MFD_DEF_SIZE; static const char *memfd_str = MEMFD_STR; +static ssize_t fd2name(int fd, char *buf, size_t bufsize) +{ + char buf1[PATH_MAX]; + int size; + ssize_t nbytes; + + size = snprintf(buf1, PATH_MAX, "/proc/self/fd/%d", fd); + if (size < 0) { + printf("snprintf(%d) failed on %m\n", fd); + abort(); + } + + /* + * reserver one byte for string termination. + */ + nbytes = readlink(buf1, buf, bufsize-1); + if (nbytes == -1) { + printf("readlink(%s) failed %m\n", buf1); + abort(); + } + buf[nbytes] = '\0'; + return nbytes; +} + static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) { int r, fd; @@ -98,11 +124,14 @@ static unsigned int mfd_assert_get_seals(int fd) static void mfd_assert_has_seals(int fd, unsigned int seals) { + char buf[PATH_MAX]; + int nbytes; unsigned int s; + fd2name(fd, buf, PATH_MAX); s = mfd_assert_get_seals(fd); if (s != seals) { - printf("%u != %u = GET_SEALS(%d)\n", seals, s, fd); + printf("%u != %u = GET_SEALS(%s)\n", seals, s, buf); abort(); } } @@ -594,6 +623,64 @@ static void mfd_fail_grow_write(int fd) } } +static void mfd_assert_mode(int fd, int mode) +{ + struct stat st; + char buf[PATH_MAX]; + int nbytes; + + fd2name(fd, buf, PATH_MAX); + + if (fstat(fd, &st) < 0) { + printf("fstat(%s) failed: %m\n", buf); + abort(); + } + + if ((st.st_mode & 07777) != mode) { + printf("fstat(%s) wrong file mode 0%04o, but expected 0%04o\n", + buf, (int)st.st_mode & 07777, mode); + abort(); + } +} + +static void mfd_assert_chmod(int fd, int mode) +{ + char buf[PATH_MAX]; + int nbytes; + + fd2name(fd, buf, PATH_MAX); + + if (fchmod(fd, mode) < 0) { + printf("fchmod(%s, 0%04o) failed: %m\n", buf, mode); + abort(); + } + + mfd_assert_mode(fd, mode); +} + +static void mfd_fail_chmod(int fd, int mode) +{ + struct stat st; + char buf[PATH_MAX]; + int nbytes; + + fd2name(fd, buf, PATH_MAX); + + if (fstat(fd, &st) < 0) { + printf("fstat(%s) failed: %m\n", buf); + abort(); + } + + if (fchmod(fd, mode) == 0) { + printf("fchmod(%s, 0%04o) didn't fail as expected\n", + buf, mode); + abort(); + } + + /* verify that file mode bits did not change */ + mfd_assert_mode(fd, st.st_mode & 07777); +} + static int idle_thread_fn(void *arg) { sigset_t set; @@ -880,6 +967,39 @@ static void test_seal_resize(void) close(fd); } +/* + * Test SEAL_EXEC + * Test that chmod() cannot change x bits after sealing + */ +static void test_seal_exec(void) +{ + int fd; + + printf("%s SEAL-EXEC\n", memfd_str); + + fd = mfd_assert_new("kern_memfd_seal_exec", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0777); + + mfd_assert_chmod(fd, 0644); + + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, 
F_SEAL_EXEC); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + + mfd_assert_chmod(fd, 0600); + mfd_fail_chmod(fd, 0777); + mfd_fail_chmod(fd, 0670); + mfd_fail_chmod(fd, 0605); + mfd_fail_chmod(fd, 0700); + mfd_fail_chmod(fd, 0100); + mfd_assert_chmod(fd, 0666); + + close(fd); +} + /* * Test sharing via dup() * Test that seals are shared between dupped FDs and they're all equal. @@ -1059,6 +1179,7 @@ int main(int argc, char **argv) test_seal_shrink(); test_seal_grow(); test_seal_resize(); + test_seal_exec(); test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); From 105ff5339f498af74e60d7662c8f1c4d21f1342d Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Thu, 15 Dec 2022 00:12:03 +0000 Subject: [PATCH 005/505] mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC The new MFD_NOEXEC_SEAL and MFD_EXEC flags allows application to set executable bit at creation time (memfd_create). When MFD_NOEXEC_SEAL is set, memfd is created without executable bit (mode:0666), and sealed with F_SEAL_EXEC, so it can't be chmod to be executable (mode: 0777) after creation. when MFD_EXEC flag is set, memfd is created with executable bit (mode:0777), this is the same as the old behavior of memfd_create. The new pid namespaced sysctl vm.memfd_noexec has 3 values: 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like MFD_EXEC was set. 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like MFD_NOEXEC_SEAL was set. 2: memfd_create() without MFD_NOEXEC_SEAL will be rejected. The sysctl allows finer control of memfd_create for old-software that doesn't set the executable bit, for example, a container with vm.memfd_noexec=1 means the old-software will create non-executable memfd by default. Also, the value of memfd_noexec is passed to child namespace at creation time. For example, if the init namespace has vm.memfd_noexec=2, all its children namespaces will be created with 2. [akpm@linux-foundation.org: add stub functions to fix build] [akpm@linux-foundation.org: remove unneeded register_pid_ns_ctl_table_vm() stub, per Jeff] [akpm@linux-foundation.org: s/pr_warn_ratelimited/pr_warn_once/, per review] [akpm@linux-foundation.org: fix CONFIG_SYSCTL=n warning] Link: https://lkml.kernel.org/r/20221215001205.51969-4-jeffxu@google.com Signed-off-by: Jeff Xu Co-developed-by: Daniel Verkamp Signed-off-by: Daniel Verkamp Reported-by: kernel test robot Reviewed-by: Kees Cook Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/pid_namespace.h | 19 +++++++++++ include/uapi/linux/memfd.h | 4 +++ kernel/pid_namespace.c | 5 +++ kernel/pid_sysctl.h | 60 +++++++++++++++++++++++++++++++++++ mm/memfd.c | 48 ++++++++++++++++++++++++++-- 5 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 kernel/pid_sysctl.h diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 07481bb87d4e..c758809d5bcf 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -16,6 +16,21 @@ struct fs_pin; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +/* + * sysctl for vm.memfd_noexec + * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_EXEC was set. + * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_NOEXEC_SEAL was set. + * 2: memfd_create() without MFD_NOEXEC_SEAL will be + * rejected. 
+ */ +#define MEMFD_NOEXEC_SCOPE_EXEC 0 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 +#endif + struct pid_namespace { struct idr idr; struct rcu_head rcu; @@ -31,6 +46,10 @@ struct pid_namespace { struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + /* sysctl for vm.memfd_noexec */ + int memfd_noexec_scope; +#endif } __randomize_layout; extern struct pid_namespace init_pid_ns; diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 7a8a26751c23..273a4e15dfcf 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -8,6 +8,10 @@ #define MFD_CLOEXEC 0x0001U #define MFD_ALLOW_SEALING 0x0002U #define MFD_HUGETLB 0x0004U +/* not executable and sealed to prevent changing to executable. */ +#define MFD_NOEXEC_SEAL 0x0008U +/* executable */ +#define MFD_EXEC 0x0010U /* * Huge page size encoding when MFD_HUGETLB is specified, and a huge page diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f4f8cb0435b4..8a98b1af9376 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include #include #include +#include "pid_sysctl.h" static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; @@ -110,6 +111,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; + initialize_memfd_noexec_scope(ns); + return ns; out_free_idr: @@ -455,6 +458,8 @@ static __init int pid_namespaces_init(void) #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); #endif + + register_pid_ns_sysctl_table_vm(); return 0; } diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h new file mode 100644 index 000000000000..e22d072e1e24 --- /dev/null +++ b/kernel/pid_sysctl.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_PID_SYSCTL_H +#define LINUX_PID_SYSCTL_H + +#include + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) +{ + ns->memfd_noexec_scope = + task_active_pid_ns(current)->memfd_noexec_scope; +} + +static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, + int write, void *buf, size_t *lenp, loff_t *ppos) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct ctl_table table_copy; + + if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + table_copy = *table; + if (ns != &init_pid_ns) + table_copy.data = &ns->memfd_noexec_scope; + + /* + * set minimum to current value, the effect is only bigger + * value is accepted. 
+ */ + if (*(int *)table_copy.data > *(int *)table_copy.extra1) + table_copy.extra1 = table_copy.data; + + return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table_vm[] = { + { + .procname = "memfd_noexec", + .data = &init_pid_ns.memfd_noexec_scope, + .maxlen = sizeof(init_pid_ns.memfd_noexec_scope), + .mode = 0644, + .proc_handler = pid_mfd_noexec_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; +static struct ctl_path vm_path[] = { { .procname = "vm", }, { } }; +static inline void register_pid_ns_sysctl_table_vm(void) +{ + register_sysctl_paths(vm_path, pid_ns_ctl_table_vm); +} +#else +static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} +static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {} +static inline void register_pid_ns_sysctl_table_vm(void) {} +#endif + +#endif /* LINUX_PID_SYSCTL_H */ diff --git a/mm/memfd.c b/mm/memfd.c index 4ebeab94aa74..bc214390e28d 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -263,12 +264,13 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { + char comm[TASK_COMM_LEN]; unsigned int *file_seals; struct file *file; int fd, error; @@ -285,6 +287,40 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; } + /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ + if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) + return -EINVAL; + + if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { +#ifdef CONFIG_SYSCTL + int sysctl = MEMFD_NOEXEC_SCOPE_EXEC; + struct pid_namespace *ns; + + ns = task_active_pid_ns(current); + if (ns) + sysctl = ns->memfd_noexec_scope; + + switch (sysctl) { + case MEMFD_NOEXEC_SCOPE_EXEC: + flags |= MFD_EXEC; + break; + case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL: + flags |= MFD_NOEXEC_SEAL; + break; + default: + pr_warn_once( + "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n", + task_pid_nr(current), get_task_comm(comm, current)); + return -EINVAL; + } +#else + flags |= MFD_EXEC; +#endif + pr_warn_once( + "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n", + task_pid_nr(current), get_task_comm(comm, current)); + } + /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); if (len <= 0) @@ -328,7 +364,15 @@ SYSCALL_DEFINE2(memfd_create, file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; - if (flags & MFD_ALLOW_SEALING) { + if (flags & MFD_NOEXEC_SEAL) { + struct inode *inode = file_inode(file); + + inode->i_mode &= ~0111; + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } else if (flags & MFD_ALLOW_SEALING) { + /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); *file_seals &= ~F_SEAL_SEAL; } From c4f75bc8bd6b3d62665e1f5400c419540edb5601 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Thu, 15 Dec 2022 00:12:04 +0000 Subject: [PATCH 006/505] mm/memfd: add write seals when apply SEAL_EXEC to executable memfd In order to avoid WX mappings, add F_SEAL_WRITE when apply F_SEAL_EXEC to an executable memfd, so 
W^X from start. This implys application need to fill the content of the memfd first, after F_SEAL_EXEC is applied, application can no longer modify the content of the memfd. Typically, application seals the memfd right after writing to it. For example: 1. memfd_create(MFD_EXEC). 2. write() code to the memfd. 3. fcntl(F_ADD_SEALS, F_SEAL_EXEC) to convert the memfd to W^X. 4. call exec() on the memfd. Link: https://lkml.kernel.org/r/20221215001205.51969-5-jeffxu@google.com Signed-off-by: Jeff Xu Reviewed-by: Kees Cook Cc: Daniel Verkamp Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: kernel test robot Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/memfd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/memfd.c b/mm/memfd.c index bc214390e28d..a0a7a37e8177 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -222,6 +222,12 @@ static int memfd_add_seals(struct file *file, unsigned int seals) } } + /* + * SEAL_EXEC implys SEAL_WRITE, making W^X from the start. + */ + if (seals & F_SEAL_EXEC && inode->i_mode & 0111) + seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; + *file_seals |= seals; error = 0; From 11f75a01448f1b7a739e75dbd8f17b844fcfc510 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Thu, 15 Dec 2022 00:12:05 +0000 Subject: [PATCH 007/505] selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC Tests to verify MFD_NOEXEC, MFD_EXEC and vm.memfd_noexec sysctl. Link: https://lkml.kernel.org/r/20221215001205.51969-6-jeffxu@google.com Signed-off-by: Jeff Xu Co-developed-by: Daniel Verkamp Signed-off-by: Daniel Verkamp Reviewed-by: Kees Cook Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: kernel test robot Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/fuse_test.c | 1 + tools/testing/selftests/memfd/memfd_test.c | 230 ++++++++++++++++++++- 2 files changed, 225 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index be675002f918..93798c8c5d54 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index f18a15a1f275..ae71f15f790d 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -30,6 +30,14 @@ #define F_SEAL_EXEC 0x0020 +#define F_WX_SEALS (F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE | \ + F_SEAL_EXEC) + +#define MFD_NOEXEC_SEAL 0x0008U + /* * Default is not to test hugetlbfs */ @@ -80,6 +88,37 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) return fd; } +static void sysctl_assert_write(const char *val) +{ + int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC); + + if (fd < 0) { + printf("open sysctl failed\n"); + abort(); + } + + if (write(fd, val, strlen(val)) < 0) { + printf("write sysctl failed\n"); + abort(); + } +} + +static void sysctl_fail_write(const char *val) +{ + int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC); + + if (fd < 0) { + printf("open sysctl failed\n"); + abort(); + } + + if (write(fd, val, strlen(val)) >= 0) { + printf("write sysctl %s succeeded, but failure expected\n", + val); + abort(); + } +} + static int mfd_assert_reopen_fd(int fd_in) { int fd; @@ -758,6 +797,9 
@@ static void test_create(void) mfd_fail_new("", ~0); mfd_fail_new("", 0x80000000U); + /* verify EXEC and NOEXEC_SEAL can't both be set */ + mfd_fail_new("", MFD_EXEC | MFD_NOEXEC_SEAL); + /* verify MFD_CLOEXEC is allowed */ fd = mfd_assert_new("", 0, MFD_CLOEXEC); close(fd); @@ -969,20 +1011,21 @@ static void test_seal_resize(void) /* * Test SEAL_EXEC - * Test that chmod() cannot change x bits after sealing + * Test fd is created with exec and allow sealing. + * chmod() cannot change x bits after sealing. */ -static void test_seal_exec(void) +static void test_exec_seal(void) { int fd; printf("%s SEAL-EXEC\n", memfd_str); + printf("%s Apply SEAL_EXEC\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_exec", mfd_def_size, - MFD_CLOEXEC | MFD_ALLOW_SEALING); + MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC); mfd_assert_mode(fd, 0777); - mfd_assert_chmod(fd, 0644); mfd_assert_has_seals(fd, 0); @@ -996,8 +1039,179 @@ static void test_seal_exec(void) mfd_fail_chmod(fd, 0700); mfd_fail_chmod(fd, 0100); mfd_assert_chmod(fd, 0666); - + mfd_assert_write(fd); close(fd); + + printf("%s Apply ALL_SEALS\n", memfd_str); + fd = mfd_assert_new("kern_memfd_seal_exec", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_EXEC); + + mfd_assert_mode(fd, 0777); + mfd_assert_chmod(fd, 0700); + + mfd_assert_has_seals(fd, 0); + mfd_assert_add_seals(fd, F_SEAL_EXEC); + mfd_assert_has_seals(fd, F_WX_SEALS); + + mfd_fail_chmod(fd, 0711); + mfd_fail_chmod(fd, 0600); + mfd_fail_write(fd); + close(fd); +} + +/* + * Test EXEC_NO_SEAL + * Test fd is created with exec and not allow sealing. + */ +static void test_exec_no_seal(void) +{ + int fd; + + printf("%s EXEC_NO_SEAL\n", memfd_str); + + /* Create with EXEC but without ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_exec_no_sealing", + mfd_def_size, + MFD_CLOEXEC | MFD_EXEC); + mfd_assert_mode(fd, 0777); + mfd_assert_has_seals(fd, F_SEAL_SEAL); + mfd_assert_chmod(fd, 0666); + close(fd); +} + +/* + * Test memfd_create with MFD_NOEXEC flag + */ +static void test_noexec_seal(void) +{ + int fd; + + printf("%s NOEXEC_SEAL\n", memfd_str); + + /* Create with NOEXEC and ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_noexec", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_NOEXEC_SEAL); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); + + /* Create with NOEXEC but without ALLOW_SEALING */ + fd = mfd_assert_new("kern_memfd_noexec", + mfd_def_size, + MFD_CLOEXEC | MFD_NOEXEC_SEAL); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); +} + +static void test_sysctl_child(void) +{ + int fd; + + printf("%s sysctl 0\n", memfd_str); + sysctl_assert_write("0"); + fd = mfd_assert_new("kern_memfd_sysctl_0", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0777); + mfd_assert_has_seals(fd, 0); + mfd_assert_chmod(fd, 0644); + close(fd); + + printf("%s sysctl 1\n", memfd_str); + sysctl_assert_write("1"); + fd = mfd_assert_new("kern_memfd_sysctl_1", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + sysctl_fail_write("0"); + close(fd); + + printf("%s sysctl 2\n", memfd_str); + sysctl_assert_write("2"); + mfd_fail_new("kern_memfd_sysctl_2", + MFD_CLOEXEC | MFD_ALLOW_SEALING); + sysctl_fail_write("0"); + sysctl_fail_write("1"); +} + +static int newpid_thread_fn(void *arg) +{ + test_sysctl_child(); + return 0; +} + +static void 
test_sysctl_child2(void) +{ + int fd; + + sysctl_fail_write("0"); + fd = mfd_assert_new("kern_memfd_sysctl_1", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); +} + +static int newpid_thread_fn2(void *arg) +{ + test_sysctl_child2(); + return 0; +} +static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *)) +{ + uint8_t *stack; + pid_t pid; + + stack = malloc(STACK_SIZE); + if (!stack) { + printf("malloc(STACK_SIZE) failed: %m\n"); + abort(); + } + + pid = clone(fn, + stack + STACK_SIZE, + SIGCHLD | flags, + NULL); + if (pid < 0) { + printf("clone() failed: %m\n"); + abort(); + } + + return pid; +} + +static void join_newpid_thread(pid_t pid) +{ + waitpid(pid, NULL, 0); +} + +/* + * Test sysctl + * A very basic sealing test to see whether setting/retrieving seals works. + */ +static void test_sysctl(void) +{ + int pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn); + + join_newpid_thread(pid); + + printf("%s child ns\n", memfd_str); + sysctl_assert_write("1"); + + pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2); + join_newpid_thread(pid); } /* @@ -1173,13 +1387,15 @@ int main(int argc, char **argv) test_create(); test_basic(); + test_exec_seal(); + test_exec_no_seal(); + test_noexec_seal(); test_seal_write(); test_seal_future_write(); test_seal_shrink(); test_seal_grow(); test_seal_resize(); - test_seal_exec(); test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); @@ -1195,6 +1411,8 @@ int main(int argc, char **argv) test_share_fork("SHARE-FORK", SHARED_FT_STR); join_idle_thread(pid); + test_sysctl(); + printf("memfd: DONE\n"); return 0; From 379c2e60e82ff71510a949033bf8431f39f66c75 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 12 Dec 2022 15:50:42 -0800 Subject: [PATCH 008/505] hugetlb: update vma flag check for hugetlb vma lock The check for whether a hugetlb vma lock exists partially depends on the vma's flags. Currently, it checks for either VM_MAYSHARE or VM_SHARED. The reason both flags are used is because VM_MAYSHARE was previously cleared in hugetlb vmas as they are tore down. This is no longer the case, and only the VM_MAYSHARE check is required. Link: https://lkml.kernel.org/r/20221212235042.178355-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: "Aneesh Kumar K.V" Cc: David Hildenbrand Cc: James Houghton Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92b3fd01a652..ed1ac2df582c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -262,8 +262,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) */ static bool __vma_shareable_lock(struct vm_area_struct *vma) { - return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && - vma->vm_private_data; + return vma->vm_flags & VM_MAYSHARE && vma->vm_private_data; } void hugetlb_vma_lock_read(struct vm_area_struct *vma) From 243b1f2d3b09b6c813c2e4a179cdf5bfc878eb54 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:52 -0500 Subject: [PATCH 009/505] mm/hugetlb: let vma_offset_start() to return start Patch series "mm/hugetlb: Make huge_pte_offset() thread-safe for pmd unshare", v4. Problem ======= huge_pte_offset() is a major helper used by hugetlb code paths to walk a hugetlb pgtable. 
It's used mostly everywhere since that's needed even before taking the pgtable lock. huge_pte_offset() is always called with mmap lock held with either read or write. It was assumed to be safe but it's actually not. One race condition can easily trigger by: (1) firstly trigger pmd share on a memory range, (2) do huge_pte_offset() on the range, then at the meantime, (3) another thread unshare the pmd range, and the pgtable page is prone to lost if the other shared process wants to free it completely (by either munmap or exit mm). The recent work from Mike on vma lock can resolve most of this already. It's achieved by forbidden pmd unsharing during the lock being taken, so no further risk of the pgtable page being freed. It means if we can take the vma lock around all huge_pte_offset() callers it'll be safe. There're already a bunch of them that we did as per the latest mm-unstable, but also quite a few others that we didn't for various reasons especially on huge_pte_offset() usage. One more thing to mention is that besides the vma lock, i_mmap_rwsem can also be used to protect the pgtable page (along with its pgtable lock) from being freed from under us. IOW, huge_pte_offset() callers need to either hold the vma lock or i_mmap_rwsem to safely walk the pgtables. A reproducer of such problem, based on hugetlb GUP (NOTE: since the race is very hard to trigger, one needs to apply another kernel delay patch too, see below): ======8<======= #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #define MSIZE (1UL << 30) /* 1GB */ #define PSIZE (2UL << 20) /* 2MB */ #define HOLD_SEC (1) int pipefd[2]; void *buf; void *do_map(int fd) { unsigned char *tmpbuf, *p; int ret; ret = posix_memalign((void **)&tmpbuf, MSIZE, MSIZE); if (ret) { perror("posix_memalign() failed"); return NULL; } tmpbuf = mmap(tmpbuf, MSIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); if (tmpbuf == MAP_FAILED) { perror("mmap() failed"); return NULL; } printf("mmap() -> %p\n", tmpbuf); for (p = tmpbuf; p < tmpbuf + MSIZE; p += PSIZE) { *p = 1; } return tmpbuf; } void do_unmap(void *buf) { munmap(buf, MSIZE); } void proc2(int fd) { unsigned char c; buf = do_map(fd); if (!buf) return; read(pipefd[0], &c, 1); /* * This frees the shared pgtable page, causing use-after-free in * proc1_thread1 when soft walking hugetlb pgtable. */ do_unmap(buf); printf("Proc2 quitting\n"); } void *proc1_thread1(void *data) { /* * Trigger follow-page on 1st 2m page. Kernel hack patch needed to * withhold this procedure for easier reproduce. 
*/ madvise(buf, PSIZE, MADV_POPULATE_WRITE); printf("Proc1-thread1 quitting\n"); return NULL; } void *proc1_thread2(void *data) { unsigned char c; /* Wait a while until proc1_thread1() start to wait */ sleep(0.5); /* Trigger pmd unshare */ madvise(buf, PSIZE, MADV_DONTNEED); /* Kick off proc2 to release the pgtable */ write(pipefd[1], &c, 1); printf("Proc1-thread2 quitting\n"); return NULL; } void proc1(int fd) { pthread_t tid1, tid2; int ret; buf = do_map(fd); if (!buf) return; ret = pthread_create(&tid1, NULL, proc1_thread1, NULL); assert(ret == 0); ret = pthread_create(&tid2, NULL, proc1_thread2, NULL); assert(ret == 0); /* Kick the child to share the PUD entry */ pthread_join(tid1, NULL); pthread_join(tid2, NULL); do_unmap(buf); } int main(void) { int fd, ret; fd = memfd_create("test-huge", MFD_HUGETLB | MFD_HUGE_2MB); if (fd < 0) { perror("open failed"); return -1; } ret = ftruncate(fd, MSIZE); if (ret) { perror("ftruncate() failed"); return -1; } ret = pipe(pipefd); if (ret) { perror("pipe() failed"); return -1; } if (fork()) { proc1(fd); } else { proc2(fd); } close(pipefd[0]); close(pipefd[1]); close(fd); return 0; } ======8<======= The kernel patch needed to present such a race so it'll trigger 100%: ======8<======= : diff --git a/mm/hugetlb.c b/mm/hugetlb.c : index 9d97c9a2a15d..f8d99dad5004 100644 : --- a/mm/hugetlb.c : +++ b/mm/hugetlb.c : @@ -38,6 +38,7 @@ : #include : #include : #include : +#include : : #include : #include : @@ -6290,6 +6291,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, : bool unshare = false; : int absent; : struct page *page; : + unsigned long c = 0; : : /* : * If we have a pending SIGKILL, don't keep faulting pages and : @@ -6309,6 +6311,13 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, : */ : pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), : huge_page_size(h)); : + : + pr_info("%s: withhold 1 sec...\n", __func__); : + for (c = 0; c < 100; c++) { : + udelay(10000); : + } : + pr_info("%s: withhold 1 sec...done\n", __func__); : + : if (pte) : ptl = huge_pte_lock(h, mm, pte); : absent = !pte || huge_pte_none(huge_ptep_get(pte)); : ======8<======= It'll trigger use-after-free of the pgtable spinlock: ======8<======= [ 16.959907] follow_hugetlb_page: withhold 1 sec... 
[ 17.960315] follow_hugetlb_page: withhold 1 sec...done [ 17.960550] ------------[ cut here ]------------ [ 17.960742] DEBUG_LOCKS_WARN_ON(1) [ 17.960756] WARNING: CPU: 3 PID: 542 at kernel/locking/lockdep.c:231 __lock_acquire+0x955/0x1fa0 [ 17.961264] Modules linked in: [ 17.961394] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Not tainted 6.1.0-rc4-peterx+ #46 [ 17.961704] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 17.962266] RIP: 0010:__lock_acquire+0x955/0x1fa0 [ 17.962516] Code: c0 0f 84 5f fe ff ff 44 8b 1d 0f 9a 29 02 45 85 db 0f 85 4f fe ff ff 48 c7 c6 75 50 83 82 48 c7 c7 1b 4b 7d 82 e8 d3 22 d8 00 <0f> 0b 31 c0 4c 8b 54 24 08 4c 8b 04 24 e9 [ 17.963494] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010096 [ 17.963704] RAX: 0000000000000016 RBX: fffffffffd3925a8 RCX: 0000000000000000 [ 17.963989] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff [ 17.964276] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffc90000e4fa58 [ 17.964557] R10: 0000000000000003 R11: ffffffff83162688 R12: 0000000000000000 [ 17.964839] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001 [ 17.965123] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000 [ 17.965443] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 17.965672] CR2: 00007f17c09ffef8 CR3: 000000010c87a005 CR4: 0000000000770ee0 [ 17.965956] PKRU: 55555554 [ 17.966068] Call Trace: [ 17.966172] [ 17.966268] ? tick_nohz_tick_stopped+0x12/0x30 [ 17.966455] lock_acquire+0xbf/0x2b0 [ 17.966603] ? follow_hugetlb_page.cold+0x75/0x5c4 [ 17.966799] ? _printk+0x48/0x4e [ 17.966934] _raw_spin_lock+0x2f/0x40 [ 17.967087] ? follow_hugetlb_page.cold+0x75/0x5c4 [ 17.967285] follow_hugetlb_page.cold+0x75/0x5c4 [ 17.967473] __get_user_pages+0xbb/0x620 [ 17.967635] faultin_vma_page_range+0x9a/0x100 [ 17.967817] madvise_vma_behavior+0x3c0/0xbd0 [ 17.967998] ? mas_prev+0x11/0x290 [ 17.968141] ? find_vma_prev+0x5e/0xa0 [ 17.968304] ? 
madvise_vma_anon_name+0x70/0x70 [ 17.968486] madvise_walk_vmas+0xa9/0x120 [ 17.968650] do_madvise.part.0+0xfa/0x270 [ 17.968813] __x64_sys_madvise+0x5a/0x70 [ 17.968974] do_syscall_64+0x37/0x90 [ 17.969123] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 17.969329] RIP: 0033:0x7f1840f0efdb [ 17.969477] Code: c3 66 0f 1f 44 00 00 48 8b 15 39 6e 0e 00 f7 d8 64 89 02 b8 ff ff ff ff eb bc 0f 1f 44 00 00 f3 0f 1e fa b8 1c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 0d 68 [ 17.970205] RSP: 002b:00007f17c09ffe38 EFLAGS: 00000202 ORIG_RAX: 000000000000001c [ 17.970504] RAX: ffffffffffffffda RBX: 00007f17c0a00640 RCX: 00007f1840f0efdb [ 17.970786] RDX: 0000000000000017 RSI: 0000000000200000 RDI: 00007f1800000000 [ 17.971068] RBP: 00007f17c09ffe50 R08: 0000000000000000 R09: 00007ffd3954164f [ 17.971353] R10: 00007f1840e10348 R11: 0000000000000202 R12: ffffffffffffff80 [ 17.971709] R13: 0000000000000000 R14: 00007ffd39541550 R15: 00007f17c0200000 [ 17.972083] [ 17.972199] irq event stamp: 2353 [ 17.972372] hardirqs last enabled at (2353): [] __up_console_sem+0x5e/0x70 [ 17.972869] hardirqs last disabled at (2352): [] __up_console_sem+0x43/0x70 [ 17.973365] softirqs last enabled at (2330): [] __irq_exit_rcu+0xed/0x160 [ 17.973857] softirqs last disabled at (2323): [] __irq_exit_rcu+0xed/0x160 [ 17.974341] ---[ end trace 0000000000000000 ]--- [ 17.974614] BUG: kernel NULL pointer dereference, address: 00000000000000b8 [ 17.975012] #PF: supervisor read access in kernel mode [ 17.975314] #PF: error_code(0x0000) - not-present page [ 17.975615] PGD 103f7b067 P4D 103f7b067 PUD 106cd7067 PMD 0 [ 17.975943] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 17.976197] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Tainted: G W 6.1.0-rc4-peterx+ #46 [ 17.976712] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 17.977370] RIP: 0010:__lock_acquire+0x190/0x1fa0 [ 17.977655] Code: 98 00 00 00 41 89 46 24 81 e2 ff 1f 00 00 48 0f a3 15 e4 ba dd 02 0f 83 ff 05 00 00 48 8d 04 52 48 c1 e0 06 48 05 c0 d2 f4 83 <44> 0f b6 a0 b8 00 00 00 41 0f b7 46 20 6f [ 17.979170] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010046 [ 17.979787] RAX: 0000000000000000 RBX: fffffffffd3925a8 RCX: 0000000000000000 [ 17.980838] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff [ 17.982048] RBP: 0000000000000000 R08: ffff888105eac720 R09: ffffc90000e4fa58 [ 17.982892] R10: ffff888105eab900 R11: ffffffff83162688 R12: 0000000000000000 [ 17.983771] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001 [ 17.984815] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000 [ 17.985924] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 17.986265] CR2: 00000000000000b8 CR3: 000000010c87a005 CR4: 0000000000770ee0 [ 17.986674] PKRU: 55555554 [ 17.986832] Call Trace: [ 17.987012] [ 17.987266] ? tick_nohz_tick_stopped+0x12/0x30 [ 17.987770] lock_acquire+0xbf/0x2b0 [ 17.988118] ? follow_hugetlb_page.cold+0x75/0x5c4 [ 17.988575] ? _printk+0x48/0x4e [ 17.988889] _raw_spin_lock+0x2f/0x40 [ 17.989243] ? follow_hugetlb_page.cold+0x75/0x5c4 [ 17.989687] follow_hugetlb_page.cold+0x75/0x5c4 [ 17.990119] __get_user_pages+0xbb/0x620 [ 17.990500] faultin_vma_page_range+0x9a/0x100 [ 17.990928] madvise_vma_behavior+0x3c0/0xbd0 [ 17.991354] ? mas_prev+0x11/0x290 [ 17.991678] ? find_vma_prev+0x5e/0xa0 [ 17.992024] ? 
madvise_vma_anon_name+0x70/0x70 [ 17.992421] madvise_walk_vmas+0xa9/0x120 [ 17.992793] do_madvise.part.0+0xfa/0x270 [ 17.993166] __x64_sys_madvise+0x5a/0x70 [ 17.993539] do_syscall_64+0x37/0x90 [ 17.993879] entry_SYSCALL_64_after_hwframe+0x63/0xcd ======8<======= Resolution ========== This patchset protects all the huge_pte_offset() callers to also take the vma lock properly. Patch Layout ============ Patch 1-2: cleanup, or dependency of the follow up patches Patch 3: before fixing, document huge_pte_offset() on lock required Patch 4-8: each patch resolves one possible race condition Patch 9: introduce hugetlb_walk() to replace huge_pte_offset() Tests ===== The series is verified with the above reproducer so the race cannot trigger anymore. It also passes all hugetlb kselftests. This patch (of 9): Even though vma_offset_start() is named like that, it's not returning "the start address of the range" but rather the offset we should use to offset the vma->vm_start address. Make it return the real value of the start vaddr, and it also helps for all the callers because whenever the retval is used, it'll be ultimately added into the vma->vm_start anyway, so it's better. Link: https://lkml.kernel.org/r/20221216155100.2043537-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221216155100.2043537-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 790d2727141a..fdb16246f46e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -412,10 +412,12 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, */ static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) { + unsigned long offset = 0; + if (vma->vm_pgoff < start) - return (start - vma->vm_pgoff) << PAGE_SHIFT; - else - return 0; + offset = (start - vma->vm_pgoff) << PAGE_SHIFT; + + return vma->vm_start + offset; } static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) @@ -457,7 +459,7 @@ retry: v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + if (!hugetlb_vma_maps_page(vma, v_start, page)) continue; if (!hugetlb_vma_trylock_write(vma)) { @@ -473,8 +475,8 @@ retry: break; } - unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, - NULL, ZAP_FLAG_DROP_MARKER); + unmap_hugepage_range(vma, v_start, v_end, NULL, + ZAP_FLAG_DROP_MARKER); hugetlb_vma_unlock_write(vma); } @@ -507,10 +509,9 @@ retry: */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) - unmap_hugepage_range(vma, vma->vm_start + v_start, - v_end, NULL, - ZAP_FLAG_DROP_MARKER); + if (hugetlb_vma_maps_page(vma, v_start, page)) + unmap_hugepage_range(vma, v_start, v_end, NULL, + ZAP_FLAG_DROP_MARKER); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); hugetlb_vma_unlock_write(vma); @@ -540,8 +541,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, - NULL, zap_flags); + unmap_hugepage_range(vma, 
v_start, v_end, NULL, zap_flags); /* * Note that vma lock only exists for shared/non-private From bb373dce2c7b473023f9e69f041a22d81171b71a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:53 -0500 Subject: [PATCH 010/505] mm/hugetlb: don't wait for migration entry during follow page That's what the code does with !hugetlb pages, so we should logically do the same for hugetlb, so migration entry will also be treated as no page. This is probably also the last piece in follow_page code that may sleep, the last one should be removed in cf994dd8af27 ("mm/gup: remove FOLL_MIGRATION", 2022-11-16). Link: https://lkml.kernel.org/r/20221216155100.2043537-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed1ac2df582c..549f79668756 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6401,7 +6401,6 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; -retry: pte = huge_pte_offset(mm, haddr, huge_page_size(h)); if (!pte) return NULL; @@ -6424,16 +6423,6 @@ retry: page = NULL; goto out; } - } else { - if (is_hugetlb_entry_migration(entry)) { - spin_unlock(ptl); - __migration_entry_wait_huge(pte, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ } out: spin_unlock(ptl); From fe7d4c6d5a42f5bdc63fdfdca2cad32c8a779e23 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:54 -0500 Subject: [PATCH 011/505] mm/hugetlb: document huge_pte_offset usage huge_pte_offset() is potentially a pgtable walker, looking up pte_t* for a hugetlb address. Normally, it's always safe to walk a generic pgtable as long as we're with the mmap lock held for either read or write, because that guarantees the pgtable pages will always be valid during the process. But it's not true for hugetlbfs, especially shared: hugetlbfs can have its pgtable freed by pmd unsharing, it means that even with mmap lock held for current mm, the PMD pgtable page can still go away from under us if pmd unsharing is possible during the walk. So we have two ways to make it safe even for a shared mapping: (1) If we're with the hugetlb vma lock held for either read/write, it's okay because pmd unshare cannot happen at all. (2) If we're with the i_mmap_rwsem lock held for either read/write, it's okay because even if pmd unshare can happen, the pgtable page cannot be freed from under us. Document it. 
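For reference, a minimal sketch of option (2.1) described in the new comment (an illustrative fragment, not part of the patch; it relies on the hugetlb vma lock helpers already present in the tree):

  /* Hold the hugetlb vma lock so pmd unshare cannot free the pgtable page. */
  hugetlb_vma_lock_read(vma);
  ptep = huge_pte_offset(mm, addr & huge_page_mask(h), huge_page_size(h));
  if (ptep) {
          ptl = huge_pte_lock(h, mm, ptep);
          /* The pte is stable here; read or update it under the pgtable lock. */
          spin_unlock(ptl);
  }
  hugetlb_vma_unlock_read(vma);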
Link: https://lkml.kernel.org/r/20221216155100.2043537-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 551834cd5299..d755e2a7c0db 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -192,6 +192,38 @@ extern struct list_head huge_boot_pages; pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); +/* + * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. + * Returns the pte_t* if found, or NULL if the address is not mapped. + * + * Since this function will walk all the pgtable pages (including not only + * high-level pgtable page, but also PUD entry that can be unshared + * concurrently for VM_SHARED), the caller of this function should be + * responsible of its thread safety. One can follow this rule: + * + * (1) For private mappings: pmd unsharing is not possible, so holding the + * mmap_lock for either read or write is sufficient. Most callers + * already hold the mmap_lock, so normally, no special action is + * required. + * + * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged + * pgtable page can go away from under us! It can be done by a pmd + * unshare with a follow up munmap() on the other process), then we + * need either: + * + * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare + * won't happen upon the range (it also makes sure the pte_t we + * read is the right and stable one), or, + * + * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make + * sure even if unshare happened the racy unmap() will wait until + * i_mmap_rwsem is released. + * + * Option (2.1) is the safest, which guarantees pte stability from pmd + * sharing pov, until the vma lock released. Option (2.2) doesn't protect + * a concurrent pmd unshare, but it makes sure the pgtable page is safe to + * access. + */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); From fcd48540d188876c917a377d81cd24c100332a62 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:55 -0500 Subject: [PATCH 012/505] mm/hugetlb: move swap entry handling into vma lock when faulted In hugetlb_fault(), there used to have a special path to handle swap entry at the entrance using huge_pte_offset(). That's unsafe because huge_pte_offset() for a pmd sharable range can access freed pgtables if without any lock to protect the pgtable from being freed after pmd unshare. Here the simplest solution to make it safe is to move the swap handling to be after the vma lock being held. We may need to take the fault mutex on either migration or hwpoison entries now (also the vma lock, but that's really needed), however neither of them is hot path. Note that the vma lock cannot be released in hugetlb_fault() when the migration entry is detected, because in migration_entry_wait_huge() the pgtable page will be used again (by taking the pgtable lock), so that also need to be protected by the vma lock. 
Modify migration_entry_wait_huge() so that it must be called with vma read lock held, and properly release the lock in __migration_entry_wait_huge(). Link: https://lkml.kernel.org/r/20221216155100.2043537-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/swapops.h | 6 ++++-- mm/hugetlb.c | 37 ++++++++++++++++--------------------- mm/migrate.c | 25 +++++++++++++++++++++---- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index b982dd614572..3a451b7afcb3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); #ifdef CONFIG_HUGETLB_PAGE -extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); +extern void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); #endif /* CONFIG_HUGETLB_PAGE */ #else /* CONFIG_MIGRATION */ @@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } #ifdef CONFIG_HUGETLB_PAGE -static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } +static inline void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } #endif /* CONFIG_HUGETLB_PAGE */ static inline int is_writable_migration_entry(swp_entry_t entry) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 549f79668756..7f9db1d9f6a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5993,22 +5993,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int need_wait_lock = 0; unsigned long haddr = address & huge_page_mask(h); - ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); - if (ptep) { - /* - * Since we hold no locks, ptep could be stale. That is - * OK as we are only making decisions based on content and - * not actually modifying content here. - */ - entry = huge_ptep_get(ptep); - if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(vma, ptep); - return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON_LARGE | - VM_FAULT_SET_HINDEX(hstate_index(h)); - } - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate @@ -6023,10 +6007,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Acquire vma lock before calling huge_pte_alloc and hold * until finished with ptep. This prevents huge_pmd_unshare from * being called elsewhere and making the ptep no longer valid. - * - * ptep could have already be assigned via huge_pte_offset. That - * is OK, as huge_pte_alloc will return the same value unless - * something has changed. 
*/ hugetlb_vma_lock_read(vma); ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); @@ -6055,8 +6035,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will * properly handle it. */ - if (!pte_present(entry)) + if (!pte_present(entry)) { + if (unlikely(is_hugetlb_entry_migration(entry))) { + /* + * Release the hugetlb fault lock now, but retain + * the vma lock, because it is needed to guard the + * huge_pte_lockptr() later in + * migration_entry_wait_huge(). The vma lock will + * be released there. + */ + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + migration_entry_wait_huge(vma, ptep); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; + } /* * If we are going to COW/unshare the mapping later, we examine the diff --git a/mm/migrate.c b/mm/migrate.c index a4d3fc65085f..98de7ce2b576 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -329,24 +329,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, } #ifdef CONFIG_HUGETLB_PAGE -void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) +/* + * The vma read lock must be held upon entry. Holding that lock prevents either + * the pte or the ptl from being freed. + * + * This function will release the vma lock before returning. + */ +void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl) { pte_t pte; + hugetlb_vma_assert_locked(vma); spin_lock(ptl); pte = huge_ptep_get(ptep); - if (unlikely(!is_hugetlb_entry_migration(pte))) + if (unlikely(!is_hugetlb_entry_migration(pte))) { spin_unlock(ptl); - else + hugetlb_vma_unlock_read(vma); + } else { + /* + * If migration entry existed, safe to release vma lock + * here because the pgtable page won't be freed without the + * pgtable lock released. See comment right above pgtable + * lock release in migration_entry_wait_on_locked(). + */ + hugetlb_vma_unlock_read(vma); migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); + } } void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); - __migration_entry_wait_huge(pte, ptl); + __migration_entry_wait_huge(vma, pte, ptl); } #endif From b8da2e466000b232a6b67072bbef375061142daa Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:17 -0500 Subject: [PATCH 013/505] mm/hugetlb: make userfaultfd_huge_must_wait() safe to pmd unshare We can take the hugetlb walker lock, here taking vma lock directly. 
Link: https://lkml.kernel.org/r/20221216155217.2043700-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cc694846617a..3b1797e0448a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -391,7 +391,8 @@ static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) */ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { - struct mm_struct *mm = vmf->vma->vm_mm; + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; @@ -418,7 +419,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) */ mmap_assert_locked(mm); - ctx = vmf->vma->vm_userfaultfd_ctx.ctx; + ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -508,6 +509,15 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) blocking_state = userfaultfd_get_blocking_state(vmf->flags); + /* + * Take the vma lock now, in order to safely call + * userfaultfd_huge_must_wait() later. Since acquiring the + * (sleepable) vma lock can modify the current task state, that + * must be before explicitly calling set_current_state(). + */ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_lock_read(vma); + spin_lock_irq(&ctx->fault_pending_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland @@ -522,13 +532,15 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); - if (!is_vm_hugetlb_page(vmf->vma)) + if (!is_vm_hugetlb_page(vma)) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, + must_wait = userfaultfd_huge_must_wait(ctx, vma, vmf->address, vmf->flags, reason); + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_unlock_read(vma); mmap_read_unlock(mm); if (likely(must_wait && !READ_ONCE(ctx->released))) { From 7d049f3a03ea705522210d70b9d3e223ef86d663 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:19 -0500 Subject: [PATCH 014/505] mm/hugetlb: make hugetlb_follow_page_mask() safe to pmd unshare Since hugetlb_follow_page_mask() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently. 
Link: https://lkml.kernel.org/r/20221216155219.2043714-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7f9db1d9f6a5..807edc1410e5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6396,9 +6396,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; + hugetlb_vma_lock_read(vma); pte = huge_pte_offset(mm, haddr, huge_page_size(h)); if (!pte) - return NULL; + goto out_unlock; ptl = huge_pte_lock(h, mm, pte); entry = huge_ptep_get(pte); @@ -6421,6 +6422,8 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, } out: spin_unlock(ptl); +out_unlock: + hugetlb_vma_unlock_read(vma); return page; } From eefc7fa53608920203a1402ecf7255ecfa8bb030 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:23 -0500 Subject: [PATCH 015/505] mm/hugetlb: make follow_hugetlb_page() safe to pmd unshare Since follow_hugetlb_page() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently. Link: https://lkml.kernel.org/r/20221216155223.2043727-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 807edc1410e5..da4c37553c08 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6454,6 +6454,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } + hugetlb_vma_lock_read(vma); /* * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the @@ -6478,6 +6479,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, !hugetlbfs_pagecache_present(h, vma, vaddr)) { if (pte) spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); remainder = 0; break; } @@ -6499,6 +6501,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pte) spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); + if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; else if (unshare) @@ -6561,6 +6565,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, remainder -= pages_per_huge_page(h); i += pages_per_huge_page(h); spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); continue; } @@ -6590,6 +6595,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, flags))) { spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); remainder = 0; err = -ENOMEM; break; @@ -6601,6 +6607,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, i += refs; spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); } *nr_pages = remainder; /* From dd361e5033cf36c51acab996ea17748b81cedb38 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:26 -0500 Subject: [PATCH 016/505] mm/hugetlb: make walk_hugetlb_range() safe to pmd unshare Since walk_hugetlb_range() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently. 
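For illustration only (not part of the patch), a hugetlb_entry callback
written against the locking rule documented below could look like this
sketch; the callback name is hypothetical:

        static int sketch_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                        unsigned long addr, unsigned long next,
                                        struct mm_walk *walk)
        {
                /* Safe: walk_hugetlb_range() holds the vma lock for us. */
                pte_t entry = huge_ptep_get(pte);

                if (huge_pte_none(entry))
                        return 0;

                /*
                 * If this hook had to sleep (e.g. to handle a fault), it
                 * would need to hugetlb_vma_unlock_read(walk->vma) first,
                 * stop touching pte/ptl, and retake the lock before
                 * returning -- the pattern hmm_vma_walk_hugetlb_entry()
                 * follows below.
                 */
                return 0;
        }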
Link: https://lkml.kernel.org/r/20221216155226.2043738-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 11 ++++++++++- mm/hmm.c | 15 ++++++++++++++- mm/pagewalk.c | 2 ++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 959f52e5867d..27a6df448ee5 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -21,7 +21,16 @@ struct mm_walk; * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. * Any folded depths (where PTRS_PER_P?D is equal to 1) * are skipped. - * @hugetlb_entry: if set, called for each hugetlb entry + * @hugetlb_entry: if set, called for each hugetlb entry. This hook + * function is called with the vma lock held, in order to + * protect against a concurrent freeing of the pte_t* or + * the ptl. In some cases, the hook function needs to drop + * and retake the vma lock in order to avoid deadlocks + * while calling other functions. In such cases the hook + * function must either refrain from accessing the pte or + * ptl after dropping the vma lock, or else revalidate + * those items after re-acquiring the vma lock and before + * accessing them. * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means * "do page table walk over the current vma", returning diff --git a/mm/hmm.c b/mm/hmm.c index 601a99ce3c84..6a151c09de5e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -492,8 +492,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); if (required_fault) { + int ret; + spin_unlock(ptl); - return hmm_vma_fault(addr, end, required_fault, walk); + hugetlb_vma_unlock_read(vma); + /* + * Avoid deadlock: drop the vma lock before calling + * hmm_vma_fault(), which will itself potentially take and + * drop the vma lock. This is also correct from a + * protection point of view, because there is no further + * use here of either pte or ptl after dropping the vma + * lock. + */ + ret = hmm_vma_fault(addr, end, required_fault, walk); + hugetlb_vma_lock_read(vma); + return ret; } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 7f1c9b274906..d98564a7be57 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -302,6 +302,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, const struct mm_walk_ops *ops = walk->ops; int err = 0; + hugetlb_vma_lock_read(vma); do { next = hugetlb_entry_end(h, addr, end); pte = huge_pte_offset(walk->mm, addr & hmask, sz); @@ -314,6 +315,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, if (err) break; } while (addr = next, addr != end); + hugetlb_vma_unlock_read(vma); return err; } From 9c67a20704e763f9cb8cd262c3e45de7bd2816bc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:29 -0500 Subject: [PATCH 017/505] mm/hugetlb: introduce hugetlb_walk() huge_pte_offset() is the main walker function for hugetlb pgtables. The name is not really representing what it does, though. Instead of renaming it, introduce a wrapper function called hugetlb_walk() which will use huge_pte_offset() inside. Assert on the locks when walking the pgtable. 
Note, the vma lock assertion will be a no-op for private mappings. Document the last special case in the page_vma_mapped_walk() path where we don't need any more lock to call hugetlb_walk(). Taking vma lock there is not needed because either: (1) potential callers of hugetlb pvmw holds i_mmap_rwsem already (from one rmap_walk()), or (2) the caller will not walk a hugetlb vma at all so the hugetlb code path not reachable (e.g. in ksm or uprobe paths). It's slightly implicit for future page_vma_mapped_walk() callers on that lock requirement. But anyway, when one day this rule breaks, one will get a straightforward warning in hugetlb_walk() with lockdep, then there'll be a way out. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20221216155229.2043750-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 4 +--- fs/userfaultfd.c | 6 ++---- include/linux/hugetlb.h | 37 +++++++++++++++++++++++++++++++++++++ mm/hugetlb.c | 31 +++++++++++++------------------ mm/page_vma_mapped.c | 9 ++++++--- mm/pagewalk.c | 4 +--- 6 files changed, 60 insertions(+), 31 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fdb16246f46e..48f1a8ad2243 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, { pte_t *ptep, pte; - ptep = huge_pte_offset(vma->vm_mm, addr, - huge_page_size(hstate_vma(vma))); - + ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); if (!ptep) return false; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3b1797e0448a..15a5bf765d43 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -252,14 +252,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, unsigned long flags, unsigned long reason) { - struct mm_struct *mm = ctx->mm; pte_t *ptep, pte; bool ret = true; - mmap_assert_locked(mm); - - ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); + mmap_assert_locked(ctx->mm); + ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); if (!ptep) goto out; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d755e2a7c0db..b6b10101bea7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -2,6 +2,7 @@ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H +#include #include #include #include @@ -196,6 +197,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. * Returns the pte_t* if found, or NULL if the address is not mapped. * + * IMPORTANT: we should normally not directly call this function, instead + * this is only a common interface to implement arch-specific + * walker. Please use hugetlb_walk() instead, because that will attempt to + * verify the locking for you. 
+ * * Since this function will walk all the pgtable pages (including not only * high-level pgtable page, but also PUD entry that can be unshared * concurrently for VM_SHARED), the caller of this function should be @@ -1229,4 +1235,35 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif +static inline bool __vma_shareable_lock(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; +} + +/* + * Safe version of huge_pte_offset() to check the locks. See comments + * above huge_pte_offset(). + */ +static inline pte_t * +hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) +{ +#if defined(CONFIG_HUGETLB_PAGE) && \ + defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + /* + * If pmd sharing possible, locking needed to safely walk the + * hugetlb pgtables. More information can be found at the comment + * above huge_pte_offset() in the same file. + * + * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. + */ + if (__vma_shareable_lock(vma)) + WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && + !lockdep_is_held( + &vma->vm_file->f_mapping->i_mmap_rwsem)); +#endif + return huge_pte_offset(vma->vm_mm, addr, sz); +} + #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da4c37553c08..0e5441d6890a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -260,11 +260,6 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) /* * hugetlb vma_lock helper routines */ -static bool __vma_shareable_lock(struct vm_area_struct *vma) -{ - return vma->vm_flags & VM_MAYSHARE && vma->vm_private_data; -} - void hugetlb_vma_lock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { @@ -4980,7 +4975,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } else { /* * For shared mappings the vma lock must be held before - * calling huge_pte_offset in the src vma. Otherwise, the + * calling hugetlb_walk() in the src vma. Otherwise, the * returned ptep could go away if part of a shared pmd and * another thread calls huge_pmd_unshare. 
*/ @@ -4990,7 +4985,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr, sz); + src_pte = hugetlb_walk(src_vma, addr, sz); if (!src_pte) { addr |= last_addr_mask; continue; @@ -5197,7 +5192,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { - src_pte = huge_pte_offset(mm, old_addr, sz); + src_pte = hugetlb_walk(vma, old_addr, sz); if (!src_pte) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; @@ -5260,7 +5255,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { - ptep = huge_pte_offset(mm, address, sz); + ptep = hugetlb_walk(vma, address, sz); if (!ptep) { address |= last_addr_mask; continue; @@ -5573,7 +5568,7 @@ retry_avoidcopy: mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); spin_lock(ptl); - ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); + ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -5611,7 +5606,7 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); + ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); @@ -6397,7 +6392,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, return NULL; hugetlb_vma_lock_read(vma); - pte = huge_pte_offset(mm, haddr, huge_page_size(h)); + pte = hugetlb_walk(vma, haddr, huge_page_size(h)); if (!pte) goto out_unlock; @@ -6462,8 +6457,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * * Note that page table lock is not held when pte is null. 
*/ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), - huge_page_size(h)); + pte = hugetlb_walk(vma, vaddr & huge_page_mask(h), + huge_page_size(h)); if (pte) ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -6654,7 +6649,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address, psize); + ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { address |= last_addr_mask; @@ -7064,8 +7059,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr, - vma_mmu_pagesize(svma)); + spte = hugetlb_walk(svma, saddr, + vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; @@ -7377,7 +7372,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { - ptep = huge_pte_offset(mm, address, sz); + ptep = hugetlb_walk(vma, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 93e13fc17d3c..4e448cfbc6ef 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -168,9 +168,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) /* The only possible mapping was handled on last iteration */ if (pvmw->pte) return not_found(pvmw); - - /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address, size); + /* + * All callers that get here will already hold the + * i_mmap_rwsem. Therefore, no additional locks need to be + * taken before calling hugetlb_walk(). + */ + pvmw->pte = hugetlb_walk(vma, pvmw->address, size); if (!pvmw->pte) return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d98564a7be57..cb23f8a15c13 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -305,13 +305,11 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, hugetlb_vma_lock_read(vma); do { next = hugetlb_entry_end(h, addr, end); - pte = huge_pte_offset(walk->mm, addr & hmask, sz); - + pte = hugetlb_walk(vma, addr & hmask, sz); if (pte) err = ops->hugetlb_entry(pte, hmask, addr, next, walk); else if (ops->pte_hole) err = ops->pte_hole(addr, next, -1, walk); - if (err) break; } while (addr = next, addr != end); From d685c668b0695dff927c85e27ef27d4f404f16a3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:51 +0000 Subject: [PATCH 018/505] buffer: add b_folio as an alias of b_page Patch series "Start converting buffer_heads to use folios". I was hoping that filesystems would convert from buffer_heads to iomap, but that's not happening particularly quickly. So the buffer_head infrastructure needs to be converted from being page-based to being folio-based. This patch (of 12): Buffer heads point to the allocation (ie the folio), not the page. This is currently the same thing for all filesystems that use buffer heads, so this is a safe transitional step. 
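A small illustration (not part of the patch, helper name made up) of why
the aliasing is a safe transitional step: buffers are attached to the
head page of the allocation, and a folio shares its address with its
head page, so both union members refer to the same object:

        static inline void bh_folio_alias_check(struct buffer_head *bh)
        {
                /* Holds for every current buffer_head user. */
                WARN_ON_ONCE(bh->b_folio != page_folio(bh->b_page));
        }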
Link: https://lkml.kernel.org/r/20221215214402.3522366-1-willy@infradead.org Link: https://lkml.kernel.org/r/20221215214402.3522366-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 33fa5e94aa80..8f14dca5fed7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -61,7 +61,10 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); struct buffer_head { unsigned long b_state; /* buffer state bitmap (see above) */ struct buffer_head *b_this_page;/* circular list of page's buffers */ - struct page *b_page; /* the page this bh is mapped to */ + union { + struct page *b_page; /* the page this bh is mapped to */ + struct folio *b_folio; /* the folio this bh is mapped to */ + }; sector_t b_blocknr; /* start block number */ size_t b_size; /* size of mapping */ From abc8a8a2c7dc7b557619befa8fb29be60ed481bc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:52 +0000 Subject: [PATCH 019/505] buffer: replace obvious uses of b_page with b_folio These cases just check if it's NULL, or use b_page to get to the page's address space. They are assumptions that b_page never points to a tail page. Link: https://lkml.kernel.org/r/20221215214402.3522366-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index d9c6d1fbb6dd..e1055fe0b366 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -321,7 +321,7 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) { /* Decrypt if needed */ if (uptodate && - fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) { + fscrypt_inode_uses_fs_layer_crypto(bh->b_folio->mapping->host)) { struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); if (ctx) { @@ -570,7 +570,7 @@ void write_boundary_block(struct block_device *bdev, void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = bh->b_folio->mapping; mark_buffer_dirty(bh); if (!mapping->private_data) { @@ -1073,7 +1073,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * and then attach the address_space's inode to its superblock's dirty * inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock, * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1117,8 +1117,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh) set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? 
*/ - if (bh->b_page && bh->b_page->mapping) - mapping_set_error(bh->b_page->mapping, -EIO); + if (bh->b_folio && bh->b_folio->mapping) + mapping_set_error(bh->b_folio->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); rcu_read_lock(); @@ -1154,7 +1154,7 @@ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); if (bh->b_assoc_map) { - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = bh->b_folio->mapping; spin_lock(&buffer_mapping->private_lock); list_del_init(&bh->b_assoc_buffers); From 03c5f331234c5798965fa654783dbed1c792c7f4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:53 +0000 Subject: [PATCH 020/505] buffer: use b_folio in touch_buffer() Removes a call to compound_head() in this path. Link: https://lkml.kernel.org/r/20221215214402.3522366-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index e1055fe0b366..8a02fdaeec9a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -60,7 +60,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, inline void touch_buffer(struct buffer_head *bh) { trace_block_touch_buffer(bh); - mark_page_accessed(bh->b_page); + folio_mark_accessed(bh->b_folio); } EXPORT_SYMBOL(touch_buffer); From 2e2dba15d107491e972041acb2d0b7bd73b92bc0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:54 +0000 Subject: [PATCH 021/505] buffer: use b_folio in end_buffer_async_read() Removes a call to compound_head() in SetPageError(), saving 76 bytes of text. Link: https://lkml.kernel.org/r/20221215214402.3522366-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 8a02fdaeec9a..5bdcc040eca3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -246,18 +246,18 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; - struct page *page; - int page_uptodate = 1; + struct folio *folio; + int folio_uptodate = 1; BUG_ON(!buffer_async_read(bh)); - page = bh->b_page; + folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); buffer_io_error(bh, ", async page read"); - SetPageError(page); + folio_set_error(folio); } /* @@ -265,14 +265,14 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * two buffer heads end IO at almost the same time and both * decide that the page is now completely done. */ - first = page_buffers(page); + first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; do { if (!buffer_uptodate(tmp)) - page_uptodate = 0; + folio_uptodate = 0; if (buffer_async_read(tmp)) { BUG_ON(!buffer_locked(tmp)); goto still_busy; @@ -285,9 +285,9 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * If all of the buffers are uptodate then we can set the page * uptodate. 
*/ - if (page_uptodate) - SetPageUptodate(page); - unlock_page(page); + if (folio_uptodate) + folio_mark_uptodate(folio); + folio_unlock(folio); return; still_busy: From 743ed81ec11147b42085a73a2e408964674291a9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:55 +0000 Subject: [PATCH 022/505] buffer: use b_folio in end_buffer_async_write() Save 76 bytes from avoiding the call to compound_head() in SetPageError(). Also avoid the call to compound_head() in end_page_writeback(). Link: https://lkml.kernel.org/r/20221215214402.3522366-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 5bdcc040eca3..c44ca40530c3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -344,21 +344,21 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; - struct page *page; + struct folio *folio; BUG_ON(!buffer_async_write(bh)); - page = bh->b_page; + folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); - SetPageError(page); + folio_set_error(folio); } - first = page_buffers(page); + first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_write(bh); @@ -372,7 +372,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) tmp = tmp->b_this_page; } spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - end_page_writeback(page); + folio_end_writeback(folio); return; still_busy: From c10d91194d5d630a0befc7bc116aba3cfda8a226 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:56 +0000 Subject: [PATCH 023/505] page_io: remove buffer_head include page_io never uses buffer_heads to do I/O. Link: https://lkml.kernel.org/r/20221215214402.3522366-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- mm/page_io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page_io.c b/mm/page_io.c index 3a5f921b932e..905d9fcc0c96 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include From cf1d3417e634b2b2dd162a7e193af9a9d700431b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:57 +0000 Subject: [PATCH 024/505] buffer: use b_folio in mark_buffer_dirty() Removes about four calls to compound_head(). Two of them are inline which removes 132 bytes from the kernel text. 
Link: https://lkml.kernel.org/r/20221215214402.3522366-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index c44ca40530c3..7e42d67bcaad 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1095,16 +1095,16 @@ void mark_buffer_dirty(struct buffer_head *bh) } if (!test_set_buffer_dirty(bh)) { - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; struct address_space *mapping = NULL; - lock_page_memcg(page); - if (!TestSetPageDirty(page)) { - mapping = page_mapping(page); + folio_memcg_lock(folio); + if (!folio_test_set_dirty(folio)) { + mapping = folio->mapping; if (mapping) - __set_page_dirty(page, mapping, 0); + __folio_mark_dirty(folio, mapping, 0); } - unlock_page_memcg(page); + folio_memcg_unlock(folio); if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } From 11551cf15ecc17c4db4456538fd73d284ffcf20b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:58 +0000 Subject: [PATCH 025/505] gfs2: replace obvious uses of b_page with b_folio These places just use b_page to get to the buffer's address_space. Link: https://lkml.kernel.org/r/20221215214402.3522366-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/gfs2/glops.c | 2 +- fs/gfs2/log.c | 2 +- fs/gfs2/meta_io.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d78b61ecc1cd..081422644ec5 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -39,7 +39,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page " "state 0x%lx\n", bh, (unsigned long long)bh->b_blocknr, bh->b_state, - bh->b_page->mapping, bh->b_page->flags); + bh->b_folio->mapping, bh->b_folio->flags); fs_err(sdp, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 723639376ae2..1fcc829f02ab 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -127,7 +127,7 @@ __acquires(&sdp->sd_ail_lock) continue; gl = bd->bd_gl; list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); - mapping = bh->b_page->mapping; + mapping = bh->b_folio->mapping; if (!mapping) continue; spin_unlock(&sdp->sd_ail_lock); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 3c41b864ee5b..924361fa510b 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -334,7 +334,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) void gfs2_remove_from_journal(struct buffer_head *bh, int meta) { - struct address_space *mapping = bh->b_page->mapping; + struct address_space *mapping = bh->b_folio->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; struct gfs2_trans *tr = current->journal_info; From 0d22fe2f039e971c2d7cc97d19ce48d8bdae253c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:59 +0000 Subject: [PATCH 026/505] jbd2: replace obvious uses of b_page with b_folio These places just use b_page to get to the buffer's address_space or have already been converted to folio. 
Link: https://lkml.kernel.org/r/20221215214402.3522366-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/jbd2/commit.c | 8 ++------ fs/jbd2/journal.c | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4810438b7856..96a1ebc6342d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -63,16 +63,12 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) static void release_buffer_page(struct buffer_head *bh) { struct folio *folio; - struct page *page; if (buffer_dirty(bh)) goto nope; if (atomic_read(&bh->b_count) != 1) goto nope; - page = bh->b_page; - if (!page) - goto nope; - folio = page_folio(page); + folio = bh->b_folio; if (folio->mapping) goto nope; @@ -1040,7 +1036,7 @@ restart_loop: * already detached from the mapping and buffers cannot * get reused. */ - mapping = READ_ONCE(bh->b_page->mapping); + mapping = READ_ONCE(bh->b_folio->mapping); if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { clear_buffer_mapped(bh); clear_buffer_new(bh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2696f43e7239..4095fe91457f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2938,7 +2938,7 @@ repeat: } else { J_ASSERT_BH(bh, (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); + (bh->b_folio && bh->b_folio->mapping)); if (!new_jh) { jbd_unlock_bh_journal_head(bh); From 6ad4cd7f36000ae145373b68ac61f15a1793b054 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:44:00 +0000 Subject: [PATCH 027/505] nilfs2: replace obvious uses of b_page with b_folio These places just use b_page to get to the buffer's address_space or the index of the page the buffer is in. 
Link: https://lkml.kernel.org/r/20221215214402.3522366-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/nilfs2/btnode.c | 2 +- fs/nilfs2/btree.c | 2 +- fs/nilfs2/gcinode.c | 2 +- fs/nilfs2/mdt.c | 4 ++-- fs/nilfs2/segment.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index e74fda212620..e956f886a1a1 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -188,7 +188,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, struct page *opage = obh->b_page; lock_page(opage); retry: - /* BUG_ON(oldkey != obh->b_page->index); */ + /* BUG_ON(oldkey != obh->b_folio->index); */ if (unlikely(oldkey != opage->index)) NILFS_PAGE_BUG(opage, "invalid oldkey %lld (newkey=%lld)", diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 40ce92a332fe..b5f997e5e670 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -398,7 +398,7 @@ int nilfs_btree_broken_node_block(struct buffer_head *bh) if (buffer_nilfs_checked(bh)) return 0; - inode = bh->b_page->mapping->host; + inode = bh->b_folio->mapping->host; ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, bh->b_size, inode, bh->b_blocknr); if (likely(!ret)) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index b0d22ff24b67..48fe71d309cb 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -140,7 +140,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) { wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - struct inode *inode = bh->b_page->mapping->host; + struct inode *inode = bh->b_folio->mapping->host; nilfs_err(inode->i_sb, "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)", diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index cbf4fa60eea2..19c8158605ed 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -563,7 +563,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int blkbits = inode->i_blkbits; - page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index); + page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index); if (!page) return -ENOMEM; @@ -595,7 +595,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int n; - page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index); + page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index); if (page) { if (page_has_buffers(page)) { n = bh_offset(bh) >> inode->i_blkbits; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 76c3bd88b858..f7a14ed12a66 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1581,7 +1581,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); - inode = bh->b_page->mapping->host; + inode = bh->b_folio->mapping->host; if (mode == SC_LSEG_DSYNC) sc_op = &nilfs_sc_dsync_ops; From ac55e78d9e44a5374f5726b94e55f5a7b5d51fb9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:44:01 +0000 Subject: [PATCH 028/505] reiserfs: replace obvious uses of b_page with b_folio These places just use b_page to get to the buffer's address_space or call page_folio() on b_page to get a folio. 
Link: https://lkml.kernel.org/r/20221215214402.3522366-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/reiserfs/journal.c | 4 ++-- fs/reiserfs/tail_conversion.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9f62da7471c9..9ce4ec296b74 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -601,7 +601,7 @@ static int journal_list_still_alive(struct super_block *s, */ static void release_buffer_page(struct buffer_head *bh) { - struct folio *folio = page_folio(bh->b_page); + struct folio *folio = bh->b_folio; if (!folio->mapping && folio_trylock(folio)) { folio_get(folio); put_bh(bh); @@ -866,7 +866,7 @@ loop_next: * will ever write the buffer. We're safe if we write the * page one last time after freeing the journal header. */ - if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { + if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) { spin_unlock(lock); write_dirty_buffer(bh, 0); spin_lock(lock); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index b0ae088dffc7..2cec61af2a9e 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -177,7 +177,7 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) * BUG() on attempt to write not mapped buffer */ if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { - struct inode *inode = bh->b_page->mapping->host; + struct inode *inode = bh->b_folio->mapping->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); spin_lock(&j->j_dirty_buffers_lock); list_del_init(&bh->b_assoc_buffers); From a5fd8390d2b2db15fd043b8bd571b536101222c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:44:02 +0000 Subject: [PATCH 029/505] mpage: use b_folio in do_mpage_readpage() Remove this conversion of a folio back to a page. Link: https://lkml.kernel.org/r/20221215214402.3522366-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- fs/mpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/mpage.c b/fs/mpage.c index 0f8ae954a579..db59cbf6affc 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -198,7 +198,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) /* * Then do more get_blocks calls until we are done with this folio. */ - map_bh->b_page = &folio->page; + map_bh->b_folio = folio; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0; From e976936cfc66376fc740a3a476365273384ce1ce Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 16 Dec 2022 14:45:37 -0500 Subject: [PATCH 030/505] mm/mempolicy: do not duplicate policy if it is not applicable for set_mempolicy_home_node set_mempolicy_home_node tries to duplicate a memory policy before checking it whether it is applicable for the operation. There is no real reason for doing that and it might actually be a pointless memory allocation and deallocation exercise for MPOL_INTERLEAVE. Not a big problem but we can do better. Simply check the policy before acting on it. 
Link: https://lkml.kernel.org/r/20221216194537.238047-2-mathieu.desnoyers@efficios.com Signed-off-by: Michal Hocko Reviewed-by: Mathieu Desnoyers Signed-off-by: Mathieu Desnoyers Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Signed-off-by: Andrew Morton --- mm/mempolicy.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c8a712282f..becf41e10076 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1489,7 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - struct mempolicy *new; + struct mempolicy *new, *old; unsigned long vmstart; unsigned long vmend; unsigned long end; @@ -1521,31 +1521,27 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le return 0; mmap_write_lock(mm); for_each_vma_range(vmi, vma, end) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); - new = mpol_dup(vma_policy(vma)); - if (IS_ERR(new)) { - err = PTR_ERR(new); - break; - } - /* - * Only update home node if there is an existing vma policy - */ - if (!new) - continue; - /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. */ - if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { - mpol_put(new); + old = vma_policy(vma); + if (!old) + continue; + if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } + new = mpol_dup(old); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } new->home_node = home_node; + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); err = mbind_range(mm, vmstart, vmend, new); mpol_put(new); if (err) From 6b1ead5985bf73b7dd4453f5cd3b8690a9c52cd5 Mon Sep 17 00:00:00 2001 From: Qinglin Pan Date: Mon, 12 Dec 2022 13:56:57 +0800 Subject: [PATCH 031/505] lib/test_vmalloc.c: add parameter use_huge for fix_size_alloc_test Add a parameter `use_huge' for fix_size_alloc_test(), which can be used to test allocation vie vmalloc_huge for both functionality and performance. Link: https://lkml.kernel.org/r/20221212055657.698420-1-panqinglin2020@iscas.ac.cn Signed-off-by: Qinglin Pan Cc: "Uladzislau Rezki (Sony)" Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index f90d2c27675b..de4ee0d50906 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -38,6 +38,9 @@ __param(int, test_loop_count, 1000000, __param(int, nr_pages, 0, "Set number of pages for fix_size_alloc_test(default: 1)"); +__param(bool, use_huge, false, + "Use vmalloc_huge in fix_size_alloc_test"); + __param(int, run_test_mask, INT_MAX, "Set tests specified in the mask.\n\n" "\t\tid: 1, name: fix_size_alloc_test\n" @@ -264,7 +267,10 @@ static int fix_size_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - ptr = vmalloc((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE); + if (use_huge) + ptr = vmalloc_huge((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE, GFP_KERNEL); + else + ptr = vmalloc((nr_pages > 0 ? 
nr_pages:1) * PAGE_SIZE); if (!ptr) return -1; From cb6c33d4dc09a8fddda1867708956c27615775f4 Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Thu, 8 Dec 2022 22:21:30 +0800 Subject: [PATCH 032/505] cma: tracing: print alloc result in trace_cma_alloc_finish The result of the allocation attempt is not printed in trace_cma_alloc_finish, but it's important to do it so we can set filters to catch specific errors on allocation or to trigger some operations on specific errors. We have printed the result in log, but the log is conditional and could not be filtered by tracing events. It introduces little overhead to print this result. The result of allocation is named `errorno' in the trace. Link: https://lkml.kernel.org/r/20221208142130.1501195-1-haowenchao@huawei.com Signed-off-by: Wenchao Hao Cc: Masami Hiramatsu (Google) Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/trace/events/cma.h | 32 +++++++++++++++++++++++++++++--- mm/cma.c | 2 +- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 3d708dae1542..ef75ea606ab2 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -91,12 +91,38 @@ TRACE_EVENT(cma_alloc_start, __entry->align) ); -DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, +TRACE_EVENT(cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned long count, unsigned int align), + unsigned long count, unsigned int align, int errorno), - TP_ARGS(name, pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align, errorno), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned long, pfn) + __field(const struct page *, page) + __field(unsigned long, count) + __field(unsigned int, align) + __field(int, errorno) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->pfn = pfn; + __entry->page = page; + __entry->count = count; + __entry->align = align; + __entry->errorno = errorno; + ), + + TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u errorno=%d", + __get_str(name), + __entry->pfn, + __entry->page, + __entry->count, + __entry->align, + __entry->errorno) ); DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, diff --git a/mm/cma.c b/mm/cma.c index 4a978e09547a..a75b17b03b66 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -491,7 +491,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, start = bitmap_no + mask + 1; } - trace_cma_alloc_finish(cma->name, pfn, page, count, align); + trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret); /* * CMA can allocate multiple page blocks, which results in different From fc986a38b670d9b8f6af0596e973976018314a59 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Dec 2022 10:34:30 +0800 Subject: [PATCH 033/505] mm: huge_memory: convert madvise_free_huge_pmd to use a folio Using folios instead of pages removes several calls to compound_head(), Link: https://lkml.kernel.org/r/20221207023431.151008-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Vishal Moola (Oracle) Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 867f02e6061d..3de266e0aeb2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1603,7 +1603,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, { spinlock_t *ptl; pmd_t orig_pmd; - struct page *page; + struct folio *folio; 
struct mm_struct *mm = tlb->mm; bool ret = false; @@ -1623,15 +1623,15 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, goto out; } - page = pmd_page(orig_pmd); + folio = pfn_folio(pmd_pfn(orig_pmd)); /* - * If other processes are mapping this page, we couldn't discard - * the page unless they all do MADV_FREE so let's skip the page. + * If other processes are mapping this folio, we couldn't discard + * the folio unless they all do MADV_FREE so let's skip the folio. */ - if (total_mapcount(page) != 1) + if (folio_mapcount(folio) != 1) goto out; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto out; /* @@ -1639,17 +1639,17 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * will deactivate only them. */ if (next - addr != HPAGE_PMD_SIZE) { - get_page(page); + folio_get(folio); spin_unlock(ptl); - split_huge_page(page); - unlock_page(page); - put_page(page); + split_folio(folio); + folio_unlock(folio); + folio_put(folio); goto out_unlocked; } - if (PageDirty(page)) - ClearPageDirty(page); - unlock_page(page); + if (folio_test_dirty(folio)) + folio_clear_dirty(folio); + folio_unlock(folio); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); @@ -1660,7 +1660,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } - mark_page_lazyfree(page); + mark_page_lazyfree(&folio->page); ret = true; out: spin_unlock(ptl); From 6a6fe9ebd571a4092b7d5c1f11e4e1e15d296fa5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Dec 2022 10:06:18 +0800 Subject: [PATCH 034/505] mm: swap: convert mark_page_lazyfree() to folio_mark_lazyfree() mark_page_lazyfree() and the callers are converted to use folio, this rename and make it to take in a folio argument instead of calling page_folio(). 
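As a minimal caller-side sketch of the change (illustrative, not taken from
this patch): a caller that still only has a struct page now does the
page-to-folio conversion explicitly, instead of relying on the helper to do
it internally:

	/* before: the helper called page_folio() internally */
	mark_page_lazyfree(page);

	/* after: pass a folio; convert explicitly when only a page is at hand */
	folio_mark_lazyfree(page_folio(page));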
Link: https://lkml.kernel.org/r/20221209020618.190306-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Vishal Moola (Oracle) Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- mm/huge_memory.c | 2 +- mm/madvise.c | 2 +- mm/swap.c | 12 +++++------- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2787b84eaf12..93f1cebd8545 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -402,7 +402,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); extern void deactivate_page(struct page *page); -extern void mark_page_lazyfree(struct page *page); +void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); extern void lru_cache_add_inactive_or_unevictable(struct page *page, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3de266e0aeb2..266c4b557946 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1660,7 +1660,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } - mark_page_lazyfree(&folio->page); + folio_mark_lazyfree(folio); ret = true; out: spin_unlock(ptl); diff --git a/mm/madvise.c b/mm/madvise.c index b6ea204d4e23..479d9a32e44a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -728,7 +728,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } - mark_page_lazyfree(&folio->page); + folio_mark_lazyfree(folio); } out: if (nr_swap) { diff --git a/mm/swap.c b/mm/swap.c index 70e2063ef43a..5e5eba186930 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -757,16 +757,14 @@ void deactivate_page(struct page *page) } /** - * mark_page_lazyfree - make an anon page lazyfree - * @page: page to deactivate + * folio_mark_lazyfree - make an anon folio lazyfree + * @folio: folio to deactivate * - * mark_page_lazyfree() moves @page to the inactive file list. - * This is done to accelerate the reclaim of @page. + * folio_mark_lazyfree() moves @folio to the inactive file list. + * This is done to accelerate the reclaim of @folio. */ -void mark_page_lazyfree(struct page *page) +void folio_mark_lazyfree(struct folio *folio) { - struct folio *folio = page_folio(page); - if (folio_test_lru(folio) && folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { From c5094ec79cbe487983e3a96548a7eb1c1c82c727 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 16 Dec 2022 14:45:07 -0800 Subject: [PATCH 035/505] hugetlb: initialize variable to avoid compiler warning With the gcc 'maybe-uninitialized' warning enabled, gcc will produce: mm/hugetlb.c:6896:20: warning: `chg' may be used uninitialized This is a false positive, but may be difficult for the compiler to determine. maybe-uninitialized is disabled by default, but this gets flagged as a 0-DAY build regression. Initialize the variable to silence the warning. 
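As a standalone illustration of the pattern (assumed, simplified code, not
the hugetlb function itself; in the real function the two conditions are
separated by calls the compiler cannot see through, which is why it emits
the false positive):

	long example(bool new_mapping)
	{
		/* initialized only to silence -Wmaybe-uninitialized */
		long chg = -1;

		if (new_mapping)
			chg = 10;
		/* ... intervening code gcc cannot fully analyze ... */
		if (new_mapping)
			return chg;	/* only reached when chg was assigned */
		return 0;
	}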
Link: https://lkml.kernel.org/r/20221216224507.106789-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0e5441d6890a..a82f41024167 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6760,7 +6760,7 @@ bool hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg, add = -1; + long chg = -1, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; From 4e0cf05f60590c4119c2eeacf136d1b832978370 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 Dec 2022 18:13:39 +0100 Subject: [PATCH 036/505] mm: memcontrol: skip moving non-present pages that are mapped elsewhere Patch series "mm: push down lock_page_memcg()", v2. This patch (of 3): During charge moving, the pte lock and the page lock cover nearly all cases of stabilizing page_mapped(). The only exception is when we're looking at a non-present pte and find a page in the page cache or in the swapcache: if the page is mapped elsewhere, it can become unmapped outside of our control. For this reason, rmap needs lock_page_memcg(). We don't like cgroup-specific locks in generic MM code - especially in performance-critical MM code - and for a legacy feature that's unlikely to have many users left - if any. So remove the exception. Arguably that's better semantics anyway: the page is shared, and another process seems to be the more active user. Once we stop moving such pages, rmap doesn't need lock_page_memcg() anymore. The next patch will remove it. Link: https://lkml.kernel.org/r/20221206171340.139790-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20221206171340.139790-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 52 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab457f0394ab..a698a2b6523b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5692,7 +5692,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * - * The caller must make sure the page is not on LRU (isolate_page() is useful.) + * The page must be locked and not on the LRU. * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. @@ -5709,20 +5709,13 @@ static int mem_cgroup_move_account(struct page *page, int nid, ret; VM_BUG_ON(from == to); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON(compound && !folio_test_large(folio)); - /* - * Prevent mem_cgroup_migrate() from looking at - * page's memory cgroup of its source page while we change it. 
- */ - ret = -EBUSY; - if (!folio_trylock(folio)) - goto out; - ret = -EINVAL; if (folio_memcg(folio) != from) - goto out_unlock; + goto out; pgdat = folio_pgdat(folio); from_vec = mem_cgroup_lruvec(from, pgdat); @@ -5809,8 +5802,6 @@ static int mem_cgroup_move_account(struct page *page, mem_cgroup_charge_statistics(from, -nr_pages); memcg_check_events(from, nid); local_irq_enable(); -out_unlock: - folio_unlock(folio); out: return ret; } @@ -5859,6 +5850,29 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); + if (target && page) { + if (!trylock_page(page)) { + put_page(page); + return ret; + } + /* + * page_mapped() must be stable during the move. This + * pte is locked, so if it's present, the page cannot + * become unmapped. If it isn't, we have only partial + * control over the mapped state: the page lock will + * prevent new faults against pagecache and swapcache, + * so an unmapped page cannot become mapped. However, + * if the page is already mapped elsewhere, it can + * unmap, and there is nothing we can do about it. + * Alas, skip moving the page in this case. + */ + if (!pte_present(ptent) && page_mapped(page)) { + unlock_page(page); + put_page(page); + return ret; + } + } + if (!page && !ent.val) return ret; if (page) { @@ -5875,8 +5889,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (target) target->page = page; } - if (!ret || !target) + if (!ret || !target) { + if (target) + unlock_page(page); put_page(page); + } } /* * There is a swap entry and a page doesn't exist or isn't charged. @@ -5916,6 +5933,10 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, ret = MC_TARGET_PAGE; if (target) { get_page(page); + if (!trylock_page(page)) { + put_page(page); + return MC_TARGET_NONE; + } target->page = page; } } @@ -6154,6 +6175,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } putback_lru_page(page); } + unlock_page(page); put_page(page); } else if (target_type == MC_TARGET_DEVICE) { page = target.page; @@ -6162,6 +6184,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } + unlock_page(page); put_page(page); } spin_unlock(ptl); @@ -6204,7 +6227,8 @@ retry: } if (!device) putback_lru_page(page); -put: /* get_mctgt_type() gets the page */ +put: /* get_mctgt_type() gets & locks the page */ + unlock_page(page); put_page(page); break; case MC_TARGET_SWAP: From c7c3dec1c9db9746912af2bbb5d6a2dd9f152d20 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 Dec 2022 18:13:40 +0100 Subject: [PATCH 037/505] mm: rmap: remove lock_page_memcg() The previous patch made sure charge moving only touches pages for which page_mapped() is stable. lock_page_memcg() is no longer needed. Link: https://lkml.kernel.org/r/20221206171340.139790-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hugh Dickins Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/rmap.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index b616870a09be..32e48b1c5847 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1222,9 +1222,6 @@ void page_add_anon_rmap(struct page *page, bool compound = flags & RMAP_COMPOUND; bool first = true; - if (unlikely(PageKsm(page))) - lock_page_memcg(page); - /* Is page being mapped by PTE? Is this its first map to be added? 
*/ if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); @@ -1262,15 +1259,14 @@ void page_add_anon_rmap(struct page *page, if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); - if (unlikely(PageKsm(page))) - unlock_page_memcg(page); - - /* address might be in next vma when migration races vma_adjust */ - else if (first) - __page_set_anon_rmap(page, vma, address, - !!(flags & RMAP_EXCLUSIVE)); - else - __page_check_anon_rmap(page, vma, address); + if (likely(!PageKsm(page))) { + /* address might be in next vma when migration races vma_adjust */ + if (first) + __page_set_anon_rmap(page, vma, address, + !!(flags & RMAP_EXCLUSIVE)); + else + __page_check_anon_rmap(page, vma, address); + } mlock_vma_page(page, vma, compound); } @@ -1329,7 +1325,6 @@ void page_add_file_rmap(struct page *page, bool first; VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); - lock_page_memcg(page); /* Is page being mapped by PTE? Is this its first map to be added? */ if (likely(!compound)) { @@ -1365,7 +1360,6 @@ void page_add_file_rmap(struct page *page, NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); - unlock_page_memcg(page); mlock_vma_page(page, vma, compound); } @@ -1394,8 +1388,6 @@ void page_remove_rmap(struct page *page, return; } - lock_page_memcg(page); - /* Is page being unmapped by PTE? Is this its last map to be removed? */ if (likely(!compound)) { last = atomic_add_negative(-1, &page->_mapcount); @@ -1451,8 +1443,6 @@ void page_remove_rmap(struct page *page, * and remember that it's only reliable while mapped. */ - unlock_page_memcg(page); - munlock_vma_page(page, vma, compound); } From da34a8484d162585e22ed8c1e4114aa2f60e3567 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 7 Dec 2022 14:00:39 +0100 Subject: [PATCH 038/505] mm: memcontrol: deprecate charge moving Charge moving mode in cgroup1 allows memory to follow tasks as they migrate between cgroups. This is, and always has been, a questionable thing to do - for several reasons. First, it's expensive. Pages need to be identified, locked and isolated from various MM operations, and reassigned, one by one. Second, it's unreliable. Once pages are charged to a cgroup, there isn't always a clear owner task anymore. Cache isn't moved at all, for example. Mapped memory is moved - but if trylocking or isolating a page fails, it's arbitrarily left behind. Frequent moving between domains may leave a task's memory scattered all over the place. Third, it isn't really needed. Launcher tasks can kick off workload tasks directly in their target cgroup. Using dedicated per-workload groups allows fine-grained policy adjustments - no need to move tasks and their physical pages between control domains. The feature was never forward-ported to cgroup2, and it hasn't been missed. Despite it being a niche usecase, the maintenance overhead of supporting it is enormous. Because pages are moved while they are live and subject to various MM operations, the synchronization rules are complicated. There are lock_page_memcg() in MM and FS code, which non-cgroup people don't understand. In some cases we've been able to shift code and cgroup API calls around such that we can rely on native locking as much as possible. But that's fragile, and sometimes we need to hold MM locks for longer than we otherwise would (pte lock e.g.). Mark the feature deprecated. Hopefully we can remove it soon. 
And backport into -stable kernels so that people who develop against earlier kernels are warned about this deprecation as early as possible. [akpm@linux-foundation.org: fix memory.rst underlining] Link: https://lkml.kernel.org/r/Y5COd+qXwk/S+n8N@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 13 +++++++++++-- mm/memcontrol.c | 4 ++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 60370f2c67b9..258e45cc3b2d 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -86,6 +86,8 @@ Brief summary of control files. memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) memory.move_charge_at_immigrate set/show controls of moving charges + This knob is deprecated and shouldn't be + used. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node @@ -717,8 +719,15 @@ NOTE2: It is recommended to set the soft limit always below the hard limit, otherwise the hard limit will take precedence. -8. Move charges at task migration -================================= +8. Move charges at task migration (DEPRECATED!) +=============================================== + +THIS IS DEPRECATED! + +It's expensive and unreliable! It's better practice to launch workload +tasks directly from inside their target cgroup. Use dedicated workload +cgroups to allow fine-grained policy adjustments without having to +move physical pages between control domains. Users can move charges associated with a task along with task migration, that is, uncharge task's pages from the old cgroup and charge them to the new cgroup. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a698a2b6523b..49f67176a1a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3919,6 +3919,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); + if (val & ~MOVE_MASK) return -EINVAL; From 98def236f63c66629fb6b2d4b69cecffc5b46539 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:20 +0000 Subject: [PATCH 039/505] mm/damon/core: implement damos filter Patch series "implement DAMOS filtering for anon pages and/or specific memory cgroups" DAMOS let users do system operations in a data access pattern oriented way. The data access pattern, which is extracted by DAMON, is somewhat accurate more than what user space could know in many cases. However, in some situation, users could know something more than the kernel about the pattern or some special requirements for some types of memory or processes. For example, some users would have slow swap devices and knows latency-ciritical processes and therefore want to use DAMON-based proactive reclamation (DAMON_RECLAIM) for only non-anonymous pages of non-latency-critical processes. For such restriction, users could exclude the memory regions from the initial monitoring regions and use non-dynamic monitoring regions update monitoring operations set including fvaddr and paddr. 
They could also adjust the DAMOS target access pattern.  For dynamically
changing memory layouts and access patterns, those would not be enough.

To help such cases, add an interface, namely DAMOS filters, to the DAMON
kernel API (damon.h), which can be used to avoid the DAMOS actions being
applied to specific types of memory.  At the moment, it supports filtering
anonymous pages and/or specific memory cgroups in or out for each DAMOS
scheme.

This patchset adds the support for all DAMOS actions that the 'paddr'
monitoring operations set supports ('pageout', 'lru_prio', and
'lru_deprio'), and the functionality is exposed via the DAMON kernel API
(damon.h), the DAMON sysfs interface (/sys/kernel/mm/damon/admins/), and
DAMON_RECLAIM module parameters.

Patches Sequence
----------------

The first patch implements the DAMOS filter interface in the DAMON kernel
API.  The second patch makes the physical address space monitoring
operations set support the filters for all DAMOS actions it supports.  The
third patch adds an anonymous pages filter to DAMON_RECLAIM, and the fourth
patch documents DAMON_RECLAIM's new feature.  The fifth to seventh patches
implement DAMON sysfs files for the filters, and the eighth patch connects
those files to the DAMOS filters feature.  The ninth patch adds simple self
test cases for DAMOS filters of the sysfs interface.  Finally, the
following two patches (tenth and eleventh) document the new features and
interfaces.

This patch (of 11):

DAMOS lets users do system operations in a data access pattern oriented
way.  The data access pattern, which is extracted by DAMON, is in many
cases more accurate than what user space could know on its own.  However,
in some situations, users could know something more than the kernel about
the pattern, or have special requirements for some types of memory or
processes.  For example, some users would have slow swap devices and know
which processes are latency-critical, and therefore want to use DAMON-based
proactive reclamation (DAMON_RECLAIM) for only non-anonymous pages of
non-latency-critical processes.

For such restrictions, users could exclude the memory regions from the
initial monitoring regions and use a monitoring operations set that does
not dynamically update the monitoring regions, such as fvaddr and paddr.
They could also adjust the DAMOS target access pattern.  For dynamically
changing memory layouts and access patterns, those would not be enough.

To help such cases, add an interface, namely DAMOS filters, to the DAMON
kernel API (damon.h), which can be used to avoid the DAMOS actions being
applied to specific types of memory.  At the moment, it supports filtering
anonymous pages and/or specific memory cgroups in or out for each DAMOS
scheme.

Note that this commit adds only the interface to the DAMON kernel API.
The implementation should be made in the monitoring operations sets, and
the following commits will add that.
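For reference, a minimal sketch of how a kernel-side user of the new API
could attach a filter to a scheme (this mirrors what DAMON_RECLAIM does
later in this series; the function name is illustrative and error handling
is trimmed):

	static int example_skip_anon(struct damos *scheme)
	{
		struct damos_filter *filter;

		/* matching == true: anonymous pages will be filtered out */
		filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
		if (!filter)
			return -ENOMEM;
		damos_add_filter(scheme, filter);
		return 0;
	}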
Link: https://lkml.kernel.org/r/20221205230830.144349-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221205230830.144349-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 51 +++++++++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 39 +++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index ad15a5b88e3a..7907918ad2e0 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -8,6 +8,7 @@ #ifndef _DAMON_H_ #define _DAMON_H_ +#include #include #include #include @@ -215,6 +216,39 @@ struct damos_stat { unsigned long qt_exceeds; }; +/** + * enum damos_filter_type - Type of memory for &struct damos_filter + * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. + * @NR_DAMOS_FILTER_TYPES: Number of filter types. + */ +enum damos_filter_type { + DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_MEMCG, + NR_DAMOS_FILTER_TYPES, +}; + +/** + * struct damos_filter - DAMOS action target memory filter. + * @type: Type of the page. + * @matching: If the matching page should filtered out or in. + * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. + * @list: List head for siblings. + * + * Before applying the &damos->action to a memory region, DAMOS checks if each + * page of the region matches to this and avoid applying the action if so. + * Note that the check support is up to &struct damon_operations + * implementation. + */ +struct damos_filter { + enum damos_filter_type type; + bool matching; + union { + unsigned short memcg_id; + }; + struct list_head list; +}; + /** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. @@ -239,6 +273,7 @@ struct damos_access_pattern { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. + * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -254,6 +289,10 @@ struct damos_access_pattern { * If all schemes that registered to a &struct damon_ctx are inactive, DAMON * stops monitoring and just repeatedly checks the watermarks. * + * Before applying the &action to a memory region, &struct damon_operations + * implementation could check pages of the region and skip &action to respect + * &filters + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. 
@@ -263,6 +302,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; + struct list_head filters; struct damos_stat stat; struct list_head list; }; @@ -516,6 +556,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damon_for_each_scheme_safe(s, next, ctx) \ list_for_each_entry_safe(s, next, &(ctx)->schemes, list) +#define damos_for_each_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->filters, list) + +#define damos_for_each_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->filters, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -536,6 +582,11 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching); +void damos_add_filter(struct damos *s, struct damos_filter *f); +void damos_destroy_filter(struct damos_filter *f); + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, struct damos_quota *quota, struct damos_watermarks *wmarks); diff --git a/mm/damon/core.c b/mm/damon/core.c index ceec75b88ef9..1bf0654ae189 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -263,6 +263,40 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching) +{ + struct damos_filter *filter; + + filter = kmalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return NULL; + filter->type = type; + filter->matching = matching; + return filter; +} + +void damos_add_filter(struct damos *s, struct damos_filter *f) +{ + list_add_tail(&f->list, &s->filters); +} + +static void damos_del_filter(struct damos_filter *f) +{ + list_del(&f->list); +} + +static void damos_free_filter(struct damos_filter *f) +{ + kfree(f); +} + +void damos_destroy_filter(struct damos_filter *f) +{ + damos_del_filter(f); + damos_free_filter(f); +} + /* initialize private fields of damos_quota and return the pointer */ static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) { @@ -287,6 +321,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, return NULL; scheme->pattern = *pattern; scheme->action = action; + INIT_LIST_HEAD(&scheme->filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -315,6 +350,10 @@ static void damon_free_scheme(struct damos *s) void damon_destroy_scheme(struct damos *s) { + struct damos_filter *f, *next; + + damos_for_each_filter_safe(f, next, s) + damos_destroy_filter(f); damon_del_scheme(s); damon_free_scheme(s); } From 18250e78f9c759010d7e0008af6e9c37aeddb1e4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:21 +0000 Subject: [PATCH 040/505] mm/damon/paddr: support DAMOS filters Implement support of the DAMOS filters in the physical address space monitoring operations set, for all DAMOS actions that it supports including 'pageout', 'lru_prio', and 'lru_deprio'. 
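As a note on the in/out semantics (a restatement of __damos_pa_filter_out()
above, not new behavior; the helper below is illustrative and assumes a
single anon-type filter):

	/*
	 * matching == true:  pages of the given type are filtered out.
	 * matching == false: pages NOT of the given type are filtered out.
	 */
	static bool example_skip_page(struct damos_filter *f, struct page *page)
	{
		bool matched = PageAnon(page);

		return matched == f->matching;	/* true means "skip this page" */
	}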
Link: https://lkml.kernel.org/r/20221205230830.144349-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 71 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index e1a4315c4be6..ebd1905eed6f 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -202,7 +202,47 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static unsigned long damon_pa_pageout(struct damon_region *r) +static bool __damos_pa_filter_out(struct damos_filter *filter, + struct page *page) +{ + bool matched = false; + struct mem_cgroup *memcg; + + switch (filter->type) { + case DAMOS_FILTER_TYPE_ANON: + matched = PageAnon(page); + break; + case DAMOS_FILTER_TYPE_MEMCG: + rcu_read_lock(); + memcg = page_memcg_check(page); + if (!memcg) + matched = false; + else + matched = filter->memcg_id == mem_cgroup_id(memcg); + rcu_read_unlock(); + break; + default: + break; + } + + return matched == filter->matching; +} + +/* + * damos_pa_filter_out - Return true if the page should be filtered out. + */ +static bool damos_pa_filter_out(struct damos *scheme, struct page *page) +{ + struct damos_filter *filter; + + damos_for_each_filter(filter, scheme) { + if (__damos_pa_filter_out(filter, page)) + return true; + } + return false; +} + +static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) { unsigned long addr, applied; LIST_HEAD(page_list); @@ -213,6 +253,11 @@ static unsigned long damon_pa_pageout(struct damon_region *r) if (!page) continue; + if (damos_pa_filter_out(s, page)) { + put_page(page); + continue; + } + ClearPageReferenced(page); test_and_clear_page_young(page); if (isolate_lru_page(page)) { @@ -232,7 +277,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r) } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, bool mark_accessed) + struct damon_region *r, struct damos *s, bool mark_accessed) { unsigned long addr, applied = 0; @@ -241,6 +286,12 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (!page) continue; + + if (damos_pa_filter_out(s, page)) { + put_page(page); + continue; + } + if (mark_accessed) mark_page_accessed(page); else @@ -251,14 +302,16 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( return applied * PAGE_SIZE; } -static unsigned long damon_pa_mark_accessed(struct damon_region *r) +static unsigned long damon_pa_mark_accessed(struct damon_region *r, + struct damos *s) { - return damon_pa_mark_accessed_or_deactivate(r, true); + return damon_pa_mark_accessed_or_deactivate(r, s, true); } -static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +static unsigned long damon_pa_deactivate_pages(struct damon_region *r, + struct damos *s) { - return damon_pa_mark_accessed_or_deactivate(r, false); + return damon_pa_mark_accessed_or_deactivate(r, s, false); } static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, @@ -267,11 +320,11 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r); + return damon_pa_pageout(r, scheme); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r); + return damon_pa_mark_accessed(r, scheme); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r); + return damon_pa_deactivate_pages(r, scheme); case DAMOS_STAT: break; default: 
From 66d9faec0745f8db4bd9ef59a287627fc5ea691f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:22 +0000 Subject: [PATCH 041/505] mm/damon/reclaim: add a parameter called skip_anon for avoiding anonymous pages reclamation In some cases, for example if users have confidence at anonymous pages management or the swap device is too slow, users would want to avoid DAMON_RECLAIM swapping the anonymous pages out. For such case, add yet another DAMON_RECLAIM parameter, namely 'skip_anon'. When it is set as 'Y', DAMON_RECLAIM will avoid reclaiming anonymous pages using a DAMOS filter. Link: https://lkml.kernel.org/r/20221205230830.144349-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e82631f39481..648d2a85523a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -98,6 +98,15 @@ module_param(monitor_region_start, ulong, 0600); static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); +/* + * Skip anonymous pages reclamation. + * + * If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous + * pages. By default, ``N``. + */ +static bool skip_anon __read_mostly; +module_param(skip_anon, bool, 0600); + /* * PID of the DAMON thread * @@ -142,6 +151,7 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { struct damos *scheme; + struct damos_filter *filter; int err = 0; err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); @@ -152,6 +162,15 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; + if (skip_anon) { + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + if (!filter) { + /* Will be freed by next 'damon_set_schemes()' below */ + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damos_add_filter(scheme, filter); + } damon_set_schemes(ctx, &scheme, 1); return damon_set_region_biggest_system_ram_default(target, From d56fe24237c3dec29b7de20ce93a4a53306d180b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:23 +0000 Subject: [PATCH 042/505] Docs/admin-guide/damon/reclaim: document 'skip_anon' parameter Document the newly added 'skip_anon' parameter of DAMON_RECLAIM, which can be used to avoid anonymous pages reclamation. Link: https://lkml.kernel.org/r/20221205230830.144349-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/reclaim.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 4f1479a11e63..ff335e96e0d8 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -205,6 +205,15 @@ The end physical address of memory region that DAMON_RECLAIM will do work against. That is, DAMON_RECLAIM will find cold memory regions in this region and reclaims. By default, biggest System RAM is used as the region. +skip_anon +--------- + +Skip anonymous pages reclamation. + +If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous +pages. By default, ``N``. 
+ + kdamond_pid ----------- From ac35264b9e8807f019f36e7dbc640b66fd43a865 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:24 +0000 Subject: [PATCH 043/505] mm/damon/sysfs-schemes: implement filters directory DAMOS filters are currently supported by only DAMON kernel API. To expose the feature to user space, implement a DAMON sysfs directory named 'filters' under each scheme directory. Please note that this is implementing only the directory. Following commits will implement more files and directories, and finally connect the DAMOS filters feature. Link: https://lkml.kernel.org/r/20221205230830.144349-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 85 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 81fc4d27f4e4..50c8148cb474 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -258,6 +258,63 @@ static struct kobj_type damon_sysfs_stats_ktype = { .default_groups = damon_sysfs_stats_groups, }; +/* + * filters directory + */ + +struct damon_sysfs_scheme_filters { + struct kobject kobj; + int nr; +}; + +static struct damon_sysfs_scheme_filters * +damon_sysfs_scheme_filters_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL); +} + +static ssize_t nr_filters_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filters *filters = container_of(kobj, + struct damon_sysfs_scheme_filters, kobj); + + return sysfs_emit(buf, "%d\n", filters->nr); +} + +static ssize_t nr_filters_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + return count; +} + +static void damon_sysfs_scheme_filters_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_filters, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_filters_nr_attr = + __ATTR_RW_MODE(nr_filters, 0600); + +static struct attribute *damon_sysfs_scheme_filters_attrs[] = { + &damon_sysfs_scheme_filters_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_filters); + +static struct kobj_type damon_sysfs_scheme_filters_ktype = { + .release = damon_sysfs_scheme_filters_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_filters_groups, +}; + /* * watermarks directory */ @@ -784,6 +841,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_access_pattern *access_pattern; struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_scheme_filters *filters; struct damon_sysfs_stats *stats; struct damon_sysfs_scheme_regions *tried_regions; }; @@ -878,6 +936,24 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) return err; } +static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_filters *filters = + damon_sysfs_scheme_filters_alloc(); + int err; + + if (!filters) + return -ENOMEM; + err = kobject_init_and_add(&filters->kobj, + &damon_sysfs_scheme_filters_ktype, &scheme->kobj, + "filters"); + if (err) + kobject_put(&filters->kobj); + else + scheme->filters = filters; + return err; +} + static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) { struct damon_sysfs_stats *stats = 
damon_sysfs_stats_alloc(); @@ -926,9 +1002,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_watermarks(scheme); if (err) goto put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_stats(scheme); + err = damon_sysfs_scheme_set_filters(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_filters_watermarks_quotas_access_pattern_out; err = damon_sysfs_scheme_set_tried_regions(scheme); if (err) goto put_tried_regions_out; @@ -937,6 +1016,9 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) put_tried_regions_out: kobject_put(&scheme->tried_regions->kobj); scheme->tried_regions = NULL; +put_filters_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->filters->kobj); + scheme->filters = NULL; put_watermarks_quotas_access_pattern_out: kobject_put(&scheme->watermarks->kobj); scheme->watermarks = NULL; @@ -956,6 +1038,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->filters->kobj); kobject_put(&scheme->stats->kobj); damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); kobject_put(&scheme->tried_regions->kobj); From 7ee161f18b5da5170b5d6a51aace49d312099128 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:25 +0000 Subject: [PATCH 044/505] mm/damon/sysfs-schemes: implement filter directory Implement DAMOS filter directory which will be located under the filters directory. The directory provides three files, namely type, matching, and memcg_path. 'type' and 'matching' will be directly connected to the fields of 'struct damos_filter' having same name. 'memcg_path' will receive the path of the memory cgroup of the interest and later converted to memcg id when it's committed. 
Link: https://lkml.kernel.org/r/20221205230830.144349-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 128 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 50c8148cb474..afbfc55a8e84 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -258,6 +258,134 @@ static struct kobj_type damon_sysfs_stats_ktype = { .default_groups = damon_sysfs_stats_groups, }; +/* + * filter directory + */ + +struct damon_sysfs_scheme_filter { + struct kobject kobj; + enum damos_filter_type type; + bool matching; + char *memcg_path; +}; + +/* Should match with enum damos_filter_type */ +static const char * const damon_sysfs_scheme_filter_type_strs[] = { + "anon", + "memcg", +}; + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_scheme_filter_type_strs[filter->type]); +} + +static ssize_t type_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + enum damos_filter_type type; + ssize_t ret = -EINVAL; + + for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) { + if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[ + type])) { + filter->type = type; + ret = count; + break; + } + } + return ret; +} + +static ssize_t matching_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N'); +} + +static ssize_t matching_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + bool matching; + int err = kstrtobool(buf, &matching); + + if (err) + return err; + + filter->matching = matching; + return count; +} + +static ssize_t memcg_path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%s\n", + filter->memcg_path ? 
filter->memcg_path : ""); +} + +static ssize_t memcg_path_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL); + + if (!path) + return -ENOMEM; + + strncpy(path, buf, count); + path[count] = '\0'; + filter->memcg_path = path; + return count; +} + +static void damon_sysfs_scheme_filter_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + kfree(filter->memcg_path); + kfree(filter); +} + +static struct kobj_attribute damon_sysfs_scheme_filter_type_attr = + __ATTR_RW_MODE(type, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = + __ATTR_RW_MODE(matching, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = + __ATTR_RW_MODE(memcg_path, 0600); + +static struct attribute *damon_sysfs_scheme_filter_attrs[] = { + &damon_sysfs_scheme_filter_type_attr.attr, + &damon_sysfs_scheme_filter_matching_attr.attr, + &damon_sysfs_scheme_filter_memcg_path_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter); + +static struct kobj_type damon_sysfs_scheme_filter_ktype = { + .release = damon_sysfs_scheme_filter_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_filter_groups, +}; + /* * filters directory */ From 472e2b70eda6a1bccb62f391808924a59c49e22f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:26 +0000 Subject: [PATCH 045/505] mm/damon/sysfs-schemes: connect filter directory and filters directory Implement 'nr_filters' file under 'filters' directory, which will be used to populate specific number of 'filter' directory under the directory, similar to other 'nr_*' files in DAMON sysfs interface. 
Link: https://lkml.kernel.org/r/20221205230830.144349-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 68 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index afbfc55a8e84..e79c678a69d5 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -269,6 +269,11 @@ struct damon_sysfs_scheme_filter { char *memcg_path; }; +static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL); +} + /* Should match with enum damos_filter_type */ static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", @@ -392,6 +397,7 @@ static struct kobj_type damon_sysfs_scheme_filter_ktype = { struct damon_sysfs_scheme_filters { struct kobject kobj; + struct damon_sysfs_scheme_filter **filters_arr; int nr; }; @@ -401,6 +407,57 @@ damon_sysfs_scheme_filters_alloc(void) return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL); } +static void damon_sysfs_scheme_filters_rm_dirs( + struct damon_sysfs_scheme_filters *filters) +{ + struct damon_sysfs_scheme_filter **filters_arr = filters->filters_arr; + int i; + + for (i = 0; i < filters->nr; i++) + kobject_put(&filters_arr[i]->kobj); + filters->nr = 0; + kfree(filters_arr); + filters->filters_arr = NULL; +} + +static int damon_sysfs_scheme_filters_add_dirs( + struct damon_sysfs_scheme_filters *filters, int nr_filters) +{ + struct damon_sysfs_scheme_filter **filters_arr, *filter; + int err, i; + + damon_sysfs_scheme_filters_rm_dirs(filters); + if (!nr_filters) + return 0; + + filters_arr = kmalloc_array(nr_filters, sizeof(*filters_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!filters_arr) + return -ENOMEM; + filters->filters_arr = filters_arr; + + for (i = 0; i < nr_filters; i++) { + filter = damon_sysfs_scheme_filter_alloc(); + if (!filter) { + damon_sysfs_scheme_filters_rm_dirs(filters); + return -ENOMEM; + } + + err = kobject_init_and_add(&filter->kobj, + &damon_sysfs_scheme_filter_ktype, + &filters->kobj, "%d", i); + if (err) { + kobject_put(&filter->kobj); + damon_sysfs_scheme_filters_rm_dirs(filters); + return err; + } + + filters_arr[i] = filter; + filters->nr++; + } + return 0; +} + static ssize_t nr_filters_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -413,6 +470,7 @@ static ssize_t nr_filters_show(struct kobject *kobj, static ssize_t nr_filters_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + struct damon_sysfs_scheme_filters *filters; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -420,6 +478,15 @@ static ssize_t nr_filters_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + filters = container_of(kobj, struct damon_sysfs_scheme_filters, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_scheme_filters_add_dirs(filters, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; } @@ -1166,6 +1233,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); + damon_sysfs_scheme_filters_rm_dirs(scheme->filters); kobject_put(&scheme->filters->kobj); kobject_put(&scheme->stats->kobj); damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); From 29cbb9a13f05b20f0777c60db9603730b487a4e0 
Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:27 +0000 Subject: [PATCH 046/505] mm/damon/sysfs-schemes: implement scheme filters Implement scheme filters functionality of DAMON sysfs interface by making the code reads the values of files under the filter directories and pass that to DAMON using DAMON kernel API. [sj@kernel.org: fix leaking a filter for wrong cgroup path] Link: https://lkml.kernel.org/r/20221219171807.55708-2-sj@kernel.org [sj@kernel.org: return an error for filter memcg path id lookup failure] Link: https://lkml.kernel.org/r/20221219171807.55708-3-sj@kernel.org Link: https://lkml.kernel.org/r/20221205230830.144349-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 93 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index e79c678a69d5..f0dabe3e2dc0 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1403,6 +1403,79 @@ struct kobj_type damon_sysfs_schemes_ktype = { .default_groups = damon_sysfs_schemes_groups, }; +static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, + char *memcg_path_buf, char *path) +{ +#ifdef CONFIG_MEMCG + cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX); + if (sysfs_streq(memcg_path_buf, path)) + return true; +#endif /* CONFIG_MEMCG */ + return false; +} + +static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) +{ + struct mem_cgroup *memcg; + char *path; + bool found = false; + + if (!memcg_path) + return -EINVAL; + + path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL); + if (!path) + return -ENOMEM; + + for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; + memcg = mem_cgroup_iter(NULL, memcg, NULL)) { + /* skip removed memcg */ + if (!mem_cgroup_id(memcg)) + continue; + if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { + *id = mem_cgroup_id(memcg); + found = true; + break; + } + } + + kfree(path); + return found ? 
0 : -EINVAL; +} + +static int damon_sysfs_set_scheme_filters(struct damos *scheme, + struct damon_sysfs_scheme_filters *sysfs_filters) +{ + int i; + struct damos_filter *filter, *next; + + damos_for_each_filter_safe(filter, next, scheme) + damos_destroy_filter(filter); + + for (i = 0; i < sysfs_filters->nr; i++) { + struct damon_sysfs_scheme_filter *sysfs_filter = + sysfs_filters->filters_arr[i]; + struct damos_filter *filter = + damos_new_filter(sysfs_filter->type, + sysfs_filter->matching); + int err; + + if (!filter) + return -ENOMEM; + if (filter->type == DAMOS_FILTER_TYPE_MEMCG) { + err = damon_sysfs_memcg_path_to_id( + sysfs_filter->memcg_path, + &filter->memcg_id); + if (err) { + damos_destroy_filter(filter); + return err; + } + } + damos_add_filter(scheme, filter); + } + return 0; +} + static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { @@ -1411,6 +1484,10 @@ static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + struct damon_sysfs_scheme_filters *sysfs_filters = + sysfs_scheme->filters; + struct damos *scheme; + int err; struct damos_access_pattern pattern = { .min_sz_region = access_pattern->sz->min, @@ -1436,8 +1513,17 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - return damon_new_scheme(&pattern, sysfs_scheme->action, "a, + scheme = damon_new_scheme(&pattern, sysfs_scheme->action, "a, &wmarks); + if (!scheme) + return NULL; + + err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } + return scheme; } static void damon_sysfs_update_scheme(struct damos *scheme, @@ -1448,6 +1534,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme, struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + int err; scheme->pattern.min_sz_region = access_pattern->sz->min; scheme->pattern.max_sz_region = access_pattern->sz->max; @@ -1470,6 +1557,10 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->wmarks.high = sysfs_wmarks->high; scheme->wmarks.mid = sysfs_wmarks->mid; scheme->wmarks.low = sysfs_wmarks->low; + + err = damon_sysfs_set_scheme_filters(scheme, sysfs_scheme->filters); + if (err) + damon_destroy_scheme(scheme); } int damon_sysfs_set_schemes(struct damon_ctx *ctx, From 553b014244298d9f807286d6a71d722bc1f50f84 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:28 +0000 Subject: [PATCH 047/505] selftests/damon/sysfs: test filters directory Add simple test cases for scheme filters of DAMON sysfs interface. The test cases check if the files are populated as expected, receives some valid inputs, and refuses some invalid inputs. 
Link: https://lkml.kernel.org/r/20221205230830.144349-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index db4942383a50..a00336ffdcad 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -96,6 +96,34 @@ test_stats() done } +test_filter() +{ + filter_dir=$1 + ensure_file "$filter_dir/type" "exist" "600" + ensure_write_succ "$filter_dir/type" "anon" "valid input" + ensure_write_succ "$filter_dir/type" "memcg" "valid input" + ensure_write_fail "$filter_dir/type" "foo" "invalid input" + ensure_file "$filter_dir/matching" "exist" "600" + ensure_file "$filter_dir/memcg_path" "exist" "600" +} + +test_filters() +{ + filters_dir=$1 + ensure_dir "$filters_dir" "exist" + ensure_file "$filters_dir/nr_filters" "exist" "600" + ensure_write_succ "$filters_dir/nr_filters" "1" "valid input" + test_filter "$filters_dir/0" + + ensure_write_succ "$filters_dir/nr_filters" "2" "valid input" + test_filter "$filters_dir/0" + test_filter "$filters_dir/1" + + ensure_write_succ "$filters_dir/nr_filters" "0" "valid input" + ensure_dir "$filters_dir/0" "not_exist" + ensure_dir "$filters_dir/1" "not_exist" +} + test_watermarks() { watermarks_dir=$1 @@ -143,6 +171,7 @@ test_scheme() test_access_pattern "$scheme_dir/access_pattern" test_quotas "$scheme_dir/quotas" test_watermarks "$scheme_dir/watermarks" + test_filters "$scheme_dir/filters" test_stats "$scheme_dir/stats" test_tried_regions "$scheme_dir/tried_regions" } From 9b7f9322a5300505fffba492b45621c897c0e0df Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:29 +0000 Subject: [PATCH 048/505] Docs/admin-guide/mm/damon/usage: document DAMOS filters of sysfs Document about the newly added files for DAMOS filters on the DAMON usage document. Link: https://lkml.kernel.org/r/20221205230830.144349-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 48 +++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1a5b6b71efa1..3d82ca6a17ff 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -87,6 +87,8 @@ comma (","). :: │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low + │ │ │ │ │ │ │ filters/nr_filters + │ │ │ │ │ │ │ │ 0/type,matching,memcg_id │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds │ │ │ │ │ │ │ tried_regions/ │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age @@ -151,6 +153,8 @@ number (``N``) to the file creates the number of child directories named as moment, only one context per kdamond is supported, so only ``0`` or ``1`` can be written to the file. +.. _sysfs_contexts: + contexts// ------------- @@ -268,8 +272,8 @@ schemes// ------------ In each scheme directory, five directories (``access_pattern``, ``quotas``, -``watermarks``, ``stats``, and ``tried_regions``) and one file (``action``) -exist. +``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and one file +(``action``) exist. 
The ``action`` file is for setting and getting what action you want to apply to memory regions having specific access pattern of the interest. The keywords @@ -347,6 +351,46 @@ as below. The ``interval`` should written in microseconds unit. +schemes//filters/ +-------------------- + +Users could know something more than the kernel for specific types of memory. +In the case, users could do their own management for the memory and hence +doesn't want DAMOS bothers that. Users could limit DAMOS by setting the access +pattern of the scheme and/or the monitoring regions for the purpose, but that +can be inefficient in some cases. In such cases, users could set non-access +pattern driven filters using files in this directory. + +In the beginning, this directory has only one file, ``nr_filters``. Writing a +number (``N``) to the file creates the number of child directories named ``0`` +to ``N-1``. Each directory represents each filter. The filters are evaluated +in the numeric order. + +Each filter directory contains three files, namely ``type``, ``matcing``, and +``memcg_path``. You can write one of two special keywords, ``anon`` for +anonymous pages, or ``memcg`` for specific memory cgroup filtering. In case of +the memory cgroup filtering, you can specify the memory cgroup of the interest +by writing the path of the memory cgroup from the cgroups mount point to +``memcg_path`` file. You can write ``Y`` or ``N`` to ``matching`` file to +filter out pages that does or does not match to the type, respectively. Then, +the scheme's action will not be applied to the pages that specified to be +filtered out. + +For example, below restricts a DAMOS action to be applied to only non-anonymous +pages of all memory cgroups except ``/having_care_already``.:: + + # echo 2 > nr_filters + # # filter out anonymous pages + echo anon > 0/type + echo Y > 0/matching + # # further filter out all cgroups except one at '/having_care_already' + echo memcg > 1/type + echo /having_care_already > 1/memcg_path + echo N > 1/matching + +Note that filters could be ignored depend on the running DAMON operations set +`implementation `. + .. _sysfs_schemes_stats: schemes//stats/ From 497b099d9a166f819e667f4bf258e7a00ea2e78a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:30 +0000 Subject: [PATCH 049/505] Docs/ABI/damon: document scheme filters files Document newly added DAMON sysfs interface files for DAMOS filtering on the DAMON ABI document. Link: https://lkml.kernel.org/r/20221205230830.144349-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 13397b853692..2744f21b5a6b 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -258,6 +258,35 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the low watermark of the scheme in permil. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters/nr_filters +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing a number 'N' to this file creates the number of + directories for setting filters of the scheme named '0' to + 'N-1' under the filters/ directory. 
+ +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//type +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the type of + the memory of the interest. 'anon' for anonymous pages, or + 'memcg' for specific memory cgroup can be written and read. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//memcg_path +Date: Dec 2022 +Contact: SeongJae Park +Description: If 'memcg' is written to the 'type' file, writing to and + reading from this file sets and gets the path to the memory + cgroup of the interest. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//matching +Date: Dec 2022 +Contact: SeongJae Park +Description: Writing 'Y' or 'N' to this file sets whether to filter out + pages that do or do not match to the 'type' and 'memcg_path', + respectively. Filter out means the action of the scheme will + not be applied to. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//stats/nr_tried Date: Mar 2022 Contact: SeongJae Park From 3f79b187ad2f677047a1f1104bfb8a81b58b67d5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Dec 2022 02:58:37 +0800 Subject: [PATCH 050/505] swapfile: get rid of volatile and avoid redundant read Patch series "Clean up and fixes for swap", v2. This series cleans up some code paths, saves a few cycles and reduces the object size by a bit. It also fixes some rare race issue with statistics. This patch (of 4): Convert a volatile variable to more readable READ_ONCE. And this actually avoids the code from reading the variable twice redundantly when it races. Link: https://lkml.kernel.org/r/20221219185840.25441-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20221219185840.25441-2-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/swapfile.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 908a529bca12..6d3f60bd383b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1835,13 +1835,13 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte; struct swap_info_struct *si; int ret = 0; - volatile unsigned char *swap_map; si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { struct folio *folio; unsigned long offset; + unsigned char swp_count; if (!is_swap_pte(*pte)) continue; @@ -1852,7 +1852,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, offset = swp_offset(entry); pte_unmap(pte); - swap_map = &si->swap_map[offset]; folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct page *page; @@ -1869,8 +1868,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, folio = page_folio(page); } if (!folio) { - if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) + swp_count = READ_ONCE(si->swap_map[offset]); + if (swp_count == 0 || swp_count == SWAP_MAP_BAD) goto try_next; + return -ENOMEM; } From 18ad72f5b7186336646182b17e0654bf907b39e6 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Dec 2022 02:58:38 +0800 Subject: [PATCH 051/505] swap: avoid a redundant pte map if ra window is 1 Avoid a redundant pte map/unmap when swap readahead window is 1. 
Link: https://lkml.kernel.org/r/20221219185840.25441-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/swap_state.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 2927507b43d8..af8bc123b7c4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -727,8 +727,6 @@ static void swap_ra_info(struct vm_fault *vmf, } faddr = vmf->address; - orig_pte = pte = pte_offset_map(vmf->pmd, faddr); - fpfn = PFN_DOWN(faddr); ra_val = GET_SWAP_RA_VAL(vma); pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); @@ -739,12 +737,11 @@ static void swap_ra_info(struct vm_fault *vmf, atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); - if (win == 1) { - pte_unmap(orig_pte); + if (win == 1) return; - } /* Copy the PTEs because the page table may be unmapped */ + orig_pte = pte = pte_offset_map(vmf->pmd, faddr); if (fpfn == pfn + 1) swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); else if (pfn == fpfn + 1) From 16ba391e9c6bc26e6f0c950f2117f57b8e542d71 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Dec 2022 02:58:39 +0800 Subject: [PATCH 052/505] swap: fold swap_ra_clamp_pfn into swap_ra_info This makes the code cleaner. This helper is made of only two line of self explanational code and not reused anywhere else. And this actually make the compiled object smaller by a bit. bloat-o-meter results on x86_64 of mm/swap_state.o: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-35 (-35) Function old new delta swap_ra_info.constprop 512 477 -35 Total: Before=8388, After=8353, chg -0.42% Link: https://lkml.kernel.org/r/20221219185840.25441-4-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/swap_state.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index af8bc123b7c4..d8d171195a3a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -693,28 +693,15 @@ void exit_swap_address_space(unsigned int type) swapper_spaces[type] = NULL; } -static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, - unsigned long faddr, - unsigned long lpfn, - unsigned long rpfn, - unsigned long *start, - unsigned long *end) -{ - *start = max3(lpfn, PFN_DOWN(vma->vm_start), - PFN_DOWN(faddr & PMD_MASK)); - *end = min3(rpfn, PFN_DOWN(vma->vm_end), - PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); -} - static void swap_ra_info(struct vm_fault *vmf, - struct vma_swap_readahead *ra_info) + struct vma_swap_readahead *ra_info) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; - unsigned long faddr, pfn, fpfn; + unsigned long faddr, pfn, fpfn, lpfn, rpfn; unsigned long start, end; pte_t *pte, *orig_pte; - unsigned int max_win, hits, prev_win, win, left; + unsigned int max_win, hits, prev_win, win; #ifndef CONFIG_64BIT pte_t *tpte; #endif @@ -742,16 +729,23 @@ static void swap_ra_info(struct vm_fault *vmf, /* Copy the PTEs because the page table may be unmapped */ orig_pte = pte = pte_offset_map(vmf->pmd, faddr); - if (fpfn == pfn + 1) - swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); - else if (pfn == fpfn + 1) - swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, - &start, &end); - else { - left = (win - 1) / 2; - swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + 
win - left, - &start, &end); + if (fpfn == pfn + 1) { + lpfn = fpfn; + rpfn = fpfn + win; + } else if (pfn == fpfn + 1) { + lpfn = fpfn - win + 1; + rpfn = fpfn + 1; + } else { + unsigned int left = (win - 1) / 2; + + lpfn = fpfn - left; + rpfn = fpfn + win - left; } + start = max3(lpfn, PFN_DOWN(vma->vm_start), + PFN_DOWN(faddr & PMD_MASK)); + end = min3(rpfn, PFN_DOWN(vma->vm_end), + PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); + ra_info->nr_pte = end - start; ra_info->offset = fpfn - start; pte -= ra_info->offset; From cbc2bd98db85504074c1b94175e5e136e457dc0b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Dec 2022 02:58:40 +0800 Subject: [PATCH 053/505] swap: avoid holding swap reference in swap_cache_get_folio All its callers either already hold a reference to, or lock the swap device while calling this function. There is only one exception in shmem_swapin_folio, just make this caller also hold a reference of the swap device, so this helper can be simplified and saves a few cycles. This also provides finer control of error handling in shmem_swapin_folio, on race (with swap off), it can just try again. For invalid swap entry, it can fail with a proper error code. Link: https://lkml.kernel.org/r/20221219185840.25441-5-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/shmem.c | 11 +++++++++++ mm/swap_state.c | 8 ++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index d3f0c94f6836..bc5c156ef470 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1739,6 +1739,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; + struct swap_info_struct *si; struct folio *folio = NULL; swp_entry_t swap; int error; @@ -1750,6 +1751,14 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (is_swapin_error_entry(swap)) return -EIO; + si = get_swap_device(swap); + if (!si) { + if (!shmem_confirm_swap(mapping, index, swap)) + return -EEXIST; + else + return -EINVAL; + } + /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { @@ -1810,6 +1819,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, delete_from_swap_cache(folio); folio_mark_dirty(folio); swap_free(swap); + put_swap_device(si); *foliop = folio; return 0; @@ -1823,6 +1833,7 @@ unlock: folio_unlock(folio); folio_put(folio); } + put_swap_device(si); return error; } diff --git a/mm/swap_state.c b/mm/swap_state.c index d8d171195a3a..cb9aaa00951d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -321,19 +321,15 @@ static inline bool swap_use_vma_readahead(void) * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the folio * lock before returning. + * + * Caller must lock the swap device or hold a reference to keep it valid. 
*/ struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio; - struct swap_info_struct *si; - si = get_swap_device(entry); - if (!si) - return NULL; folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); - put_swap_device(si); - if (folio) { bool vma_ra = swap_use_vma_readahead(); bool readahead; From 44383cef54c0ce1201f884d83cc2b367bc5aa4f7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 19 Dec 2022 19:09:18 +0100 Subject: [PATCH 054/505] kasan: allow sampling page_alloc allocations for HW_TAGS As Hardware Tag-Based KASAN is intended to be used in production, its performance impact is crucial. As page_alloc allocations tend to be big, tagging and checking all such allocations can introduce a significant slowdown. Add two new boot parameters that allow alleviating that slowdown: - kasan.page_alloc.sample, which makes Hardware Tag-Based KASAN tag only every Nth page_alloc allocation with the order configured by the second added parameter (default: tag every such allocation). - kasan.page_alloc.sample.order, which makes the sampling enabled by the first parameter only affect page_alloc allocations with an order equal to or greater than the specified value (default: 3, see below). The exact performance improvement caused by using the new parameters depends on their values and the applied workload. The chosen default value for kasan.page_alloc.sample.order is 3, which matches both PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER. This is done for two reasons: 1. PAGE_ALLOC_COSTLY_ORDER is "the order at which allocations are deemed costly to service", which corresponds to the idea that only large and thus costly allocations are supposed to be sampled. 2. One of the workloads targeted by this patch is a benchmark that sends a large amount of data over a local loopback connection. Most multi-page data allocations in the networking subsystem have the order of SKB_FRAG_PAGE_ORDER (or PAGE_ALLOC_COSTLY_ORDER). When running a local loopback test on a testing MTE-enabled device in sync mode, enabling Hardware Tag-Based KASAN introduces a ~50% slowdown. Applying this patch and setting kasan.page_alloc.sample to a value higher than 1 lowers the slowdown. The performance improvement saturates around a sampling interval value of 10 with the default sampling page order of 3. This lowers the slowdown to ~20%. The slowdown in real scenarios involving the network will likely be smaller. Enabling page_alloc sampling has a downside: KASAN misses bad accesses to page_alloc allocations that have not been tagged. This lowers the value of KASAN as a security mitigation. However, based on measuring the number of page_alloc allocations of different orders during boot in a test build, sampling with the default kasan.page_alloc.sample.order value affects only ~7% of allocations. The remaining ~93% of allocations are still checked deterministically. 
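For reference, here is a sketch of a kernel command line that enables the sampling described above. The two page_alloc sampling parameters are the ones added by this patch; kasan=on and kasan.mode=sync are assumed here to be the usual Hardware Tag-Based KASAN toggles and are not part of this change:

    kasan=on kasan.mode=sync kasan.page_alloc.sample=10 kasan.page_alloc.sample.order=3

With these illustrative values, only roughly every 10th page_alloc allocation of order 3 or higher gets tagged and checked, while allocations below order 3 are still tagged unconditionally.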
Link: https://lkml.kernel.org/r/129da0614123bb85ed4dd61ae30842b2dd7c903f.1671471846.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Jann Horn Cc: Mark Brand Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 17 +++++++++ include/linux/kasan.h | 14 +++++--- mm/kasan/common.c | 9 +++-- mm/kasan/hw_tags.c | 60 +++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 27 ++++++++++++++ mm/page_alloc.c | 43 ++++++++++++++-------- 6 files changed, 149 insertions(+), 21 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 5c93ab915049..e66916a483cd 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -140,6 +140,23 @@ disabling KASAN altogether or controlling its features: - ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc allocations (default: ``on``). +- ``kasan.page_alloc.sample=`` makes KASAN tag only every + Nth page_alloc allocation with the order equal or greater than + ``kasan.page_alloc.sample.order``, where N is the value of the ``sample`` + parameter (default: ``1``, or tag every such allocation). + This parameter is intended to mitigate the performance overhead introduced + by KASAN. + Note that enabling this parameter makes Hardware Tag-Based KASAN skip checks + of allocations chosen by sampling and thus miss bad accesses to these + allocations. Use the default value for accurate bug detection. + +- ``kasan.page_alloc.sample.order=`` specifies the minimum + order of allocations that are affected by sampling (default: ``3``). + Only applies when ``kasan.page_alloc.sample`` is set to a value greater + than ``1``. + This parameter is intended to allow sampling only large page_alloc + allocations, which is the biggest source of the performance overhead. 
+ Error reports ~~~~~~~~~~~~~ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 96c9d56e5510..5ebbaf672009 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -120,12 +120,13 @@ static __always_inline void kasan_poison_pages(struct page *page, __kasan_poison_pages(page, order, init); } -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); -static __always_inline void kasan_unpoison_pages(struct page *page, +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); +static __always_inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) - __kasan_unpoison_pages(page, order, init); + return __kasan_unpoison_pages(page, order, init); + return false; } void __kasan_cache_create_kmalloc(struct kmem_cache *cache); @@ -249,8 +250,11 @@ static __always_inline bool kasan_check_byte(const void *addr) static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_unpoison_pages(struct page *page, unsigned int order, - bool init) {} +static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, + bool init) +{ + return false; +} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 833bf2cfd2a3..1d0008e1c420 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -95,19 +95,24 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; unsigned long i; if (unlikely(PageHighMem(page))) - return; + return false; + + if (!kasan_sample_page_alloc(order)) + return false; tag = kasan_random_tag(); kasan_unpoison(set_tag(page_address(page), tag), PAGE_SIZE << order, init); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); + + return true; } void __kasan_poison_pages(struct page *page, unsigned int order, bool init) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index b22c4f461cb0..d1bcb0205327 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -59,6 +59,24 @@ EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to enable vmalloc tagging. */ DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); +#define PAGE_ALLOC_SAMPLE_DEFAULT 1 +#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3 + +/* + * Sampling interval of page_alloc allocation (un)poisoning. + * Defaults to no sampling. + */ +unsigned long kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT; + +/* + * Minimum order of page_alloc allocations to be affected by sampling. + * The default value is chosen to match both + * PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER. 
+ */ +unsigned int kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT; + +DEFINE_PER_CPU(long, kasan_page_alloc_skip); + /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -122,6 +140,48 @@ static inline const char *kasan_mode_info(void) return "sync"; } +/* kasan.page_alloc.sample= */ +static int __init early_kasan_flag_page_alloc_sample(char *arg) +{ + int rv; + + if (!arg) + return -EINVAL; + + rv = kstrtoul(arg, 0, &kasan_page_alloc_sample); + if (rv) + return rv; + + if (!kasan_page_alloc_sample || kasan_page_alloc_sample > LONG_MAX) { + kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT; + return -EINVAL; + } + + return 0; +} +early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample); + +/* kasan.page_alloc.sample.order= */ +static int __init early_kasan_flag_page_alloc_sample_order(char *arg) +{ + int rv; + + if (!arg) + return -EINVAL; + + rv = kstrtouint(arg, 0, &kasan_page_alloc_sample_order); + if (rv) + return rv; + + if (kasan_page_alloc_sample_order > INT_MAX) { + kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT; + return -EINVAL; + } + + return 0; +} +early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order); + /* * kasan_init_hw_tags_cpu() is called for each CPU. * Not marked as __init as a CPU can be hot-plugged after boot. diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ea8cf1310b1e..32413f22aa82 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -42,6 +42,10 @@ enum kasan_mode { extern enum kasan_mode kasan_mode __ro_after_init; +extern unsigned long kasan_page_alloc_sample; +extern unsigned int kasan_page_alloc_sample_order; +DECLARE_PER_CPU(long, kasan_page_alloc_skip); + static inline bool kasan_vmalloc_enabled(void) { return static_branch_likely(&kasan_flag_vmalloc); @@ -57,6 +61,24 @@ static inline bool kasan_sync_fault_possible(void) return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; } +static inline bool kasan_sample_page_alloc(unsigned int order) +{ + /* Fast-path for when sampling is disabled. */ + if (kasan_page_alloc_sample == 1) + return true; + + if (order < kasan_page_alloc_sample_order) + return true; + + if (this_cpu_dec_return(kasan_page_alloc_skip) < 0) { + this_cpu_write(kasan_page_alloc_skip, + kasan_page_alloc_sample - 1); + return true; + } + + return false; +} + #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_async_fault_possible(void) @@ -69,6 +91,11 @@ static inline bool kasan_sync_fault_possible(void) return true; } +static inline bool kasan_sample_page_alloc(unsigned int order) +{ + return true; +} + #endif /* CONFIG_KASAN_HW_TAGS */ #ifdef CONFIG_KASAN_GENERIC diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0745aedebb37..7d980dc0000e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1356,6 +1356,8 @@ out: * see the comment next to it. * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, * see the comment next to it. + * 4. The allocation is excluded from being checked due to sampling, + * see the call to kasan_unpoison_pages. 
* * Poisoning pages during deferred memory init will greatly lengthen the * process and cause problem in large memory systems as the deferred pages @@ -2468,7 +2470,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); - bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool reset_tags = !zero_tags; int i; set_page_private(page, 0); @@ -2491,30 +2494,42 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ /* - * If memory tags should be zeroed (which happens only when memory - * should be initialized as well). + * If memory tags should be zeroed + * (which happens only when memory should be initialized as well). */ - if (init_tags) { + if (zero_tags) { /* Initialize both memory and tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); - /* Note that memory is already initialized by the loop above. */ + /* Take note that memory was initialized by the loop above. */ init = false; } if (!should_skip_kasan_unpoison(gfp_flags)) { - /* Unpoison shadow memory or set memory tags. */ - kasan_unpoison_pages(page, order, init); - - /* Note that memory is already initialized by KASAN. */ - if (kasan_has_integrated_init()) - init = false; - } else { - /* Ensure page_address() dereferencing does not fault. */ + /* Try unpoisoning (or setting tags) and initializing memory. */ + if (kasan_unpoison_pages(page, order, init)) { + /* Take note that memory was initialized by KASAN. */ + if (kasan_has_integrated_init()) + init = false; + /* Take note that memory tags were set by KASAN. */ + reset_tags = false; + } else { + /* + * KASAN decided to exclude this allocation from being + * poisoned due to sampling. Skip poisoning as well. + */ + SetPageSkipKASanPoison(page); + } + } + /* + * If memory tags have not been set, reset the page tags to ensure + * page_address() dereferencing does not fault. + */ + if (reset_tags) { for (i = 0; i != 1 << order; ++i) page_kasan_tag_reset(page + i); } - /* If memory is still not initialized, do it now. */ + /* If memory is still not initialized, initialize it now. */ if (init) kernel_init_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ From ef1faf0e370a8e33fe625088ddc5fde02cf8c4c4 Mon Sep 17 00:00:00 2001 From: Jianlin Lv Date: Mon, 19 Dec 2022 16:49:17 +0000 Subject: [PATCH 055/505] tools/vm/page_owner_sort: free memory before exit Although the kernel removes the memory associated with a process when that process terminates, it is neither good style nor proper design to leave the cleanup to the kernel. This patch frees the allocated memory before the process exits. 
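The conversion relies on the common C idiom of unwinding with goto labels in reverse order of acquisition. Below is a minimal, self-contained userspace sketch of that pattern; the identifiers and regular expressions are illustrative only and are not taken from page_owner_sort itself:

#include <regex.h>
#include <stdlib.h>

int main(void)
{
	regex_t order_re, pid_re;
	char *buf = NULL;
	int ret = EXIT_FAILURE;

	/* Acquire resources in order; on failure, jump past the cleanup of anything not yet acquired. */
	if (regcomp(&order_re, "order ([0-9]+),", REG_EXTENDED | REG_NEWLINE))
		goto out;
	if (regcomp(&pid_re, "pid ([0-9]+),", REG_EXTENDED | REG_NEWLINE))
		goto out_order;

	buf = malloc(4096);
	if (!buf)
		goto out_pid;

	/* ... parse input with the compiled patterns here ... */
	ret = EXIT_SUCCESS;

	free(buf);
out_pid:
	regfree(&pid_re);
out_order:
	regfree(&order_re);
out:
	return ret;
}

The benefit of this structure is that every failure path releases exactly the resources acquired before it, so adding a new allocation only requires one more label rather than repeating cleanup code in each error branch.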
Link: https://lkml.kernel.org/r/20221219164917.14132-1-iecedge@gmail.com Signed-off-by: Jianlin Lv Signed-off-by: Andrew Morton --- tools/vm/page_owner_sort.c | 65 ++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index ce860ab94162..7c2ac124cdc8 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -246,15 +246,16 @@ static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) return 0; } -static void check_regcomp(regex_t *pattern, const char *regex) +static bool check_regcomp(regex_t *pattern, const char *regex) { int err; err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); if (err != 0 || pattern->re_nsub != 1) { fprintf(stderr, "Invalid pattern %s code %d\n", regex, err); - exit(1); + return false; } + return true; } static char **explode(char sep, const char *str, int *size) @@ -494,28 +495,28 @@ static bool is_need(char *buf) return true; } -static void add_list(char *buf, int len, char *ext_buf) +static bool add_list(char *buf, int len, char *ext_buf) { if (list_size != 0 && len == list[list_size-1].len && memcmp(buf, list[list_size-1].txt, len) == 0) { list[list_size-1].num++; list[list_size-1].page_num += get_page_num(buf); - return; + return true; } if (list_size == max_size) { fprintf(stderr, "max_size too small??\n"); - exit(1); + return false; } if (!is_need(buf)) - return; + return true; list[list_size].pid = get_pid(buf); list[list_size].tgid = get_tgid(buf); list[list_size].comm = get_comm(buf); list[list_size].txt = malloc(len+1); if (!list[list_size].txt) { fprintf(stderr, "Out of memory\n"); - exit(1); + return false; } memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; @@ -534,6 +535,7 @@ static void add_list(char *buf, int len, char *ext_buf) printf("loaded %d\r", list_size); fflush(stdout); } + return true; } static bool parse_cull_args(const char *arg_str) @@ -790,12 +792,19 @@ int main(int argc, char **argv) exit(1); } - check_regcomp(&order_pattern, "order\\s*([0-9]*),"); - check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"); - check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "); - check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"); - check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"); - check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"); + if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),")) + goto out_order; + if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),")) + goto out_pid; + if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) ")) + goto out_tgid; + if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts")) + goto out_comm; + if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,")) + goto out_ts; + if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns")) + goto out_free_ts; + fstat(fileno(fin), &st); max_size = st.st_size / 100; /* hack ... 
*/ @@ -804,7 +813,7 @@ int main(int argc, char **argv) ext_buf = malloc(BUF_SIZE); if (!list || !buf || !ext_buf) { fprintf(stderr, "Out of memory\n"); - exit(1); + goto out_free; } for ( ; ; ) { @@ -812,7 +821,8 @@ int main(int argc, char **argv) if (buf_len < 0) break; - add_list(buf, buf_len, ext_buf); + if (!add_list(buf, buf_len, ext_buf)) + goto out_free; } printf("loaded %d\n", list_size); @@ -862,11 +872,26 @@ int main(int argc, char **argv) fprintf(fout, "\n"); } } - regfree(&order_pattern); - regfree(&pid_pattern); - regfree(&tgid_pattern); - regfree(&comm_pattern); - regfree(&ts_nsec_pattern); + +out_free: + if (ext_buf) + free(ext_buf); + if (buf) + free(buf); + if (list) + free(list); +out_free_ts: regfree(&free_ts_nsec_pattern); +out_ts: + regfree(&ts_nsec_pattern); +out_comm: + regfree(&comm_pattern); +out_tgid: + regfree(&tgid_pattern); +out_pid: + regfree(&pid_pattern); +out_order: + regfree(&order_pattern); + return 0; } From 80b1d8fdfad1f3084450afa6e2efcdcce867d4af Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 19 Dec 2022 12:36:59 +0000 Subject: [PATCH 056/505] mm: vmalloc: correct use of __GFP_NOWARN mask in __vmalloc_area_node() This function sets __GFP_NOWARN in the gfp_mask rendering the warn_alloc() invocations no-ops. Remove this and instead rely on this flag being set only for the vm_area_alloc_pages() function, ensuring it is cleared for each of the warn_alloc() calls. Link: https://lkml.kernel.org/r/20221219123659.90614-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ca71de7c9d77..10fe83c24436 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3031,7 +3031,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); - gfp_mask |= __GFP_NOWARN; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; From 831978e37e93bd3e36612917a4b193278950daff Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:52 +0800 Subject: [PATCH 057/505] maple_tree: remove extra space and blank line Patch series "Clean up and refinement for maple tree", v2. This patchset cleans up and refines some maple tree code. A few small changes make the code easier to understand and for better readability. This patch (of 7): These extra space and blank lines are unnecessary, so drop them. Link: https://lkml.kernel.org/r/20221221060058.609003-1-vernon2gm@gmail.com Link: https://lkml.kernel.org/r/20221221060058.609003-2-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 -- lib/maple_tree.c | 14 ++++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e594db58a0f1..4ee5a969441c 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -517,7 +517,6 @@ static inline void mas_reset(struct ma_state *mas) * entry. * * Note: may return the zero entry. 
- * */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) @@ -639,7 +638,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) } static inline unsigned int mt_height(const struct maple_tree *mt) - { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 26e2045d3cda..975358bec754 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -183,7 +183,6 @@ static void ma_free_rcu(struct maple_node *node) call_rcu(&node->rcu, mt_free_rcu); } - static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; @@ -468,7 +467,7 @@ static inline void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { - unsigned long val = (unsigned long) parent; + unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); @@ -502,7 +501,7 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, */ static inline unsigned int mte_parent_slot(const struct maple_enode *enode) { - unsigned long val = (unsigned long) mte_to_node(enode)->parent; + unsigned long val = (unsigned long)mte_to_node(enode)->parent; /* Root. */ if (val & 1) @@ -1278,7 +1277,6 @@ nomem_one: mas->alloc->total = success; mas_set_err(mas, -ENOMEM); return; - } /* @@ -2946,7 +2944,7 @@ next: mas->min = prev_min; mas->max = prev_max; mas->node = last; - return (void *) next; + return (void *)next; dead_node: mas_reset(mas); @@ -3466,7 +3464,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, */ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) { - struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; @@ -3892,7 +3889,7 @@ next: goto dead_node; } while (!ma_is_leaf(type)); - return (void *) next; + return (void *)next; dead_node: mas_reset(mas); @@ -4710,7 +4707,6 @@ found: static inline void mas_rewalk(struct ma_state *mas, unsigned long index) { - retry: mas_set(mas, index); mas_state_walk(mas); @@ -4718,7 +4714,6 @@ retry: goto retry; return; - } /* @@ -5620,7 +5615,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) mas_reset(wr_mas->mas); } } - } /* Interface */ From d56c593c8e128c42dc81707c07cbd5af41862214 Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:53 +0800 Subject: [PATCH 058/505] maple_tree: remove extra return statement For functions with a return type of void, it is unnecessary to add a reurn statement at the end of the function, so drop it. Link: https://lkml.kernel.org/r/20221221060058.609003-3-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 975358bec754..fc70ae9850b1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1276,7 +1276,6 @@ nomem_one: if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) mas->alloc->total = success; mas_set_err(mas, -ENOMEM); - return; } /* @@ -4712,8 +4711,6 @@ retry: mas_state_walk(mas); if (mas_is_start(mas)) goto retry; - - return; } /* From bd592703b81a95473f6a01fe731beccd0992236e Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:54 +0800 Subject: [PATCH 059/505] maple_tree: use mt_node_max() instead of direct operations mt_max[] Use mt_node_max() to get the maximum number of slots for a node, rather than direct operations mt_max[], makes it better portability. Link: https://lkml.kernel.org/r/20221221060058.609003-4-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index fc70ae9850b1..506b8fdb5f1b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6725,7 +6725,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; - else if (!node->slot[i] && max != mt_max[mte_node_type(entry)]) + else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; @@ -6832,7 +6832,7 @@ void mt_dump(const struct maple_tree *mt) if (!xa_is_node(entry)) mt_dump_entry(entry, 0, 0, 0); else if (entry) - mt_dump_node(mt, entry, 0, mt_max[mte_node_type(entry)], 0); + mt_dump_node(mt, entry, 0, mt_node_max(entry), 0); } EXPORT_SYMBOL_GPL(mt_dump); From 84fd3e1ee395649ac45b7317d44c10b33d0dca79 Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:55 +0800 Subject: [PATCH 060/505] maple_tree: use macro MA_ROOT_PARENT instead of number When you need to compare whether node->parent is parent of the root node, using macro MA_ROOT_PARENT is easier to understand and for better readability. Link: https://lkml.kernel.org/r/20221221060058.609003-5-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 506b8fdb5f1b..4b0575b60a11 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -503,8 +503,7 @@ static inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; - /* Root. */ - if (val & 1) + if (val & MA_ROOT_PARENT) return 0; /* From eabb305293835b191ffe60234587ae8bf5e4e9fd Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:56 +0800 Subject: [PATCH 061/505] maple_tree: remove the redundant code The macros CONFIG_DEBUG_MAPLE_TREE_VERBOSE no one uses, functions mas_dup_tree() and mas_dup_store() are not implemented, just function declaration, so drop it. Link: https://lkml.kernel.org/r/20221221060058.609003-6-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 4ee5a969441c..815a27661517 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -12,7 +12,6 @@ #include #include /* #define CONFIG_MAPLE_RCU_DISABLED */ -/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ /* * Allocated nodes are mutable until they have been inserted into the tree, @@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas) return mas->node == MAS_PAUSE; } -void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); -void mas_dup_store(struct ma_state *mas, void *entry); - /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, From 46b345848261009477552d654cb2f65000c30e4d Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:57 +0800 Subject: [PATCH 062/505] maple_tree: refine ma_state init from mas_start() If mas->node is an MAS_START, there are three cases, and they all assign different values to mas->node and mas->offset. So there is no need to set them to a default value before updating. Update them directly to make them easier to understand and for better readability. Link: https://lkml.kernel.org/r/20221221060058.609003-7-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4b0575b60a11..d4554c11ec15 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1329,7 +1329,7 @@ static void mas_node_count(struct ma_state *mas, int count) * mas_start() - Sets up maple state for operations. * @mas: The maple state. * - * If mas->node == MAS_START, then set the min, max, depth, and offset to + * If mas->node == MAS_START, then set the min, max and depth to * defaults. * * Return: @@ -1343,22 +1343,22 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) if (likely(mas_is_start(mas))) { struct maple_enode *root; - mas->node = MAS_NONE; mas->min = 0; mas->max = ULONG_MAX; mas->depth = 0; - mas->offset = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->node = mte_safe_root(root); + mas->offset = 0; return NULL; } /* empty tree */ if (unlikely(!root)) { + mas->node = MAS_NONE; mas->offset = MAPLE_NODE_SLOTS; return NULL; } From e11cb683b2ebc6699bc0ca200442f1b80a51553f Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:58 +0800 Subject: [PATCH 063/505] maple_tree: refine mab_calc_split function Invert the conditional judgment of the mid_split, to focus the return statement in the last statement, which is easier to understand and for better readability. Link: https://lkml.kernel.org/r/20221221060058.609003-8-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d4554c11ec15..94f0053ec3e0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1882,10 +1882,9 @@ static inline int mab_calc_split(struct ma_state *mas, /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); - if (!(*mid_split)) - return split; - *mid_split = mab_no_null_split(bn, *mid_split, slot_count); + if (unlikely(*mid_split)) + *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } From 318e9342fbbb6888d903d86e83865609901a1c65 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:45 -0800 Subject: [PATCH 064/505] mm/memory: add vm_normal_folio() Patch series "Convert deactivate_page() to folio_deactivate()", v4. Deactivate_page() has already been converted to use folios. This patch series modifies the callers of deactivate_page() to use folios. It also introduces vm_normal_folio() to assist with folio conversions, and converts deactivate_page() to folio_deactivate() which takes in a folio. This patch (of 4): Introduce a wrapper function called vm_normal_folio(). This function calls vm_normal_page() and returns the folio of the page found, or null if no page is found. This function allows callers to get a folio from a pte, which will eventually allow them to completely replace their struct page variables with struct folio instead. Link: https://lkml.kernel.org/r/20221221180848.20774-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221221180848.20774-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/memory.c | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 253b2d7489e6..8e14183dfc58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1968,6 +1968,8 @@ static inline bool can_do_mlock(void) { return false; } extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index ca490596b36f..1598051a2a24 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -625,6 +625,16 @@ out: return pfn_to_page(pfn); } +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + struct page *page = vm_normal_page(vma, addr, pte); + + if (page) + return page_folio(page); + return NULL; +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) From 07e8c82b5eff8ef34b74210eacb8d9c4a2886b82 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:46 -0800 Subject: [PATCH 065/505] madvise: convert madvise_cold_or_pageout_pte_range() to use folios This change removes a number of calls to compound_head(), and saves 1729 bytes of kernel text. 
Link: https://lkml.kernel.org/r/20221221180848.20774-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/madvise.c | 98 ++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 479d9a32e44a..575ebf0363b8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -345,8 +345,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, struct vm_area_struct *vma = walk->vma; pte_t *orig_pte, *pte, ptent; spinlock_t *ptl; - struct page *page = NULL; - LIST_HEAD(page_list); + struct folio *folio = NULL; + LIST_HEAD(folio_list); bool pageout_anon_only_filter; if (fatal_signal_pending(current)) @@ -375,26 +375,26 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, goto huge_unlock; } - page = pmd_page(orig_pmd); + folio = pfn_folio(pmd_pfn(orig_pmd)); - /* Do not interfere with other mappings of this page */ - if (page_mapcount(page) != 1) + /* Do not interfere with other mappings of this folio */ + if (folio_mapcount(folio) != 1) goto huge_unlock; - if (pageout_anon_only_filter && !PageAnon(page)) + if (pageout_anon_only_filter && !folio_test_anon(folio)) goto huge_unlock; if (next - addr != HPAGE_PMD_SIZE) { int err; - get_page(page); + folio_get(folio); spin_unlock(ptl); - lock_page(page); - err = split_huge_page(page); - unlock_page(page); - put_page(page); + folio_lock(folio); + err = split_folio(folio); + folio_unlock(folio); + folio_put(folio); if (!err) - goto regular_page; + goto regular_folio; return 0; } @@ -406,25 +406,25 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } - ClearPageReferenced(page); - test_and_clear_page_young(page); + folio_clear_referenced(folio); + folio_test_clear_young(folio); if (pageout) { - if (!isolate_lru_page(page)) { - if (PageUnevictable(page)) - putback_lru_page(page); + if (!folio_isolate_lru(folio)) { + if (folio_test_unevictable(folio)) + folio_putback_lru(folio); else - list_add(&page->lru, &page_list); + list_add(&folio->lru, &folio_list); } } else - deactivate_page(page); + deactivate_page(&folio->page); huge_unlock: spin_unlock(ptl); if (pageout) - reclaim_pages(&page_list); + reclaim_pages(&folio_list); return 0; } -regular_page: +regular_folio: if (pmd_trans_unstable(pmd)) return 0; #endif @@ -441,33 +441,33 @@ regular_page: if (!pte_present(ptent)) continue; - page = vm_normal_page(vma, addr, ptent); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, ptent); + if (!folio || folio_is_zone_device(folio)) continue; /* * Creating a THP page is expensive so split it only if we * are sure it's worth. Split it if we are only owner. 
*/ - if (PageTransCompound(page)) { - if (page_mapcount(page) != 1) + if (folio_test_large(folio)) { + if (folio_mapcount(folio) != 1) break; - if (pageout_anon_only_filter && !PageAnon(page)) + if (pageout_anon_only_filter && !folio_test_anon(folio)) break; - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); break; } pte_unmap_unlock(orig_pte, ptl); - if (split_huge_page(page)) { - unlock_page(page); - put_page(page); + if (split_folio(folio)) { + folio_unlock(folio); + folio_put(folio); orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); break; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte--; addr -= PAGE_SIZE; @@ -475,16 +475,16 @@ regular_page: } /* - * Do not interfere with other mappings of this page and - * non-LRU page. + * Do not interfere with other mappings of this folio and + * non-LRU folio. */ - if (!PageLRU(page) || page_mapcount(page) != 1) + if (!folio_test_lru(folio) || folio_mapcount(folio) != 1) continue; - if (pageout_anon_only_filter && !PageAnon(page)) + if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; - VM_BUG_ON_PAGE(PageTransCompound(page), page); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); if (pte_young(ptent)) { ptent = ptep_get_and_clear_full(mm, addr, pte, @@ -495,28 +495,28 @@ regular_page: } /* - * We are deactivating a page for accelerating reclaiming. - * VM couldn't reclaim the page unless we clear PG_young. + * We are deactivating a folio for accelerating reclaiming. + * VM couldn't reclaim the folio unless we clear PG_young. * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history. */ - ClearPageReferenced(page); - test_and_clear_page_young(page); + folio_clear_referenced(folio); + folio_test_clear_young(folio); if (pageout) { - if (!isolate_lru_page(page)) { - if (PageUnevictable(page)) - putback_lru_page(page); + if (!folio_isolate_lru(folio)) { + if (folio_test_unevictable(folio)) + folio_putback_lru(folio); else - list_add(&page->lru, &page_list); + list_add(&folio->lru, &folio_list); } } else - deactivate_page(page); + deactivate_page(&folio->page); } arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_pte, ptl); if (pageout) - reclaim_pages(&page_list); + reclaim_pages(&folio_list); cond_resched(); return 0; From f70da5ee8fe15b21501613ccab27eb2f722a3394 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:47 -0800 Subject: [PATCH 066/505] mm/damon: convert damon_pa_mark_accessed_or_deactivate() to use folios This change replaces 2 calls to compound_head() from put_page() and 1 call from mark_page_accessed() with one from page_folio(). This is in preparation for the conversion of deactivate_page() to folio_deactivate(). 
Link: https://lkml.kernel.org/r/20221221180848.20774-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: SeongJae Park Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index ebd1905eed6f..884c8bf18b12 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -283,21 +283,23 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); + struct folio *folio; if (!page) continue; + folio = page_folio(page); - if (damos_pa_filter_out(s, page)) { - put_page(page); + if (damos_pa_filter_out(s, &folio->page)) { + folio_put(folio); continue; } if (mark_accessed) - mark_page_accessed(page); + folio_mark_accessed(folio); else - deactivate_page(page); - put_page(page); - applied++; + deactivate_page(&folio->page); + folio_put(folio); + applied += folio_nr_pages(folio); } return applied * PAGE_SIZE; } From 5a9e34747c9f731bbb6b7fd7521c4fec0d840593 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:48 -0800 Subject: [PATCH 067/505] mm/swap: convert deactivate_page() to folio_deactivate() Deactivate_page() has already been converted to use folios, this change converts it to take in a folio argument instead of calling page_folio(). It also renames the function folio_deactivate() to be more consistent with other folio functions. [akpm@linux-foundation.org: fix left-over comments, per Yu Zhao] Link: https://lkml.kernel.org/r/20221221180848.20774-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- mm/damon/paddr.c | 2 +- mm/madvise.c | 4 ++-- mm/page-writeback.c | 4 ++-- mm/swap.c | 14 ++++++-------- mm/vmscan.c | 2 +- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 93f1cebd8545..87cecb8c0bdc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -401,7 +401,7 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void deactivate_page(struct page *page); +void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 884c8bf18b12..6334c99e5152 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -297,7 +297,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (mark_accessed) folio_mark_accessed(folio); else - deactivate_page(&folio->page); + folio_deactivate(folio); folio_put(folio); applied += folio_nr_pages(folio); } diff --git a/mm/madvise.c b/mm/madvise.c index 575ebf0363b8..e407d335e614 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -416,7 +416,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, list_add(&folio->lru, &folio_list); } } else - deactivate_page(&folio->page); + folio_deactivate(folio); huge_unlock: spin_unlock(ptl); if (pageout) @@ -510,7 +510,7 @@ regular_folio: list_add(&folio->lru, &folio_list); } } else - deactivate_page(&folio->page); + folio_deactivate(folio); } arch_leave_lazy_mmu_mode(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ad608ef2a243..41128ea9c997 100644 --- 
a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2846,11 +2846,11 @@ bool folio_mark_dirty(struct folio *folio) if (likely(mapping)) { /* - * readahead/lru_deactivate_page could remain + * readahead/folio_deactivate could remain * PG_readahead/PG_reclaim due to race with folio_end_writeback * About readahead, if the folio is written, the flags would be * reset. So no problem. - * About lru_deactivate_page, if the folio is redirtied, + * About folio_deactivate, if the folio is redirtied, * the flag will be reset. So no problem. but if the * folio is used by readahead it will confuse readahead * and make it restart the size rampup process. But it's diff --git a/mm/swap.c b/mm/swap.c index 5e5eba186930..e54e2a252e27 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -733,17 +733,15 @@ void deactivate_file_folio(struct folio *folio) } /* - * deactivate_page - deactivate a page - * @page: page to deactivate + * folio_deactivate - deactivate a folio + * @folio: folio to deactivate * - * deactivate_page() moves @page to the inactive list if @page was on the active - * list and was not an unevictable page. This is done to accelerate the reclaim - * of @page. + * folio_deactivate() moves @folio to the inactive list if @folio was on the + * active list and was not unevictable. This is done to accelerate the + * reclaim of @folio. */ -void deactivate_page(struct page *page) +void folio_deactivate(struct folio *folio) { - struct folio *folio = page_folio(page); - if (folio_test_lru(folio) && !folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { struct folio_batch *fbatch; diff --git a/mm/vmscan.c b/mm/vmscan.c index bd6637fcd8f9..aa8c252949da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1920,7 +1920,7 @@ retry: !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. - * Similar in principle to deactivate_page() + * Similar in principle to folio_deactivate() * except we already have the folio isolated * and know it's dirty */ From 0b7b8704ddcee372099a2bc6781db6ab273a85d5 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Wed, 21 Dec 2022 22:42:45 +0800 Subject: [PATCH 068/505] mm: new primitive kvmemdup() Similar to kmemdup(), but support large amount of bytes with kvmalloc() and does *not* guarantee that the result will be physically contiguous. Use only in cases where kvmalloc() is needed and free it with kvfree(). Also adapt policy_unpack.c in case someone bisect into this. Link: https://lkml.kernel.org/r/20221221144245.27164-1-sunhao.th@gmail.com Signed-off-by: Hao Sun Suggested-by: Daniel Borkmann Cc: Nick Terrell Cc: John Johansen Cc: Paul Moore Cc: James Morris Cc: "Serge E. 
Hallyn" Signed-off-by: Andrew Morton --- include/linux/string.h | 1 + mm/util.c | 24 +++++++++++++++++++++++- security/apparmor/policy_unpack.c | 11 +---------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index db28802ab0a6..c062c581a98b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -177,6 +177,7 @@ extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); +extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); diff --git a/mm/util.c b/mm/util.c index b56c92fb910f..cec9327b27b4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -120,7 +120,8 @@ EXPORT_SYMBOL(kstrndup); * @len: memory region length * @gfp: GFP mask to use * - * Return: newly allocated copy of @src or %NULL in case of error + * Return: newly allocated copy of @src or %NULL in case of error, + * result is physically contiguous. Use kfree() to free. */ void *kmemdup(const void *src, size_t len, gfp_t gfp) { @@ -133,6 +134,27 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup); +/** + * kvmemdup - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length + * @gfp: GFP mask to use + * + * Return: newly allocated copy of @src or %NULL in case of error, + * result may be not physically contiguous. Use kvfree() to free. + */ +void *kvmemdup(const void *src, size_t len, gfp_t gfp) +{ + void *p; + + p = kvmalloc(len, gfp); + if (p) + memcpy(p, src, len); + return p; +} +EXPORT_SYMBOL(kvmemdup); + /** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 66915653108c..5e9949832af6 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -161,15 +161,6 @@ VISIBLE_IF_KUNIT bool aa_inbounds(struct aa_ext *e, size_t size) } EXPORT_SYMBOL_IF_KUNIT(aa_inbounds); -static void *kvmemdup(const void *src, size_t len) -{ - void *p = kvmalloc(len, GFP_KERNEL); - - if (p) - memcpy(p, src, len); - return p; -} - /** * aa_unpack_u16_chunk - test and do bounds checking for a u16 size based chunk * @e: serialized data read head (NOT NULL) @@ -1027,7 +1018,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) data->key = key; data->size = aa_unpack_blob(e, &data->data, NULL); - data->data = kvmemdup(data->data, data->size); + data->data = kvmemdup(data->data, data->size, GFP_KERNEL); if (data->size && !data->data) { kfree_sensitive(data->key); kfree_sensitive(data); From b5054174ac7c7d8fae15deee7ddc0e20fd604f30 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Dec 2022 21:24:54 +0000 Subject: [PATCH 069/505] mm: move FOLL_* defs to mm_types.h Move FOLL_* definitions to linux/mm_types.h to make them more accessible without having to drag in all of linux/mm.h and everything that drags in too[1]. 
Link: https://lkml.kernel.org/r/2161258.1671657894@warthog.procyon.org.uk Signed-off-by: David Howells Suggested-by: Matthew Wilcox Reviewed-by: John Hubbard Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton --- include/linux/mm.h | 75 ---------------------------------------- include/linux/mm_types.h | 75 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 75 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8e14183dfc58..d68579bf8484 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,81 +3057,6 @@ static inline vm_fault_t vmf_error(int err) struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags); -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ -#define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ -#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ -#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO - * and return without waiting upon it */ -#define FOLL_NOFAULT 0x80 /* do not fault in pages */ -#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_ANON 0x8000 /* don't do file mappings */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ -#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ -#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ -#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ -#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ - -/* - * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each - * other. Here is what they mean, and how to use them: - * - * FOLL_LONGTERM indicates that the page will be held for an indefinite time - * period _often_ under userspace control. This is in contrast to - * iov_iter_get_pages(), whose usages are transient. - * - * FIXME: For pages which are part of a filesystem, mappings are subject to the - * lifetime enforced by the filesystem and we need guarantees that longterm - * users like RDMA and V4L2 only establish mappings which coordinate usage with - * the filesystem. Ideas for this coordination include revoking the longterm - * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was - * added after the problem with filesystems was found FS DAX VMAs are - * specifically failed. Filesystem pages are still subject to bugs and use of - * FOLL_LONGTERM should be avoided on those pages. - * - * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. - * Currently only get_user_pages() and get_user_pages_fast() support this flag - * and calls to get_user_pages_[un]locked are specifically not allowed. This - * is due to an incompatibility with the FS DAX check and - * FAULT_FLAG_ALLOW_RETRY. - * - * In the CMA case: long term pins in a CMA region would unnecessarily fragment - * that region. And so, CMA attempts to migrate the page before pinning, when - * FOLL_LONGTERM is specified. - * - * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, - * but an additional pin counting system) will be invoked. 
This is intended for - * anything that gets a page reference and then touches page data (for example, - * Direct IO). This lets the filesystem know that some non-file-system entity is - * potentially changing the pages' data. In contrast to FOLL_GET (whose pages - * are released via put_page()), FOLL_PIN pages must be released, ultimately, by - * a call to unpin_user_page(). - * - * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different - * and separate refcounting mechanisms, however, and that means that each has - * its own acquire and release mechanisms: - * - * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. - * - * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. - * - * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. - * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based - * calls applied to them, and that's perfectly OK. This is a constraint on the - * callers, not on the pages.) - * - * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never - * directly by the caller. That's in order to help avoid mismatches when - * releasing pages: get_user_pages*() pages must be released via put_page(), - * while pin_user_pages*() pages must be released via unpin_user_page(). - * - * Please see Documentation/core-api/pin_user_pages.rst for more information. - */ - static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9757067c3053..1118e381fcdc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1085,4 +1085,79 @@ enum fault_flag { typedef unsigned int __bitwise zap_flags_t; +/* + * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each + * other. Here is what they mean, and how to use them: + * + * FOLL_LONGTERM indicates that the page will be held for an indefinite time + * period _often_ under userspace control. This is in contrast to + * iov_iter_get_pages(), whose usages are transient. + * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. + * Currently only get_user_pages() and get_user_pages_fast() support this flag + * and calls to get_user_pages_[un]locked are specifically not allowed. This + * is due to an incompatibility with the FS DAX check and + * FAULT_FLAG_ALLOW_RETRY. + * + * In the CMA case: long term pins in a CMA region would unnecessarily fragment + * that region. And so, CMA attempts to migrate the page before pinning, when + * FOLL_LONGTERM is specified. + * + * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, + * but an additional pin counting system) will be invoked. This is intended for + * anything that gets a page reference and then touches page data (for example, + * Direct IO). 
This lets the filesystem know that some non-file-system entity is + * potentially changing the pages' data. In contrast to FOLL_GET (whose pages + * are released via put_page()), FOLL_PIN pages must be released, ultimately, by + * a call to unpin_user_page(). + * + * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different + * and separate refcounting mechanisms, however, and that means that each has + * its own acquire and release mechanisms: + * + * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. + * + * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. + * + * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. + * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based + * calls applied to them, and that's perfectly OK. This is a constraint on the + * callers, not on the pages.) + * + * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never + * directly by the caller. That's in order to help avoid mismatches when + * releasing pages: get_user_pages*() pages must be released via put_page(), + * while pin_user_pages*() pages must be released via unpin_user_page(). + * + * Please see Documentation/core-api/pin_user_pages.rst for more information. + */ + +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ +#define FOLL_GET 0x04 /* do get_page on page */ +#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ +#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO + * and return without waiting upon it */ +#define FOLL_NOFAULT 0x80 /* do not fault in pages */ +#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ +#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ +#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ +#define FOLL_ANON 0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ +#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ +#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ +#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ +#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ + #endif /* _LINUX_MM_TYPES_H */ From edd898181e2f6f0969c08e1dfe2b7cdf902b9b33 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 22 Dec 2022 20:00:20 +0100 Subject: [PATCH 070/505] mm: vmalloc: avoid calling __find_vmap_area() twice in __vunmap() Currently the __vunmap() path calls __find_vmap_area() twice. Once on entry to check that the area exists, then inside the remove_vm_area() function which also performs a new search for the VA. In order to improvie it from a performance point of view we split remove_vm_area() into two new parts: - find_unlink_vmap_area() that does a search and unlink from tree; - __remove_vm_area() that removes without searching. In this case there is no any functional change for remove_vm_area() whereas vm_remove_mappings(), where a second search happens, switches to the __remove_vm_area() variant where the already detached VA is passed as a parameter, so there is no need to find it again. 
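Before the hunks, a simplified sketch of how the reworked free path fits together (illustrative only; it uses the find_unlink_vmap_area() and va_remove_mappings() helpers introduced below and leaves out the debug, KASAN and page-freeing steps):

/* Inside mm/vmalloc.c: look the VA up and unlink it in one pass, then hand
 * the already-detached VA down so no second tree walk is needed. */
static void vfree_sketch(const void *addr)
{
        struct vmap_area *va;

        va = find_unlink_vmap_area((unsigned long)addr);
        if (WARN_ON_ONCE(!va))
                return;

        /* tears down va->vm and its mappings without another __find_vmap_area() */
        va_remove_mappings(va, /* deallocate_pages = */ 1);
}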
Performance-wise, I use test_vmalloc.sh with 32 threads doing alloc/free on a
64-CPU x86_64 box:

perf without this patch:

-   31.41%     0.50%  vmalloc_test/10  [kernel.vmlinux]  [k] __vunmap
   - 30.92% __vunmap
      - 17.67% _raw_spin_lock
           native_queued_spin_lock_slowpath
      - 12.33% remove_vm_area
         - 11.79% free_vmap_area_noflush
            - 11.18% _raw_spin_lock
                 native_queued_spin_lock_slowpath
        0.76% free_unref_page

perf with this patch:

-   11.35%     0.13%  vmalloc_test/14  [kernel.vmlinux]  [k] __vunmap
   - 11.23% __vunmap
      - 8.28% find_unlink_vmap_area
         - 7.95% _raw_spin_lock
              7.44% native_queued_spin_lock_slowpath
      - 1.93% free_vmap_area_noflush
         - 0.56% _raw_spin_lock
              0.53% native_queued_spin_lock_slowpath
        0.60% __vunmap_range_noflush

__vunmap() consumes around 20% fewer CPU cycles on this test.

Also, switch from find_vmap_area() to find_unlink_vmap_area() to prevent a
double access to the vmap_area_lock: one access to find the area and a
second one to unlink it from the tree.

[urezki@gmail.com: switch to find_unlink_vmap_area() in vm_unmap_ram()]
Link: https://lkml.kernel.org/r/20221222190022.134380-2-urezki@gmail.com
Link: https://lkml.kernel.org/r/20221222190022.134380-1-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony)
Reported-by: Roman Gushchin
Reviewed-by: Lorenzo Stoakes
Cc: Baoquan He
Cc: Christoph Hellwig
Cc: Matthew Wilcox (Oracle)
Cc: Nicholas Piggin
Cc: Oleksiy Avramchenko
Cc: Christoph Hellwig
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 79 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 48 insertions(+), 31 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 10fe83c24436..476ccbffb208 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1815,9 +1815,9 @@ static void drain_vmap_area_work(struct work_struct *work)
 }
 
 /*
- * Free a vmap area, caller ensuring that the area has been unmapped
- * and flush_cache_vunmap had been called for the correct range
- * previously.
+ * Free a vmap area, caller ensuring that the area has been unmapped,
+ * unlinked and flush_cache_vunmap had been called for the correct
+ * range previously.
*/ static void free_vmap_area_noflush(struct vmap_area *va) { @@ -1825,9 +1825,8 @@ static void free_vmap_area_noflush(struct vmap_area *va) unsigned long va_start = va->va_start; unsigned long nr_lazy; - spin_lock(&vmap_area_lock); - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + if (WARN_ON_ONCE(!list_empty(&va->list))) + return; nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -1871,6 +1870,19 @@ struct vmap_area *find_vmap_area(unsigned long addr) return va; } +static struct vmap_area *find_unlink_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr, &vmap_area_root); + if (va) + unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + + return va; +} + /*** Per cpu kva allocator ***/ /* @@ -2015,6 +2027,10 @@ static void free_vmap_block(struct vmap_block *vb) tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); + spin_lock(&vmap_area_lock); + unlink_va(vb->va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); } @@ -2236,7 +2252,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) return; } - va = find_vmap_area(addr); + va = find_unlink_vmap_area(addr); BUG_ON(!va); debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); @@ -2591,6 +2607,20 @@ struct vm_struct *find_vm_area(const void *addr) return va->vm; } +static struct vm_struct *__remove_vm_area(struct vmap_area *va) +{ + struct vm_struct *vm; + + if (!va || !va->vm) + return NULL; + + vm = va->vm; + kasan_free_module_shadow(vm); + free_unmap_vmap_area(va); + + return vm; +} + /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address @@ -2603,26 +2633,10 @@ struct vm_struct *find_vm_area(const void *addr) */ struct vm_struct *remove_vm_area(const void *addr) { - struct vmap_area *va; - might_sleep(); - spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr, &vmap_area_root); - if (va && va->vm) { - struct vm_struct *vm = va->vm; - - va->vm = NULL; - spin_unlock(&vmap_area_lock); - - kasan_free_module_shadow(vm); - free_unmap_vmap_area(va); - - return vm; - } - - spin_unlock(&vmap_area_lock); - return NULL; + return __remove_vm_area( + find_unlink_vmap_area((unsigned long) addr)); } static inline void set_area_direct_map(const struct vm_struct *area, @@ -2636,16 +2650,17 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the vm_struct. */ -static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +/* Handle removing and resetting vm mappings related to the VA's vm_struct. */ +static void va_remove_mappings(struct vmap_area *va, int deallocate_pages) { + struct vm_struct *area = va->vm; unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - remove_vm_area(area->addr); + __remove_vm_area(va); /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. 
*/ if (!flush_reset) @@ -2690,6 +2705,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; + struct vmap_area *va; if (!addr) return; @@ -2698,19 +2714,20 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = find_vm_area(addr); - if (unlikely(!area)) { + va = find_unlink_vmap_area((unsigned long)addr); + if (unlikely(!va)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } + area = va->vm; debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - vm_remove_mappings(area, deallocate_pages); + va_remove_mappings(va, deallocate_pages); if (deallocate_pages) { int i; From 14687619e1122d71b2ed70e1afa6bc352e629e85 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 22 Dec 2022 20:00:22 +0100 Subject: [PATCH 071/505] mm: vmalloc: replace BUG_ON() by WARN_ON_ONCE() Currently a vm_unmap_ram() functions triggers a BUG() if an area is not found. Replace it by the WARN_ON_ONCE() error message and keep machine alive instead of stopping it. The worst case is a memory leaking. Link: https://lkml.kernel.org/r/20221222190022.134380-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Reviewed-by: Christoph Hellwig Cc: Baoquan He Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/vmalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 476ccbffb208..428e0bee5c9c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2253,7 +2253,9 @@ void vm_unmap_ram(const void *mem, unsigned int count) } va = find_unlink_vmap_area(addr); - BUG_ON(!va); + if (WARN_ON_ONCE(!va)) + return; + debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); free_unmap_vmap_area(va); From 391655fe08d1f942359a11148aa9aaf3f99d6d6f Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:18:59 -0700 Subject: [PATCH 072/505] mm: multi-gen LRU: rename lru_gen_struct to lru_gen_folio Patch series "mm: multi-gen LRU: memcg LRU", v3. Overview ======== An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, since each node and memcg combination has an LRU of folios (see mem_cgroup_lruvec()). Its goal is to improve the scalability of global reclaim, which is critical to system-wide memory overcommit in data centers. Note that memcg reclaim is currently out of scope. Its memory bloat is a pointer to each lruvec and negligible to each pglist_data. In terms of traversing memcgs during global reclaim, it improves the best-case complexity from O(n) to O(1) and does not affect the worst-case complexity O(n). Therefore, on average, it has a sublinear complexity in contrast to the current linear complexity. The basic structure of an memcg LRU can be understood by an analogy to the active/inactive LRU (of folios): 1. It has the young and the old (generations), i.e., the counterparts to the active and the inactive; 2. The increment of max_seq triggers promotion, i.e., the counterpart to activation; 3. Other events trigger similar operations, e.g., offlining an memcg triggers demotion, i.e., the counterpart to deactivation. In terms of global reclaim, it has two distinct features: 1. 
Sharding, which allows each thread to start at a random memcg (in the
   old generation) and improves parallelism;
2. Eventual fairness, which allows direct reclaim to bail out at will and
   reduces latency without affecting fairness over some time.

The commit message in patch 6 details the workflow:
https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/

The following is a simple test to quickly verify its effectiveness.

Test design:
1. Create multiple memcgs.
2. Each memcg contains a job (fio).
3. All jobs access the same amount of memory randomly.
4. The system does not experience global memory pressure.
5. Periodically write to the root memory.reclaim.

Desired outcome:
1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal) over
   mean(pgsteal) is close to 0%.
2. The total pgsteal is close to the total requested through
   memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close to 100%.

Actual outcome [1]:

                                     MGLRU off    MGLRU on
  stddev(pgsteal) / mean(pgsteal)    75%          20%
  sum(pgsteal) / sum(requested)      425%         95%

####################################################################
MEMCGS=128

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    mkdir /sys/fs/cgroup/memcg$memcg
done

start() {
    echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs

    fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
        --filename=/dev/zero --size=1920M --rw=randrw \
        --rate=64m,64m --random_distribution=random \
        --fadvise_hint=0 --time_based --runtime=10h \
        --group_reporting --minimal
}

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    start &
done

sleep 600

for ((i = 0; i < 600; i++)); do
    echo 256m >/sys/fs/cgroup/memory.reclaim
    sleep 6
done

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
    grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
done
####################################################################

[1]: This was obtained from running the above script (touches less than
256GB memory) on an EPYC 7B13 with 512GB DRAM for over an hour.

This patch (of 8):

The new name lru_gen_folio will be more distinct from the coming
lru_gen_memcg.
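As a quick sanity check on the outcome table above, in absolute terms (approximate, assuming each write to memory.reclaim requests exactly 256 MiB as in the script):

  total requested  = 600 iterations * 256 MiB ~= 150 GiB
  MGLRU off        : sum(pgsteal) ~= 425% * 150 GiB ~= 640 GiB reclaimed
  MGLRU on         : sum(pgsteal) ~=  95% * 150 GiB ~= 143 GiB reclaimed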
Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 4 ++-- include/linux/mmzone.h | 6 +++--- mm/vmscan.c | 34 +++++++++++++++++----------------- mm/workingset.c | 4 ++-- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ff3f3f23f649..177b8b1dd43c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); @@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd28a100d9e4..1686fcc4ed01 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -404,7 +404,7 @@ enum { * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. 
*/ -struct lru_gen_struct { +struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ @@ -461,7 +461,7 @@ struct lru_gen_mm_state { struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* unstable max_seq from lru_gen_struct */ + /* unstable max_seq from lru_gen_folio */ unsigned long max_seq; /* the next address within an mm to scan */ unsigned long next_addr; @@ -524,7 +524,7 @@ struct lruvec { unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ - struct lru_gen_struct lrugen; + struct lru_gen_folio lrugen; /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index aa8c252949da..5505f54871c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3215,7 +3215,7 @@ static int get_nr_gens(struct lruvec *lruvec, int type) static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { - /* see the comment on lru_gen_struct */ + /* see the comment on lru_gen_folio */ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; @@ -3612,7 +3612,7 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); pos->refaulted = lrugen->avg_refaulted[type][tier] + @@ -3627,7 +3627,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) { int hist, tier; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq + 1; @@ -3704,7 +3704,7 @@ static int folio_update_gen(struct folio *folio, int gen) static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int type = folio_is_file_lru(folio); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); unsigned long new_flags, old_flags = READ_ONCE(folio->flags); @@ -3749,7 +3749,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) { int gen, type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; @@ -4263,7 +4263,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) { int zone; int remaining = MAX_LRU_BATCH; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); if (type == LRU_GEN_ANON && !can_swap) @@ -4299,7 +4299,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) { int gen, type, zone; bool success = false; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -4320,7 +4320,7 @@ next: ; } - /* see the comment on lru_gen_struct */ + /* see the comment on lru_gen_folio */ if (can_swap) { min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); @@ -4342,7 +4342,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) { int prev, next; int type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; spin_lock_irq(&lruvec->lru_lock); @@ -4400,7 +4400,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool success; struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); @@ -4465,7 +4465,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig unsigned long old = 0; unsigned long young = 0; unsigned long total = 0; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); for (type = !can_swap; type < ANON_AND_FILE; type++) { @@ -4750,7 +4750,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); int tier = lru_tier_from_refs(refs); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4850,7 +4850,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int scanned = 0; int isolated = 0; int remaining = MAX_LRU_BATCH; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); VM_WARN_ON_ONCE(!list_empty(list)); @@ -5251,7 +5251,7 @@ done: static bool __maybe_unused state_is_valid(struct lruvec *lruvec) { - struct lru_gen_struct *lrugen = 
&lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; if (lrugen->enabled) { enum lru_list lru; @@ -5530,7 +5530,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, int i; int type, tier; int hist = lru_hist_from_seq(seq); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); @@ -5580,7 +5580,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) unsigned long seq; bool full = !debugfs_real_fops(m->file)->write; struct lruvec *lruvec = v; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); @@ -5834,7 +5834,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; int gen, type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); diff --git a/mm/workingset.c b/mm/workingset.c index 1a86645b7b3c..fd666584515c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio) unsigned long token; unsigned long min_seq; struct lruvec *lruvec; - struct lru_gen_struct *lrugen; + struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); @@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) unsigned long token; unsigned long min_seq; struct lruvec *lruvec; - struct lru_gen_struct *lrugen; + struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; struct pglist_data *pgdat; int type = folio_is_file_lru(folio); From 6df1b2212950aae2b2188c6645ea18e2a9e3fdd5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:00 -0700 Subject: [PATCH 073/505] mm: multi-gen LRU: rename lrugen->lists[] to lrugen->folios[] lru_gen_folio will be chained into per-node lists by the coming lrugen->list. Link: https://lkml.kernel.org/r/20221222041905.2431096-3-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 8 ++++---- include/linux/mm_inline.h | 4 ++-- include/linux/mmzone.h | 8 ++++---- mm/vmscan.c | 20 ++++++++++---------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index d7062c6a8946..d8f721f98868 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -89,15 +89,15 @@ variables are monotonically increasing. Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` bits in order to fit into the gen counter in ``folio->flags``. Each -truncated generation number is an index to ``lrugen->lists[]``. The +truncated generation number is an index to ``lrugen->folios[]``. The sliding window technique is used to track at least ``MIN_NR_GENS`` and at most ``MAX_NR_GENS`` generations. The gen counter stores a value within ``[1, MAX_NR_GENS]`` while a page is on one of -``lrugen->lists[]``; otherwise it stores zero. +``lrugen->folios[]``; otherwise it stores zero. Each generation is divided into multiple tiers. 
A page accessed ``N`` times through file descriptors is in tier ``order_base_2(N)``. Unlike -generations, tiers do not have dedicated ``lrugen->lists[]``. In +generations, tiers do not have dedicated ``lrugen->folios[]``. In contrast to moving across generations, which requires the LRU lock, moving across tiers only involves atomic operations on ``folio->flags`` and therefore has a negligible cost. A feedback loop @@ -127,7 +127,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. Eviction -------- The eviction consumes old generations. Given an ``lruvec``, it -increments ``min_seq`` when ``lrugen->lists[]`` indexed by +increments ``min_seq`` when ``lrugen->folios[]`` indexed by ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to evict from, it first compares ``min_seq[]`` to select the older type. If both types are equally old, it selects the one whose first tier has diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 177b8b1dd43c..eb8a2435ee80 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) - list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else - list_add(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1686fcc4ed01..6c96ee823dbd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -312,7 +312,7 @@ enum lruvec_flags { * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->lists[]. Otherwise it stores 0. + * a page is on one of lrugen->folios[]. Otherwise it stores 0. * * A page is added to the youngest generation on faulting. The aging needs to * check the accessed bit at least twice before handing this page over to the @@ -324,8 +324,8 @@ enum lruvec_flags { * rest of generations, if they exist, are considered inactive. See * lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->lists[] so that - * the aging needs not to worry about it. And it's set again when a page + * PG_active is always cleared while a page is on one of lrugen->folios[] so + * that the aging needs not to worry about it. And it's set again when a page * considered active is isolated for non-reclaiming purposes, e.g., migration. * See lru_gen_add_folio() and lru_gen_del_folio(). 
* @@ -412,7 +412,7 @@ struct lru_gen_folio { /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 5505f54871c9..d8a53b7443d4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4271,7 +4271,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) /* prevent cold/hot inversion if force_scan is true */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { - struct list_head *head = &lrugen->lists[old_gen][type][zone]; + struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); @@ -4282,7 +4282,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); new_gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); + list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); if (!--remaining) return false; @@ -4310,7 +4310,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) gen = lru_gen_from_seq(min_seq[type]); for (zone = 0; zone < MAX_NR_ZONES; zone++) { - if (!list_empty(&lrugen->lists[gen][type][zone])) + if (!list_empty(&lrugen->folios[gen][type][zone])) goto next; } @@ -4775,7 +4775,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) /* promoted */ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { - list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } @@ -4784,7 +4784,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) int hist = lru_hist_from_seq(lrugen->min_seq[type]); gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); WRITE_ONCE(lrugen->protected[hist][type][tier - 1], lrugen->protected[hist][type][tier - 1] + delta); @@ -4796,7 +4796,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) if (folio_test_locked(folio) || folio_test_writeback(folio) || (type == LRU_GEN_FILE && folio_test_dirty(folio))) { gen = folio_inc_gen(lruvec, folio, true); - list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } @@ -4863,7 +4863,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, for (zone = sc->reclaim_idx; zone >= 0; zone--) { LIST_HEAD(moved); int skipped = 0; - struct list_head *head = &lrugen->lists[gen][type][zone]; + struct list_head *head = &lrugen->folios[gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); @@ -5264,7 +5264,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) int gen, type, zone; for_each_gen_type_zone(gen, type, zone) { - if (!list_empty(&lrugen->lists[gen][type][zone])) + if (!list_empty(&lrugen->folios[gen][type][zone])) return false; } } @@ -5309,7 +5309,7 @@ static bool drain_evictable(struct lruvec *lruvec) int remaining = MAX_LRU_BATCH; 
for_each_gen_type_zone(gen, type, zone) { - struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; + struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; while (!list_empty(head)) { bool success; @@ -5843,7 +5843,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) lrugen->timestamps[i] = jiffies; for_each_gen_type_zone(gen, type, zone) - INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); lruvec->mm_state.seq = MIN_NR_GENS; init_waitqueue_head(&lruvec->mm_state.wait); From a579086c99ed70cc4bfc104348dbe3dd8f2787e6 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:01 -0700 Subject: [PATCH 074/505] mm: multi-gen LRU: remove eviction fairness safeguard Recall that the eviction consumes the oldest generation: first it bucket-sorts folios whose gen counters were updated by the aging and reclaims the rest; then it increments lrugen->min_seq. The current eviction fairness safeguard for global reclaim has a dilemma: when there are multiple eligible memcgs, should it continue or stop upon meeting the reclaim goal? If it continues, it overshoots and increases direct reclaim latency; if it stops, it loses fairness between memcgs it has taken memory away from and those it has yet to. With memcg LRU, the eviction, while ensuring eventual fairness, will stop upon meeting its goal. Therefore the current eviction fairness safeguard for global reclaim will not be needed. Note that memcg LRU only applies to global reclaim. For memcg reclaim, the eviction will continue, even if it is overshooting. This becomes unconditional due to code simplification. Link: https://lkml.kernel.org/r/20221222041905.2431096-4-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 81 +++++++++++++++-------------------------------------- 1 file changed, 23 insertions(+), 58 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d8a53b7443d4..bfbfc98c856c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -449,6 +449,11 @@ static bool cgroup_reclaim(struct scan_control *sc) return sc->target_mem_cgroup; } +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); +} + /** * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question @@ -499,6 +504,11 @@ static bool cgroup_reclaim(struct scan_control *sc) return false; } +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} + static bool writeback_throttling_sane(struct scan_control *sc) { return true; @@ -5006,8 +5016,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw return scanned; } -static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, - bool *need_swapping) +static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { int type; int scanned; @@ -5096,9 +5105,6 @@ retry: goto retry; } - if (need_swapping && type == LRU_GEN_ANON) - *need_swapping = true; - return scanned; } @@ -5138,67 +5144,26 @@ done: return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? 
nr_to_scan : 0; } -static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, - struct scan_control *sc, bool need_swapping) +static unsigned long get_nr_to_reclaim(struct scan_control *sc) { - int i; - DEFINE_MAX_SEQ(lruvec); + /* don't abort memcg reclaim to ensure fairness */ + if (!global_reclaim(sc)) + return -1; - if (!current_is_kswapd()) { - /* age each memcg at most once to ensure fairness */ - if (max_seq - seq > 1) - return true; + /* discount the previous progress for kswapd */ + if (current_is_kswapd()) + return sc->nr_to_reclaim + sc->last_reclaimed; - /* over-swapping can increase allocation latency */ - if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) - return true; - - /* give this thread a chance to exit and free its memory */ - if (fatal_signal_pending(current)) { - sc->nr_reclaimed += MIN_LRU_BATCH; - return true; - } - - if (cgroup_reclaim(sc)) - return false; - } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) - return false; - - /* keep scanning at low priorities to ensure fairness */ - if (sc->priority > DEF_PRIORITY - 2) - return false; - - /* - * A minimum amount of work was done under global memory pressure. For - * kswapd, it may be overshooting. For direct reclaim, the allocation - * may succeed if all suitable zones are somewhat safe. In either case, - * it's better to stop now, and restart later if necessary. - */ - for (i = 0; i <= sc->reclaim_idx; i++) { - unsigned long wmark; - struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; - - if (!managed_zone(zone)) - continue; - - wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone); - if (wmark > zone_page_state(zone, NR_FREE_PAGES)) - return false; - } - - sc->nr_reclaimed += MIN_LRU_BATCH; - - return true; + return max(sc->nr_to_reclaim, compact_gap(sc->order)); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; bool need_aging = false; - bool need_swapping = false; unsigned long scanned = 0; unsigned long reclaimed = sc->nr_reclaimed; - DEFINE_MAX_SEQ(lruvec); + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); lru_add_drain(); @@ -5222,7 +5187,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc if (!nr_to_scan) goto done; - delta = evict_folios(lruvec, sc, swappiness, &need_swapping); + delta = evict_folios(lruvec, sc, swappiness); if (!delta) goto done; @@ -5230,7 +5195,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc if (scanned >= nr_to_scan) break; - if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) + if (sc->nr_reclaimed >= nr_to_reclaim) break; cond_resched(); @@ -5677,7 +5642,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; - if (!evict_folios(lruvec, sc, swappiness, NULL)) + if (!evict_folios(lruvec, sc, swappiness)) return 0; cond_resched(); From 7348cc91821b0cb24dfb00e578047f68299a50ab Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:02 -0700 Subject: [PATCH 075/505] mm: multi-gen LRU: remove aging fairness safeguard Recall that the aging produces the youngest generation: first it scans for accessed folios and updates their gen counters; then it increments lrugen->max_seq. The current aging fairness safeguard for kswapd uses two passes to ensure the fairness to multiple eligible memcgs. 
On the first pass, which is shared with the eviction, it checks whether all eligible memcgs are low on cold folios. If so, it requires a second pass, on which it ages all those memcgs at the same time. With memcg LRU, the aging, while ensuring eventual fairness, will run when necessary. Therefore the current aging fairness safeguard for kswapd will not be needed. Note that memcg LRU only applies to global reclaim. For memcg reclaim, the aging can be unfair to different memcgs, i.e., their lrugen->max_seq can be incremented at different paces. Link: https://lkml.kernel.org/r/20221222041905.2431096-5-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 136 +++++++++++++++++++++++++--------------------------- 1 file changed, 64 insertions(+), 72 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index bfbfc98c856c..cc522e048ed7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -137,7 +137,6 @@ struct scan_control { #ifdef CONFIG_LRU_GEN /* help kswapd make better choices among multiple memcgs */ - unsigned int memcgs_need_aging:1; unsigned long last_reclaimed; #endif @@ -4468,7 +4467,7 @@ done: return true; } -static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) { int gen, type, zone; @@ -4477,6 +4476,13 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig unsigned long total = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); + + /* whether this lruvec is completely out of cold folios */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { + *nr_to_scan = 0; + return true; + } for (type = !can_swap; type < ANON_AND_FILE; type++) { unsigned long seq; @@ -4505,8 +4511,6 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig * stalls when the number of generations reaches MIN_NR_GENS. Hence, the * ideal number of generations is MIN_NR_GENS+1. */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) - return true; if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) return false; @@ -4525,40 +4529,54 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig return false; } -static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) +static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { - bool need_aging; - unsigned long nr_to_scan; - int swappiness = get_swappiness(lruvec, sc); + int gen, type, zone; + unsigned long total = 0; + bool can_swap = get_swappiness(lruvec, sc); + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + } + } + + /* whether the size is big enough to be helpful */ + return mem_cgroup_online(memcg) ? 
(total >> sc->priority) : total; +} + +static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, + unsigned long min_ttl) +{ + int gen; + unsigned long birth; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); + VM_WARN_ON_ONCE(sc->memcg_low_reclaim); + /* see the comment on lru_gen_folio */ + gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + if (time_is_after_jiffies(birth + min_ttl)) + return false; + + if (!lruvec_is_sizable(lruvec, sc)) + return false; + mem_cgroup_calculate_protection(NULL, memcg); - if (mem_cgroup_below_min(NULL, memcg)) - return false; - - need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); - - if (min_ttl) { - int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); - unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - - if (time_is_after_jiffies(birth + min_ttl)) - return false; - - /* the size is likely too small to be helpful */ - if (!nr_to_scan && sc->priority != DEF_PRIORITY) - return false; - } - - if (need_aging) - try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); - - return true; + return !mem_cgroup_below_min(NULL, memcg); } /* to protect the working set of the last N jiffies */ @@ -4567,46 +4585,32 @@ static unsigned long lru_gen_min_ttl __read_mostly; static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; - bool success = false; unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); VM_WARN_ON_ONCE(!current_is_kswapd()); sc->last_reclaimed = sc->nr_reclaimed; - /* - * To reduce the chance of going into the aging path, which can be - * costly, optimistically skip it if the flag below was cleared in the - * eviction path. This improves the overall performance when multiple - * memcgs are available. - */ - if (!sc->memcgs_need_aging) { - sc->memcgs_need_aging = true; + /* check the order to exclude compaction-induced reclaim */ + if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) return; - } - - set_mm_walk(pgdat); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - if (age_lruvec(lruvec, sc, min_ttl)) - success = true; + if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { + mem_cgroup_iter_break(NULL, memcg); + return; + } cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); - clear_mm_walk(); - - /* check the order to exclude compaction-induced reclaim */ - if (success || !min_ttl || sc->order) - return; - /* * The main goal is to OOM kill if every generation from all memcgs is * younger than min_ttl. However, another possibility is all memcgs are - * either below min or empty. + * either too small or below min. */ if (mutex_trylock(&oom_lock)) { struct oom_control oc = { @@ -5114,34 +5118,28 @@ retry: * reclaim. 
*/ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, - bool can_swap, bool *need_aging) + bool can_swap) { unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && !sc->memcg_low_reclaim)) return 0; - *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); - if (!*need_aging) + if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) return nr_to_scan; /* skip the aging path at the default priority */ if (sc->priority == DEF_PRIORITY) - goto done; - - /* leave the work to lru_gen_age_node() */ - if (current_is_kswapd()) - return 0; - - if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) return nr_to_scan; -done: - return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; + + try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false); + + /* skip this lruvec as it's low on cold folios */ + return 0; } static unsigned long get_nr_to_reclaim(struct scan_control *sc) @@ -5160,9 +5158,7 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc) static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; - bool need_aging = false; unsigned long scanned = 0; - unsigned long reclaimed = sc->nr_reclaimed; unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); lru_add_drain(); @@ -5183,13 +5179,13 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc else swappiness = 0; - nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); if (!nr_to_scan) - goto done; + break; delta = evict_folios(lruvec, sc, swappiness); if (!delta) - goto done; + break; scanned += delta; if (scanned >= nr_to_scan) @@ -5201,10 +5197,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc cond_resched(); } - /* see the comment in lru_gen_age_node() */ - if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) - sc->memcgs_need_aging = false; -done: clear_mm_walk(); blk_finish_plug(&plug); From 77d4459a4a1a472b7309e475f962dda87d950abd Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:03 -0700 Subject: [PATCH 076/505] mm: multi-gen LRU: shuffle should_run_aging() Move should_run_aging() next to its only caller left. 
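Since this is pure code movement, the decision logic is unchanged; for reference, a small sketch of the two checks the function applies, with a worked example (assuming MIN_NR_GENS == 2, its current definition in include/linux/mmzone.h):

#include <linux/mmzone.h>

/*
 * Mirrors the two thresholds in should_run_aging(): request the aging when
 * the youngest generation holds more than 1/MIN_NR_GENS of the pages, or
 * when the "old" generation holds less than 1/(MIN_NR_GENS + 2). With
 * MIN_NR_GENS == 2 and total == 1000, that means young > 500 or old < 250.
 */
static inline bool aging_needed_sketch(unsigned long young, unsigned long old,
                                       unsigned long total)
{
        return young * MIN_NR_GENS > total ||
               old * (MIN_NR_GENS + 2) < total;
}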
Link: https://lkml.kernel.org/r/20221222041905.2431096-6-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 124 ++++++++++++++++++++++++++-------------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cc522e048ed7..5a167f8efc38 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4467,68 +4467,6 @@ done: return true; } -static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) -{ - int gen, type, zone; - unsigned long old = 0; - unsigned long young = 0; - unsigned long total = 0; - struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MIN_SEQ(lruvec); - - /* whether this lruvec is completely out of cold folios */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { - *nr_to_scan = 0; - return true; - } - - for (type = !can_swap; type < ANON_AND_FILE; type++) { - unsigned long seq; - - for (seq = min_seq[type]; seq <= max_seq; seq++) { - unsigned long size = 0; - - gen = lru_gen_from_seq(seq); - - for (zone = 0; zone < MAX_NR_ZONES; zone++) - size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - - total += size; - if (seq == max_seq) - young += size; - else if (seq + MIN_NR_GENS == max_seq) - old += size; - } - } - - /* try to scrape all its memory if this memcg was deleted */ - *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; - - /* - * The aging tries to be lazy to reduce the overhead, while the eviction - * stalls when the number of generations reaches MIN_NR_GENS. Hence, the - * ideal number of generations is MIN_NR_GENS+1. - */ - if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) - return false; - - /* - * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) - * of the total number of pages for each generation. A reasonable range - * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The - * aging cares about the upper bound of hot pages, while the eviction - * cares about the lower bound of cold pages. 
- */ - if (young * MIN_NR_GENS > total) - return true; - if (old * (MIN_NR_GENS + 2) < total) - return true; - - return false; -} - static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; @@ -5112,6 +5050,68 @@ retry: return scanned; } +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +{ + int gen, type, zone; + unsigned long old = 0; + unsigned long young = 0; + unsigned long total = 0; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); + + /* whether this lruvec is completely out of cold folios */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { + *nr_to_scan = 0; + return true; + } + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + unsigned long size = 0; + + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + total += size; + if (seq == max_seq) + young += size; + else if (seq + MIN_NR_GENS == max_seq) + old += size; + } + } + + /* try to scrape all its memory if this memcg was deleted */ + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; + + /* + * The aging tries to be lazy to reduce the overhead, while the eviction + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the + * ideal number of generations is MIN_NR_GENS+1. + */ + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) + return false; + + /* + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) + * of the total number of pages for each generation. A reasonable range + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The + * aging cares about the upper bound of hot pages, while the eviction + * cares about the lower bound of cold pages. + */ + if (young * MIN_NR_GENS > total) + return true; + if (old * (MIN_NR_GENS + 2) < total) + return true; + + return false; +} + /* * For future optimizations: * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg From e4dde56cd208674ce899b47589f263499e5b8cdc Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:04 -0700 Subject: [PATCH 077/505] mm: multi-gen LRU: per-node lru_gen_folio lists For each node, memcgs are divided into two generations: the old and the young. For each generation, memcgs are randomly sharded into multiple bins to improve scalability. For each bin, an RCU hlist_nulls is virtually divided into three segments: the head, the tail and the default. An onlining memcg is added to the tail of a random bin in the old generation. The eviction starts at the head of a random bin in the old generation. The per-node memcg generation counter, whose reminder (mod 2) indexes the old generation, is incremented when all its bins become empty. There are four operations: 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its current generation (old or young) and updates its "seg" to "head"; 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its current generation (old or young) and updates its "seg" to "tail"; 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old generation, updates its "gen" to "old" and resets its "seg" to "default"; 4. 
MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the young generation, updates its "gen" to "young" and resets its "seg" to "default". The events that trigger the above operations are: 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; 2. The first attempt to reclaim an memcg below low, which triggers MEMCG_LRU_TAIL; 3. The first attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_TAIL; 4. The second attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_YOUNG; 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. Note that memcg LRU only applies to global reclaim, and the round-robin incrementing of their max_seq counters ensures the eventual fairness to all eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 + include/linux/mm_inline.h | 17 ++ include/linux/mmzone.h | 117 +++++++++++- mm/memcontrol.c | 16 ++ mm/page_alloc.c | 1 + mm/vmscan.c | 374 +++++++++++++++++++++++++++++++++---- 6 files changed, 500 insertions(+), 35 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d3c8203cab6c..2e08b05bc6bf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) percpu_ref_put(&objcg->refcnt); } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index eb8a2435ee80..acf03147fff8 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void) return current->in_lru_fault; } +#ifdef CONFIG_MEMCG +static inline int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return READ_ONCE(lruvec->lrugen.seg); +} +#else +static inline int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; +} +#endif + static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; @@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void) return false; } +static inline int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6c96ee823dbd..815c7c2edf45 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -367,6 +368,15 @@ struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +/* see the comment on MEMCG_NR_GENS */ +enum { + 
MEMCG_LRU_NOP, + MEMCG_LRU_HEAD, + MEMCG_LRU_TAIL, + MEMCG_LRU_OLD, + MEMCG_LRU_YOUNG, +}; + #ifdef CONFIG_LRU_GEN enum { @@ -426,6 +436,14 @@ struct lru_gen_folio { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; +#ifdef CONFIG_MEMCG + /* the memcg generation this lru_gen_folio belongs to */ + u8 gen; + /* the list segment this lru_gen_folio belongs to */ + u8 seg; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_node list; +#endif }; enum { @@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec); void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG + +/* + * For each node, memcgs are divided into two generations: the old and the + * young. For each generation, memcgs are randomly sharded into multiple bins + * to improve scalability. For each bin, the hlist_nulls is virtually divided + * into three segments: the head, the tail and the default. + * + * An onlining memcg is added to the tail of a random bin in the old generation. + * The eviction starts at the head of a random bin in the old generation. The + * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes + * the old generation, is incremented when all its bins become empty. + * + * There are four operations: + * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its + * current generation (old or young) and updates its "seg" to "head"; + * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its + * current generation (old or young) and updates its "seg" to "tail"; + * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old + * generation, updates its "gen" to "old" and resets its "seg" to "default"; + * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the + * young generation, updates its "gen" to "young" and resets its "seg" to + * "default". + * + * The events that trigger the above operations are: + * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; + * 2. The first attempt to reclaim an memcg below low, which triggers + * MEMCG_LRU_TAIL; + * 3. The first attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_TAIL; + * 4. The second attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_YOUNG; + * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; + * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; + * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. + * + * Note that memcg LRU only applies to global reclaim, and the round-robin + * incrementing of their max_seq counters ensures the eventual fairness to all + * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). 
+ */ +#define MEMCG_NR_GENS 2 +#define MEMCG_NR_BINS 8 + +struct lru_gen_memcg { + /* the per-node memcg generation counter */ + unsigned long seq; + /* each memcg has one lru_gen_folio per node */ + unsigned long nr_memcgs[MEMCG_NR_GENS]; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; + /* protects the above */ + spinlock_t lock; +}; + +void lru_gen_init_pgdat(struct pglist_data *pgdat); + void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); -#endif +void lru_gen_online_memcg(struct mem_cgroup *memcg); +void lru_gen_offline_memcg(struct mem_cgroup *memcg); +void lru_gen_release_memcg(struct mem_cgroup *memcg); +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op); + +#else /* !CONFIG_MEMCG */ + +#define MEMCG_NR_GENS 1 + +struct lru_gen_memcg { +}; + +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + +#endif /* CONFIG_MEMCG */ #else /* !CONFIG_LRU_GEN */ +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } @@ -494,6 +587,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } @@ -501,7 +595,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } -#endif + +static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ +} + +#endif /* CONFIG_MEMCG */ #endif /* CONFIG_LRU_GEN */ @@ -1243,6 +1354,8 @@ typedef struct pglist_data { #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 49f67176a1a2..2758b67eb169 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -478,6 +478,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; + if (lru_gen_enabled()) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; + + /* see the comment on MEMCG_NR_GENS */ + if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); + + return; + } + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (!mctz) return; @@ -3530,6 +3540,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, struct mem_cgroup_tree_per_node *mctz; unsigned long excess; + if (lru_gen_enabled()) + return 0; + if (order > 0) return 0; @@ -5391,6 +5404,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + lru_gen_online_memcg(memcg); return 0; offline_kmem: memcg_offline_kmem(memcg); @@ -5422,6 +5436,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); + lru_gen_offline_memcg(memcg); drain_all_stock(memcg); @@ -5433,6 +5448,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct 
mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); + lru_gen_release_memcg(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d980dc0000e..5668c1a2de49 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7941,6 +7941,7 @@ static void __init free_area_init_node(int nid) pgdat_set_deferred_range(pgdat); free_area_init_core(pgdat); + lru_gen_init_pgdat(pgdat); } static void __init free_area_init_memoryless_node(int nid) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a167f8efc38..178465a503db 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,8 @@ #include #include #include +#include +#include #include #include @@ -135,11 +137,6 @@ struct scan_control { /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; -#ifdef CONFIG_LRU_GEN - /* help kswapd make better choices among multiple memcgs */ - unsigned long last_reclaimed; -#endif - /* Allocation order */ s8 order; @@ -3185,6 +3182,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) +#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) + static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) { struct pglist_data *pgdat = NODE_DATA(nid); @@ -4453,8 +4453,7 @@ done: if (sc->priority <= DEF_PRIORITY - 2) wait_event_killable(lruvec->mm_state.wait, max_seq < READ_ONCE(lrugen->max_seq)); - - return max_seq < READ_ONCE(lrugen->max_seq); + return false; } VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); @@ -4527,8 +4526,6 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) VM_WARN_ON_ONCE(!current_is_kswapd()); - sc->last_reclaimed = sc->nr_reclaimed; - /* check the order to exclude compaction-induced reclaim */ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) return; @@ -5117,8 +5114,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, - bool can_swap) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) { unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -5136,10 +5132,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * if (sc->priority == DEF_PRIORITY) return nr_to_scan; - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false); - /* skip this lruvec as it's low on cold folios */ - return 0; + return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? 
-1 : 0; } static unsigned long get_nr_to_reclaim(struct scan_control *sc) @@ -5148,29 +5142,18 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc) if (!global_reclaim(sc)) return -1; - /* discount the previous progress for kswapd */ - if (current_is_kswapd()) - return sc->nr_to_reclaim + sc->last_reclaimed; - return max(sc->nr_to_reclaim, compact_gap(sc->order)); } -static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - struct blk_plug plug; + long nr_to_scan; unsigned long scanned = 0; unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); - lru_add_drain(); - - blk_start_plug(&plug); - - set_mm_walk(lruvec_pgdat(lruvec)); - while (true) { int delta; int swappiness; - unsigned long nr_to_scan; if (sc->may_swap) swappiness = get_swappiness(lruvec, sc); @@ -5180,7 +5163,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc swappiness = 0; nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); - if (!nr_to_scan) + if (nr_to_scan <= 0) break; delta = evict_folios(lruvec, sc, swappiness); @@ -5197,11 +5180,252 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc cond_resched(); } + /* whether try_to_inc_max_seq() was successful */ + return nr_to_scan < 0; +} + +static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) +{ + bool success; + unsigned long scanned = sc->nr_scanned; + unsigned long reclaimed = sc->nr_reclaimed; + int seg = lru_gen_memcg_seg(lruvec); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + /* see the comment on MEMCG_NR_GENS */ + if (!lruvec_is_sizable(lruvec, sc)) + return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; + + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(NULL, memcg)) + return MEMCG_LRU_YOUNG; + + if (mem_cgroup_below_low(NULL, memcg)) { + /* see the comment on MEMCG_NR_GENS */ + if (seg != MEMCG_LRU_TAIL) + return MEMCG_LRU_TAIL; + + memcg_memory_event(memcg, MEMCG_LOW); + } + + success = try_to_shrink_lruvec(lruvec, sc); + + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; + current->reclaim_state->reclaimed_slab = 0; + + return success ? 
MEMCG_LRU_YOUNG : 0; +} + +#ifdef CONFIG_MEMCG + +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ + int gen; + int bin; + int first_bin; + struct lruvec *lruvec; + struct lru_gen_folio *lrugen; + const struct hlist_nulls_node *pos; + int op = 0; + struct mem_cgroup *memcg = NULL; + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + + bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); +restart: + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); + + rcu_read_lock(); + + hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); + + lruvec = container_of(lrugen, struct lruvec, lrugen); + memcg = lruvec_memcg(lruvec); + + if (!mem_cgroup_tryget(memcg)) { + op = 0; + memcg = NULL; + continue; + } + + rcu_read_unlock(); + + op = shrink_one(lruvec, sc); + + if (sc->nr_reclaimed >= nr_to_reclaim) + goto success; + + rcu_read_lock(); + } + + rcu_read_unlock(); + + /* restart if raced with lru_gen_rotate_memcg() */ + if (gen != get_nulls_value(pos)) + goto restart; + + /* try the rest of the bins of the current generation */ + bin = get_memcg_bin(bin + 1); + if (bin != first_bin) + goto restart; +success: + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + + VM_WARN_ON_ONCE(global_reclaim(sc)); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(lruvec_pgdat(lruvec)); + + if (try_to_shrink_lruvec(lruvec, sc)) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); + clear_mm_walk(); blk_finish_plug(&plug); } +#else /* !CONFIG_MEMCG */ + +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ + BUILD_BUG(); +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + BUILD_BUG(); +} + +#endif + +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) +{ + int priority; + unsigned long reclaimable; + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + + if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) + return; + /* + * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> + * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the + * estimated reclaimed_to_scanned_ratio = inactive / total. 
+ */ + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); + if (get_swappiness(lruvec, sc)) + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); + + reclaimable /= MEMCG_NR_GENS; + + /* round down reclaimable and round up sc->nr_to_reclaim */ + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); + + sc->priority = clamp(priority, 0, DEF_PRIORITY); +} + +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct blk_plug plug; + unsigned long reclaimed = sc->nr_reclaimed; + + VM_WARN_ON_ONCE(!global_reclaim(sc)); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(pgdat); + + set_initial_priority(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed = 0; + + if (mem_cgroup_disabled()) + shrink_one(&pgdat->__lruvec, sc); + else + shrink_many(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed += reclaimed; + + clear_mm_walk(); + + blk_finish_plug(&plug); + + /* kswapd should never fail */ + pgdat->kswapd_failures = 0; +} + +#ifdef CONFIG_MEMCG +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ + int seg; + int old, new; + int bin = get_random_u32_below(MEMCG_NR_BINS); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + seg = 0; + new = old = lruvec->lrugen.gen; + + /* see the comment on MEMCG_NR_GENS */ + if (op == MEMCG_LRU_HEAD) + seg = MEMCG_LRU_HEAD; + else if (op == MEMCG_LRU_TAIL) + seg = MEMCG_LRU_TAIL; + else if (op == MEMCG_LRU_OLD) + new = get_memcg_gen(pgdat->memcg_lru.seq); + else if (op == MEMCG_LRU_YOUNG) + new = get_memcg_gen(pgdat->memcg_lru.seq + 1); + else + VM_WARN_ON_ONCE(true); + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + else + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + + pgdat->memcg_lru.nr_memcgs[old]--; + pgdat->memcg_lru.nr_memcgs[new]++; + + lruvec->lrugen.gen = new; + WRITE_ONCE(lruvec->lrugen.seg, seg); + + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); +} +#endif + /****************************************************************************** * state change ******************************************************************************/ @@ -5655,11 +5879,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (!mem_cgroup_disabled()) { rcu_read_lock(); + memcg = mem_cgroup_from_id(memcg_id); -#ifdef CONFIG_MEMCG - if (memcg && !css_tryget(&memcg->css)) + if (!mem_cgroup_tryget(memcg)) memcg = NULL; -#endif + rcu_read_unlock(); if (!memcg) @@ -5807,6 +6031,19 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) } #ifdef CONFIG_MEMCG + +void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ + int i, j; + + spin_lock_init(&pgdat->memcg_lru.lock); + + for (i = 0; i < MEMCG_NR_GENS; i++) { + for (j = 0; j < MEMCG_NR_BINS; j++) + INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); + } +} + void lru_gen_init_memcg(struct mem_cgroup *memcg) { INIT_LIST_HEAD(&memcg->mm_list.fifo); @@ -5830,7 +6067,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) } } } -#endif + +void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + int bin = get_random_u32_below(MEMCG_NR_BINS); + + for_each_node(nid) { + 
struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = get_memcg_gen(pgdat->memcg_lru.seq); + + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); + pgdat->memcg_lru.nr_memcgs[gen]++; + + lruvec->lrugen.gen = gen; + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); + } +} + +void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = lruvec->lrugen.gen; + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + pgdat->memcg_lru.nr_memcgs[gen]--; + + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +#endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) { @@ -5857,6 +6156,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc { } +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) @@ -5870,7 +6173,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; - if (lru_gen_enabled()) { + if (lru_gen_enabled() && !global_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); return; } @@ -6113,6 +6416,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; + if (lru_gen_enabled() && global_reclaim(sc)) { + lru_gen_shrink_node(pgdat, sc); + return; + } + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: From e9d4e1ee788097484606c32122f146d802a9c5fb Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:05 -0700 Subject: [PATCH 078/505] mm: multi-gen LRU: clarify scan_control flags Among the flags in scan_control: 1. sc->may_swap, which indicates swap constraint due to memsw.max, is supported as usual. 2. sc->proactive, which indicates reclaim by memory.reclaim, may not opportunistically skip the aging path, since it is considered less latency sensitive. 3. !(sc->gfp_mask & __GFP_IO), which indicates IO constraint, lowers swappiness to prioritize file LRU, since clean file folios are more likely to exist. 4. sc->may_writepage and sc->may_unmap, which indicates opportunistic reclaim, are rejected, since unmapped clean folios are already prioritized. Scanning for more of them is likely futile and can cause high reclaim latency when there is a large number of memcgs. The rest are handled by the existing code. 
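For illustration, the flag handling described above boils down to roughly the following sketch. It is condensed from the get_swappiness() and try_to_shrink_lruvec() hunks below; the example_ name and the use of mem_cgroup_swappiness() as the base value are assumptions of the sketch, not the exact kernel code.

	/* Condensed sketch of the policy described above; see the hunks below
	 * for the real logic. */
	static int example_swappiness(struct lruvec *lruvec, struct scan_control *sc)
	{
		int swappiness;

		/* 1. swap constrained (e.g. memsw.max): do not scan anon */
		if (!sc->may_swap)
			return 0;

		/* assumed base value for the sketch */
		swappiness = mem_cgroup_swappiness(lruvec_memcg(lruvec));

		/* 3. no __GFP_IO: prefer file LRU, clean file folios likely exist */
		if (swappiness && !(sc->gfp_mask & __GFP_IO))
			swappiness = 1;

		return swappiness;
	}
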
Link: https://lkml.kernel.org/r/20221222041905.2431096-8-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 56 ++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 178465a503db..2964652d1aa8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3210,6 +3210,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); + if (!sc->may_swap) + return 0; + if (!can_demote(pgdat->node_id, sc) && mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; @@ -4236,7 +4239,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ } while (err == -EAGAIN); } -static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) { struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; @@ -4244,7 +4247,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) VM_WARN_ON_ONCE(walk); walk = &pgdat->mm_walk; - } else if (!pgdat && !walk) { + } else if (!walk && force_alloc) { VM_WARN_ON_ONCE(current_is_kswapd()); walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -4430,7 +4433,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, goto done; } - walk = set_mm_walk(NULL); + walk = set_mm_walk(NULL, true); if (!walk) { success = iterate_mm_list_nowalk(lruvec, max_seq); goto done; @@ -4499,8 +4502,6 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); - VM_WARN_ON_ONCE(sc->memcg_low_reclaim); - /* see the comment on lru_gen_folio */ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); @@ -4756,12 +4757,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca { bool success; - /* unmapping inhibited */ - if (!sc->may_unmap && folio_mapped(folio)) - return false; - /* swapping inhibited */ - if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) return false; @@ -4858,9 +4855,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, __count_vm_events(PGSCAN_ANON + type, isolated); /* - * There might not be eligible pages due to reclaim_idx, may_unmap and - * may_writepage. Check the remaining to prevent livelock if it's not - * making progress. + * There might not be eligible folios due to reclaim_idx. Check the + * remaining to prevent livelock if it's not making progress. */ return isolated || !remaining ? 
scanned : 0; } @@ -5120,9 +5116,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); - if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || - (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && - !sc->memcg_low_reclaim)) + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return 0; if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) @@ -5150,17 +5144,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) long nr_to_scan; unsigned long scanned = 0; unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + int swappiness = get_swappiness(lruvec, sc); + + /* clean file folios are more likely to exist */ + if (swappiness && !(sc->gfp_mask & __GFP_IO)) + swappiness = 1; while (true) { int delta; - int swappiness; - - if (sc->may_swap) - swappiness = get_swappiness(lruvec, sc); - else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) - swappiness = 1; - else - swappiness = 0; nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); if (nr_to_scan <= 0) @@ -5291,12 +5282,13 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc struct blk_plug plug; VM_WARN_ON_ONCE(global_reclaim(sc)); + VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); lru_add_drain(); blk_start_plug(&plug); - set_mm_walk(lruvec_pgdat(lruvec)); + set_mm_walk(NULL, sc->proactive); if (try_to_shrink_lruvec(lruvec, sc)) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); @@ -5352,11 +5344,19 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * VM_WARN_ON_ONCE(!global_reclaim(sc)); + /* + * Unmapped clean folios are already prioritized. Scanning for more of + * them is likely futile and can cause high reclaim latency when there + * is a large number of memcgs. + */ + if (!sc->may_writepage || !sc->may_unmap) + goto done; + lru_add_drain(); blk_start_plug(&plug); - set_mm_walk(pgdat); + set_mm_walk(pgdat, sc->proactive); set_initial_priority(pgdat, sc); @@ -5374,7 +5374,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * clear_mm_walk(); blk_finish_plug(&plug); - +done: /* kswapd should never fail */ pgdat->kswapd_failures = 0; } @@ -5943,7 +5943,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, set_task_reclaim_state(current, &sc.reclaim_state); flags = memalloc_noreclaim_save(); blk_start_plug(&plug); - if (!set_mm_walk(NULL)) { + if (!set_mm_walk(NULL, true)) { err = -ENOMEM; goto done; } From f386e9314025ea99dae639ed2032560a92081430 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:06 -0700 Subject: [PATCH 079/505] mm: multi-gen LRU: simplify arch_has_hw_pte_young() check Scanning page tables when hardware does not set the accessed bit has no real use cases. Link: https://lkml.kernel.org/r/20221222041905.2431096-9-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2964652d1aa8..7c3fd900a89d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4428,7 +4428,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, * handful of PTEs. Spreading the work out over a period of time usually * is less efficient, but it avoids bursty page faults. 
*/ - if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { success = iterate_mm_list_nowalk(lruvec, max_seq); goto done; } From a9af8e6bb3e5de8ea9d29c1d318bcfbc5667c939 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Fri, 23 Dec 2022 10:50:24 +0800 Subject: [PATCH 080/505] selftests/vm: ksm_functional_tests: fix a typo in comment Fix a typo of "comaring" which should be "comparing". Link: https://lkml.kernel.org/r/202212231050245952617@zte.com.cn Signed-off-by: Xu Panda Signed-off-by: xu xin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/ksm_functional_tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c index b11b7e5115dc..d8b5b4930412 100644 --- a/tools/testing/selftests/vm/ksm_functional_tests.c +++ b/tools/testing/selftests/vm/ksm_functional_tests.c @@ -37,7 +37,7 @@ static bool range_maps_duplicates(char *addr, unsigned long size) /* * There is no easy way to check if there are KSM pages mapped into * this range. We only check that the range does not map the same PFN - * twice by comaring each pair of mapped pages. + * twice by comparing each pair of mapped pages. */ for (offs_a = 0; offs_a < size; offs_a += pagesize) { pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); From 931298e103c228c4ce6d13e7b5781aeaaff37ac7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Dec 2022 16:56:15 +0100 Subject: [PATCH 081/505] mm/userfaultfd: rely on vma->vm_page_prot in uffd_wp_range() Patch series "mm: uffd-wp + change_protection() cleanups". Cleanup page protection handling in uffd-wp when calling change_protection() and improve unprotecting uffd=wp in private mappings, trying to set PTEs writable again if possible just like we do during mprotect() when upgrading write permissions. Make the change_protection() interface harder to get wrong :) I consider both pages primarily cleanups, although patch #1 fixes a corner case with uffd-wp and softdirty tracking for shmem. @Peter, please let me know if we should flag patch #1 as pure cleanup -- I have no idea how important softdirty tracking on shmem is. This patch (of 2): uffd_wp_range() currently calculates page protection manually using vm_get_page_prot(). This will ignore any other reason for active writenotify: one mechanism applicable to shmem is softdirty tracking. For example, the following sequence 1) Write to mapped shmem page 2) Clear softdirty 3) Register uffd-wp covering the mapped page 4) Unregister uffd-wp covering the mapped page 5) Write to page again will not set the modified page softdirty, because uffd_wp_range() will ignore that writenotify is required for softdirty tracking and simply map the page writable again using change_protection(). Similarly, instead of unregistering, protecting followed by un-protecting the page using uffd-wp would result in the same situation. Now that we enable writenotify whenever enabling uffd-wp on a VMA, vma->vm_page_prot will already properly reflect our requirements: the default is to write-protect all PTEs. However, for shared mappings we would now not remap the PTEs writable if possible when unprotecting, just like for private mappings (COW). To compensate, set MM_CP_TRY_CHANGE_WRITABLE just like mprotect() does to try mapping individual PTEs writable. 
For private mappings, this change implies that we will now always try setting PTEs writable when un-protecting, just like when upgrading write permissions using mprotect(), which is an improvement. For shared mappings, we will only set PTEs writable if can_change_pte_writable()/can_change_pmd_writable() indicates that it's ok. For ordinary shmem, this will be the case when PTEs are dirty, which should usually be the case -- otherwise we could special-case shmem in can_change_pte_writable()/can_change_pmd_writable() easily, because shmem itself doesn't require writenotify. Note that hugetlb does not yet implement MM_CP_TRY_CHANGE_WRITABLE, so we won't try setting PTEs writable when unprotecting or when unregistering uffd-wp. This can be added later on top by implementing MM_CP_TRY_CHANGE_WRITABLE. While commit ffd05793963a ("userfaultfd: wp: support write protection for userfault vma range") introduced that code, it should only be applicable to uffd-wp on shared mappings -- shmem (hugetlb does not support softdirty tracking). I don't think this corner cases justifies to cc stable. Let's just handle it correctly and prepare for change_protection() cleanups. [david@redhat.com: o need for additional harmless checks if we're wr-protecting either way] Link: https://lkml.kernel.org/r/71412742-a71f-9c74-865f-773ad83db7a5@redhat.com Link: https://lkml.kernel.org/r/20221223155616.297723-1-david@redhat.com Link: https://lkml.kernel.org/r/20221223155616.297723-2-david@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Nadav Amit Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index f8d31b82aceb..46771362550f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -713,17 +713,25 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { + unsigned int mm_cp_flags; struct mmu_gather tlb; - pgprot_t newprot; if (enable_wp) - newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); + mm_cp_flags = MM_CP_UFFD_WP; else - newprot = vm_get_page_prot(dst_vma->vm_flags); + mm_cp_flags = MM_CP_UFFD_WP_RESOLVE; + /* + * vma->vm_page_prot already reflects that uffd-wp is enabled for this + * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed + * to be write-protected as default whenever protection changes. + * Try upgrading write permissions manually. + */ + if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) + mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, newprot, - enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); + change_protection(&tlb, dst_vma, start, start + len, vma->vm_page_prot, + mm_cp_flags); tlb_finish_mmu(&tlb); } From 1ef488edd6c4d447784710974f049628c2890481 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Dec 2022 16:56:16 +0100 Subject: [PATCH 082/505] mm/mprotect: drop pgprot_t parameter from change_protection() Being able to provide a custom protection opens the door for inconsistencies and BUGs: for example, accidentally allowing for more permissions than desired by other mechanisms (e.g., softdirty tracking). vma->vm_page_prot should be the single source of truth. 
Only PROT_NUMA is special: there is no way we can erroneously allow for more permissions when removing all permissions. Special-case using the MM_CP_PROT_NUMA flag. [david@redhat.com: PAGE_NONE might not be defined without CONFIG_NUMA_BALANCING] Link: https://lkml.kernel.org/r/5084ff1c-ebb3-f918-6a60-bacabf550a88@redhat.com Link: https://lkml.kernel.org/r/20221223155616.297723-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Nadav Amit Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +-- mm/mempolicy.c | 3 +-- mm/mprotect.c | 18 +++++++++++++++--- mm/userfaultfd.c | 3 +-- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d68579bf8484..329ed67edd76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2134,8 +2134,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - unsigned long cp_flags); + unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index becf41e10076..d3558248a0f0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -635,8 +635,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, tlb_gather_mmu(&tlb, vma->vm_mm); - nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE, - MM_CP_PROT_NUMA); + nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); diff --git a/mm/mprotect.c b/mm/mprotect.c index bf8fa0af5a15..71358e45a742 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -507,13 +507,25 @@ static unsigned long change_protection_range(struct mmu_gather *tlb, unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - unsigned long cp_flags) + unsigned long end, unsigned long cp_flags) { + pgprot_t newprot = vma->vm_page_prot; unsigned long pages; BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); +#ifdef CONFIG_NUMA_BALANCING + /* + * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking) + * are expected to reflect their requirements via VMA flags such that + * vma_set_page_prot() will adjust vma->vm_page_prot accordingly. 
+ */ + if (cp_flags & MM_CP_PROT_NUMA) + newprot = PAGE_NONE; +#else + WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA); +#endif + if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot, cp_flags); @@ -642,7 +654,7 @@ success: mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); - change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags); + change_protection(tlb, vma, start, end, mm_cp_flags); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 46771362550f..65ad172add27 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -730,8 +730,7 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, vma->vm_page_prot, - mm_cp_flags); + change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); tlb_finish_mmu(&tlb); } From 3783e1721b650588938d28e4a084a1c9748361c8 Mon Sep 17 00:00:00 2001 From: Kele Huang Date: Sat, 24 Dec 2022 01:02:33 -0500 Subject: [PATCH 083/505] mm: fix comment of page table counter Commit af5b0f6a09e42 ("mm: consolidate page table accounting") consolidates page table accounting to a single counter in struct mm_struct {} as mm->pgtables_bytes. So the meaning of this counter should be the size of all page tables now. Link: https://lkml.kernel.org/r/20221224060233.417827-1-kele.huang@columbia.edu Signed-off-by: Kele Huang Cc: Arnd Bergmann Cc: Colin Cross Cc: David Hildenbrand Cc: Hugh Dickins Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Pasha Tatashin Cc: Peter Xu Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1118e381fcdc..10b6eb311ede 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -647,7 +647,7 @@ struct mm_struct { atomic_t mm_count; #ifdef CONFIG_MMU - atomic_long_t pgtables_bytes; /* PTE page table pages */ + atomic_long_t pgtables_bytes; /* size of all page tables */ #endif int map_count; /* number of VMAs */ From 01b5022f0a8a2911bb8f2bc3f0c9b9b2c21c3316 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 28 Dec 2022 17:59:42 +0000 Subject: [PATCH 084/505] mm/page_reporting: replace rcu_access_pointer() with rcu_dereference_protected() Page reporting fetches pr_dev_info using rcu_access_pointer(), which is for safely fetching a pointer that will not be dereferenced but could be concurrently updated. The code indeed does not dereference pr_dev_info after fetching it using rcu_access_pointer(), but it fetches the pointer while concurrent updates to the pointer are avoided by holding the update side lock, page_reporting_mutex. In this case, rcu_dereference_protected() should be used instead because it provides better readability and performance in some cases, as rcu_dereference_protected() avoids use of READ_ONCE(). Replace the rcu_access_pointer() calls with rcu_dereference_protected().
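A minimal, hypothetical illustration of the difference follows; only the RCU and lockdep helpers are real, the example_ names are made up and are not part of the patch.

	struct example_dev;

	static DEFINE_MUTEX(example_mutex);		/* update-side lock */
	static struct example_dev __rcu *example_ptr;	/* RCU-protected pointer */

	static bool example_registered(void)
	{
		/*
		 * The caller holds example_mutex, so the pointer cannot change
		 * under us: rcu_dereference_protected() documents that and
		 * avoids the READ_ONCE() that rcu_access_pointer() implies.
		 */
		return rcu_dereference_protected(example_ptr,
				lockdep_is_held(&example_mutex)) != NULL;
	}
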
Link: https://lkml.kernel.org/r/20221228175942.149491-1-sj@kernel.org Fixes: 36e66c554b5c ("mm: introduce Reported pages") Signed-off-by: SeongJae Park Cc: Alexander Duyck Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/page_reporting.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 79a8554f024c..c65813a9dc78 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -356,7 +356,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) mutex_lock(&page_reporting_mutex); /* nothing to do if already in use */ - if (rcu_access_pointer(pr_dev_info)) { + if (rcu_dereference_protected(pr_dev_info, + lockdep_is_held(&page_reporting_mutex))) { err = -EBUSY; goto err_out; } @@ -401,7 +402,8 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev) { mutex_lock(&page_reporting_mutex); - if (rcu_access_pointer(pr_dev_info) == prdev) { + if (prdev == rcu_dereference_protected(pr_dev_info, + lockdep_is_held(&page_reporting_mutex))) { /* Disable page reporting notification */ RCU_INIT_POINTER(pr_dev_info, NULL); synchronize_rcu(); From 81e506bec9be1eceaf5a2c654e28ba5176ef48d8 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Fri, 23 Dec 2022 21:52:07 +0800 Subject: [PATCH 085/505] mm/thp: check and bail out if page in deferred queue already Kernel build regression with LLVM was reported here: https://lore.kernel.org/all/Y1GCYXGtEVZbcv%2F5@dev-arch.thelio-3990X/ with commit f35b5d7d676e ("mm: align larger anonymous mappings on THP boundaries"). And the commit f35b5d7d676e was reverted. It turned out the regression is related with madvise(MADV_DONTNEED) was used by ld.lld. But with none PMD_SIZE aligned parameter len. trace-bpfcc captured: 531607 531732 ld.lld do_madvise.part.0 start: 0x7feca9000000, len: 0x7fb000, behavior: 0x4 531607 531793 ld.lld do_madvise.part.0 start: 0x7fec86a00000, len: 0x7fb000, behavior: 0x4 If the underneath physical page is THP, the madvise(MADV_DONTNEED) can trigger split_queue_lock contention raised significantly. perf showed following data: 14.85% 0.00% ld.lld [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe 11.52% entry_SYSCALL_64_after_hwframe do_syscall_64 __x64_sys_madvise do_madvise.part.0 zap_page_range unmap_single_vma unmap_page_range page_remove_rmap deferred_split_huge_page __lock_text_start native_queued_spin_lock_slowpath If THP can't be removed from rmap as whole THP, partial THP will be removed from rmap by removing sub-pages from rmap. Even the THP head page is added to deferred queue already, the split_queue_lock will be acquired and check whether the THP head page is in the queue already. Thus, the contention of split_queue_lock is raised. Before acquire split_queue_lock, check and bail out early if the THP head page is in the queue already. The checking without holding split_queue_lock could race with deferred_split_scan, but it doesn't impact the correctness here. 
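The fix is an instance of the usual unlocked-precheck-then-recheck idiom, sketched below with illustrative types and names rather than the real THP code.

	struct example_queue {
		spinlock_t	lock;
		struct list_head head;
	};

	static void example_enqueue_once(struct example_queue *q, struct list_head *node)
	{
		unsigned long flags;

		/* unlocked fast path: already queued, do not touch the lock */
		if (!list_empty(node))
			return;

		spin_lock_irqsave(&q->lock, flags);
		/* re-check under the lock; the fast path may have raced */
		if (list_empty(node))
			list_add_tail(node, &q->head);
		spin_unlock_irqrestore(&q->lock, flags);
	}

A race with a concurrent remover (here, deferred_split_scan()) only costs a trip through the slow path, which is why the unlocked check does not affect correctness.
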
Test result of building kernel with ld.lld: commit 7b5a0b664ebe (parent commit of f35b5d7d676e): time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all 6:07.99 real, 26367.77 user, 5063.35 sys commit f35b5d7d676e: time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all 7:22.15 real, 26235.03 user, 12504.55 sys commit f35b5d7d676e with the fixing patch: time -f "\t%E real,\t%U user,\t%S sys" make LD=ld.lld -skj96 allmodconfig all 6:08.49 real, 26520.15 user, 5047.91 sys Link: https://lkml.kernel.org/r/20221223135207.2275317-1-fengwei.yin@intel.com Signed-off-by: Yin Fengwei Tested-by: Nathan Chancellor Acked-by: David Rientjes Reviewed-by: "Huang, Ying" Cc: Feng Tang Cc: Matthew Wilcox Cc: Rik van Riel Cc: Xing Zhengjun Cc: Yang Shi Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 266c4b557946..0ea6510a3be7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2835,6 +2835,9 @@ void deferred_split_huge_page(struct page *page) if (PageSwapCache(page)) return; + if (!list_empty(page_deferred_list(page))) + return; + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); From 5b68de67037168f826d6fe434d03b5876aec4cb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:26 -1000 Subject: [PATCH 086/505] fs: remove an outdated comment on mpage_writepages Patch series "remove generic_writepages" This series removes generic_writepages by open coding the current functionality in the three remaining callers. Besides removing some code the main benefit is that one of the few remaining ->writepage callers from outside the core page cache code go away. This patch (of 6): mpage_writepages doesn't do any of the page locking itself, so remove and outdated comment on the locking pattern there. Link: https://lkml.kernel.org/r/20221229161031.391878-1-hch@lst.de Link: https://lkml.kernel.org/r/20221229161031.391878-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/mpage.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index db59cbf6affc..d36a95473f77 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -641,14 +641,6 @@ out: * * This is a library function, which implements the writepages() * address_space_operation. - * - * If a page is already under I/O, generic_writepages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. */ int mpage_writepages(struct address_space *mapping, From d4428bad14dd1509d7a1176dba69a01d67c0b86d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:27 -1000 Subject: [PATCH 087/505] ntfs3: stop using generic_writepages Open code the resident inode handling in ntfs_writepages by directly using write_cache_pages to prepare removing the ->writepage handler in ntfs3. 
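For reference, the general shape of a write_cache_pages() based ->writepages() implementation is roughly the following; the my_ names are hypothetical, and the actual ntfs3 conversion is in the hunk below.

	/* writepage_t callback: write one locked, dirty page; "data" is the
	 * opaque cookie passed to write_cache_pages(). */
	static int my_writepage(struct page *page, struct writeback_control *wbc,
				void *data)
	{
		struct address_space *mapping = data;
		int ret = mapping->a_ops->writepage(page, wbc);

		mapping_set_error(mapping, ret);
		return ret;
	}

	static int my_writepages(struct address_space *mapping,
				 struct writeback_control *wbc)
	{
		return write_cache_pages(mapping, wbc, my_writepage, mapping);
	}
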
Link: https://lkml.kernel.org/r/20221229161031.391878-3-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ntfs3/inode.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 20b953871574..b6dad2da5950 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -852,12 +852,29 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, ntfs_get_block, wbc); } +static int ntfs_resident_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct address_space *mapping = data; + struct ntfs_inode *ni = ntfs_i(mapping->host); + int ret; + + ni_lock(ni); + ret = attr_data_write_resident(ni, page); + ni_unlock(ni); + + if (ret != E_NTFS_NONRESIDENT) + unlock_page(page); + mapping_set_error(mapping, ret); + return ret; +} + static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - /* Redirect call to 'ntfs_writepage' for resident files. */ if (is_resident(ntfs_i(mapping->host))) - return generic_writepages(mapping, wbc); + return write_cache_pages(mapping, wbc, ntfs_resident_writepage, + mapping); return mpage_writepages(mapping, wbc, ntfs_get_block); } From 25a89826f270ddbf76dca7d64e4f8a8dccda3d1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:28 -1000 Subject: [PATCH 088/505] ntfs3: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or as a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation.
Link: https://lkml.kernel.org/r/20221229161031.391878-4-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ntfs3/inode.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index b6dad2da5950..6b50b6e32378 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -832,26 +832,6 @@ out: return err; } -static int ntfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct ntfs_inode *ni = ntfs_i(inode); - int err; - - if (is_resident(ni)) { - ni_lock(ni); - err = attr_data_write_resident(ni, page); - ni_unlock(ni); - if (err != E_NTFS_NONRESIDENT) { - unlock_page(page); - return err; - } - } - - return block_write_full_page(page, ntfs_get_block, wbc); -} - static int ntfs_resident_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -2083,13 +2063,13 @@ const struct inode_operations ntfs_link_inode_operations = { const struct address_space_operations ntfs_aops = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, - .writepage = ntfs_writepage, .writepages = ntfs_writepages, .write_begin = ntfs_write_begin, .write_end = ntfs_write_end, .direct_IO = ntfs_direct_IO, .bmap = ntfs_bmap, .dirty_folio = block_dirty_folio, + .migrate_folio = buffer_migrate_folio, .invalidate_folio = block_invalidate_folio, }; From cff61bbc717bfddd6e433fe142b8e70b21546a1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:29 -1000 Subject: [PATCH 089/505] jbd2,ocfs2: move jbd2_journal_submit_inode_data_buffers to ocfs2 jbd2_journal_submit_inode_data_buffers is only used by ocfs2, so move it there to prepare for removing generic_writepages. Link: https://lkml.kernel.org/r/20221229161031.391878-5-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/jbd2/commit.c | 25 ------------------------- fs/jbd2/journal.c | 1 - fs/ocfs2/journal.c | 16 +++++++++++++++- include/linux/jbd2.h | 2 -- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 96a1ebc6342d..b33155dd7001 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -177,31 +177,6 @@ static int journal_wait_on_commit_record(journal_t *journal, return ret; } -/* - * write the filemap data using writepage() address_space_operations. - * We don't do block allocation here even for delalloc. We don't - * use writepages() because with delayed allocation we may be doing - * block allocation in writepages(). - */ -int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) -{ - struct address_space *mapping = jinode->i_vfs_inode->i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = mapping->nrpages * 2, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; - - /* - * submit the inode data buffers. We use writepage - * instead of writepages. Because writepages can do - * block allocation with delalloc. We need to write - * only allocated blocks here. 
- */ - return generic_writepages(mapping, &wbc); -} - /* Send all the data buffers related to an inode */ int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 4095fe91457f..e80c781731f8 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -89,7 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); -EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers); EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3fb98b4569a2..59f612684c51 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -841,6 +842,19 @@ bail: return status; } +static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + + return generic_writepages(mapping, &wbc); +} + int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) { int status = -1; @@ -910,7 +924,7 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) journal->j_journal = j_journal; journal->j_journal->j_submit_inode_data_buffers = - jbd2_journal_submit_inode_data_buffers; + ocfs2_journal_submit_inode_data_buffers; journal->j_journal->j_finish_inode_data_buffers = jbd2_journal_finish_inode_data_buffers; journal->j_inode = inode; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2170e0cc279d..5962072a4b19 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1570,8 +1570,6 @@ extern int jbd2_journal_inode_ranged_write(handle_t *handle, extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); -extern int jbd2_journal_submit_inode_data_buffers( - struct jbd2_inode *jinode); extern int jbd2_journal_finish_inode_data_buffers( struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, From 17c30ee6f2670804148f23b19b5de8308a02bd2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:30 -1000 Subject: [PATCH 090/505] ocfs2: use filemap_fdatawrite_wbc instead of generic_writepages filemap_fdatawrite_wbc is a fairly thin wrapper around do_writepages, and the big difference there is support for cgroup writeback, which is not supported by ocfs2, and the potential to use ->writepages instead of ->writepage, which ocfs2 does not currently implement but eventually should.
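A minimal usage sketch, assuming only the filemap_fdatawrite_wbc() signature visible in the hunk below; the example_ helper and its writeback_control settings are illustrative, not part of the patch.

	/* Kick off writeback for a byte range: the wrapper wires the wbc up
	 * for cgroup writeback and then goes through do_writepages(), which
	 * picks ->writepages() or falls back to ->writepage(). */
	static int example_fdatawrite_range(struct address_space *mapping,
					    loff_t start, loff_t end)
	{
		struct writeback_control wbc = {
			.sync_mode	= WB_SYNC_ALL,
			.nr_to_write	= LONG_MAX,
			.range_start	= start,
			.range_end	= end,
		};

		return filemap_fdatawrite_wbc(mapping, &wbc);
	}
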
Link: https://lkml.kernel.org/r/20221229161031.391878-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 59f612684c51..25d8072ccfce 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -852,7 +852,7 @@ static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) .range_end = jinode->i_dirty_end, }; - return generic_writepages(mapping, &wbc); + return filemap_fdatawrite_wbc(mapping, &wbc); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) From c2ca7a59a4199059556b57cfdf98fcf46039ca6b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Dec 2022 06:10:31 -1000 Subject: [PATCH 091/505] mm: remove generic_writepages Now that all external callers are gone, just fold it into do_writepages. Link: https://lkml.kernel.org/r/20221229161031.391878-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Joel Becker Cc: Joseph Qi Cc: Konstantin Komarov Cc: Mark Fasheh Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- include/linux/writeback.h | 2 -- mm/page-writeback.c | 53 +++++++++++---------------------------- 2 files changed, 15 insertions(+), 40 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 06f9291b6fd5..2554b71765e9 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -369,8 +369,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, void *data); -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); int write_cache_pages(struct address_space *mapping, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 41128ea9c997..337cafe9978c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2526,12 +2526,8 @@ continue_unlock: } EXPORT_SYMBOL(write_cache_pages); -/* - * Function used by generic_writepages to call the real writepage - * function and set the mapping flags on error - */ -static int __writepage(struct page *page, struct writeback_control *wbc, - void *data) +static int writepage_cb(struct page *page, struct writeback_control *wbc, + void *data) { struct address_space *mapping = data; int ret = mapping->a_ops->writepage(page, wbc); @@ -2539,34 +2535,6 @@ static int __writepage(struct page *page, struct writeback_control *wbc, return ret; } -/** - * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * - * This is a library function, which implements the writepages() - * address_space_operation. 
- * - * Return: %0 on success, negative error code otherwise - */ -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct blk_plug plug; - int ret; - - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) - return 0; - - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __writepage, mapping); - blk_finish_plug(&plug); - return ret; -} - -EXPORT_SYMBOL(generic_writepages); - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -2577,11 +2545,20 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { - if (mapping->a_ops->writepages) + if (mapping->a_ops->writepages) { ret = mapping->a_ops->writepages(mapping, wbc); - else - ret = generic_writepages(mapping, wbc); - if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) + } else if (mapping->a_ops->writepage) { + struct blk_plug plug; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, writepage_cb, + mapping); + blk_finish_plug(&plug); + } else { + /* deal with chardevs and other special files */ + ret = 0; + } + if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; /* From 630e7c5ee3399be509447035428994e2d88f12c1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 29 Dec 2022 20:25:03 +0800 Subject: [PATCH 092/505] mm: huge_memory: convert split_huge_pages_all() to use a folio Straightforwardly convert split_huge_pages_all() to use a folio. Link: https://lkml.kernel.org/r/20221229122503.149083-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Mike Kravetz Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/huge_memory.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0ea6510a3be7..7e68a36b4f7d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2935,6 +2935,7 @@ static void split_huge_pages_all(void) { struct zone *zone; struct page *page; + struct folio *folio; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; @@ -2947,24 +2948,32 @@ static void split_huge_pages_all(void) int nr_pages; page = pfn_to_online_page(pfn); - if (!page || !get_page_unless_zero(page)) + if (!page || PageTail(page)) + continue; + folio = page_folio(page); + if (!folio_try_get(folio)) continue; - if (zone != page_zone(page)) + if (unlikely(page_folio(page) != folio)) goto next; - if (!PageHead(page) || PageHuge(page) || !PageLRU(page)) + if (zone != folio_zone(folio)) + goto next; + + if (!folio_test_large(folio) + || folio_test_hugetlb(folio) + || !folio_test_lru(folio)) goto next; total++; - lock_page(page); - nr_pages = thp_nr_pages(page); - if (!split_huge_page(page)) + folio_lock(folio); + nr_pages = folio_nr_pages(folio); + if (!split_folio(folio)) split++; pfn += nr_pages - 1; - unlock_page(page); + folio_unlock(folio); next: - put_page(page); + folio_put(folio); cond_resched(); } } From 071acb3084c5e04db3ff09865e4030aefaa2ab92 Mon Sep 17 00:00:00 2001 From: JeongHyeon Lee Date: Fri, 23 Dec 2022 13:03:31 +0900 Subject: [PATCH 093/505] zram: fix typos in comments - The double `range` is duplicated in comment, remove one. 
- change `syfs` to `sysfs` Link: https://lkml.kernel.org/r/20221223040331.4194-1-jhs2.lee@samsung.com Signed-off-by: JeongHyeon Lee Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e290d6d97047..7becd5448791 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -190,7 +190,7 @@ static inline bool valid_io_request(struct zram *zram, end = start + (size >> SECTOR_SHIFT); bound = zram->disksize >> SECTOR_SHIFT; - /* out of range range */ + /* out of range */ if (unlikely(start >= bound || end > bound || start > end)) return false; @@ -2385,7 +2385,7 @@ static int zram_add(void) zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); - /* Actual capacity set using syfs (/sys/block/zram/disksize */ + /* Actual capacity set using sysfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); /* zram devices sort of resembles non-rotational disks */ blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); From becacb04fdd439d7d1f2a93739161706a2e3e947 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 30 Dec 2022 15:08:42 +0800 Subject: [PATCH 094/505] mm: memcg: add folio_memcg_check() Patch series "mm: convert page_idle/damon to use folios", v4. This patch (of 8): Convert page_memcg_check() into folio_memcg_check() and add a page_memcg_check() wrapper. The behaviour of page_memcg_check() is unchanged; tail pages always had a NULL ->memcg_data. Link: https://lkml.kernel.org/r/20221230070849.63358-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20221230070849.63358-2-wangkefeng.wang@huawei.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: SeongJae Park Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 40 +++++++++++++++++++++++++------------- mm/memcontrol.c | 6 +++--- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2e08b05bc6bf..26667bf16da5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -466,34 +466,34 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) } /* - * page_memcg_check - get the memory cgroup associated with a page - * @page: a pointer to the page struct + * folio_memcg_check - Get the memory cgroup associated with a folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function unlike page_memcg() can take any page - * as an argument. It has to be used in cases when it's not known if a page + * Returns a pointer to the memory cgroup associated with the folio, + * or NULL. This function unlike folio_memcg() can take any folio + * as an argument. It has to be used in cases when it's not known if a folio * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. 
* - * For a non-kmem page any of the following ensures page and memcg binding + * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * - * - the page lock + * - the folio lock * - LRU isolation - * - lock_page_memcg() + * - lock_folio_memcg() * - exclusive reference * - mem_cgroup_trylock_pages() * - * For a kmem page a caller should hold an rcu read lock to protect memcg - * associated with a kmem page from being released. + * For a kmem folio a caller should hold an rcu read lock to protect memcg + * associated with a kmem folio from being released. */ -static inline struct mem_cgroup *page_memcg_check(struct page *page) +static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { /* - * Because page->memcg_data might be changed asynchronously - * for slab pages, READ_ONCE() should be used here. + * Because folio->memcg_data might be changed asynchronously + * for slabs, READ_ONCE() should be used here. */ - unsigned long memcg_data = READ_ONCE(page->memcg_data); + unsigned long memcg_data = READ_ONCE(folio->memcg_data); if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; @@ -508,6 +508,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + if (PageTail(page)) + return NULL; + return folio_memcg_check((struct folio *)page); +} + static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { struct mem_cgroup *memcg; @@ -1170,6 +1177,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) return NULL; } +static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) +{ + return NULL; +} + static inline struct mem_cgroup *page_memcg_check(struct page *page) { return NULL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2758b67eb169..17965e558ab7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2952,13 +2952,13 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) } /* - * page_memcg_check() is used here, because in theory we can encounter + * folio_memcg_check() is used here, because in theory we can encounter * a folio where the slab flag has been cleared already, but * slab->memcg_data has not been freed yet - * page_memcg_check(page) will guarantee that a proper memory + * folio_memcg_check() will guarantee that a proper memory * cgroup pointer or NULL will be returned. */ - return page_memcg_check(folio_page(folio, 0)); + return folio_memcg_check(folio); } /* From 5acc17fd35e62780a14e4198deb2a6d1d57aa372 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:43 +0800 Subject: [PATCH 095/505] mm: page_idle: convert page idle to use a folio Firstly, make page_idle_get_page() return a folio, also rename it to page_idle_get_folio(), then, use it to convert page_idle_bitmap_read() and page_idle_bitmap_write() functions. 
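The pfn-to-folio lookup pattern introduced here repeats almost verbatim in the DAMON conversions later in this series, so the commented restatement below (the helper name is illustrative; the body mirrors the converted page_idle_get_folio() in the diff that follows) spells out why each step exists, in particular the re-check after folio_try_get():

static struct folio *pfn_to_lru_folio_get(unsigned long pfn)	/* illustrative name */
{
	struct page *page = pfn_to_online_page(pfn);
	struct folio *folio;

	/* Reject offline pfns and tail pages; callers want whole folios. */
	if (!page || PageTail(page))
		return NULL;

	folio = page_folio(page);
	/* Cheap LRU pre-check, then a reference that fails if the folio is being freed. */
	if (!folio_test_lru(folio) || !folio_try_get(folio))
		return NULL;

	/*
	 * Between page_folio() and folio_try_get() the folio may have been
	 * split, or freed and reused, so confirm the page still belongs to
	 * the folio we pinned and that it is still on the LRU.
	 */
	if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
		folio_put(folio);
		folio = NULL;
	}
	return folio;
}
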
Link: https://lkml.kernel.org/r/20221230070849.63358-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/page_idle.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/mm/page_idle.c b/mm/page_idle.c index bc08332a609c..41ea77f22011 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -31,19 +31,22 @@ * * This function tries to get a user memory page by pfn as described above. */ -static struct page *page_idle_get_page(unsigned long pfn) +static struct folio *page_idle_get_folio(unsigned long pfn) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; - if (!page || !PageLRU(page) || - !get_page_unless_zero(page)) + if (!page || PageTail(page)) return NULL; - if (unlikely(!PageLRU(page))) { - put_page(page); - page = NULL; + folio = page_folio(page); + if (!folio_test_lru(folio) || !folio_try_get(folio)) + return NULL; + if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) { + folio_put(folio); + folio = NULL; } - return page; + return folio; } static bool page_idle_clear_pte_refs_one(struct folio *folio, @@ -83,10 +86,8 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, return true; } -static void page_idle_clear_pte_refs(struct page *page) +static void page_idle_clear_pte_refs(struct folio *folio) { - struct folio *folio = page_folio(page); - /* * Since rwc.try_lock is unused, rwc is effectively immutable, so we * can make it static to save some cycles and stack. @@ -115,7 +116,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, loff_t pos, size_t count) { u64 *out = (u64 *)buf; - struct page *page; + struct folio *folio; unsigned long pfn, end_pfn; int bit; @@ -134,19 +135,19 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, bit = pfn % BITMAP_CHUNK_BITS; if (!bit) *out = 0ULL; - page = page_idle_get_page(pfn); - if (page) { - if (page_is_idle(page)) { + folio = page_idle_get_folio(pfn); + if (folio) { + if (folio_test_idle(folio)) { /* * The page might have been referenced via a * pte, in which case it is not idle. Clear * refs and recheck. 
*/ - page_idle_clear_pte_refs(page); - if (page_is_idle(page)) + page_idle_clear_pte_refs(folio); + if (folio_test_idle(folio)) *out |= 1ULL << bit; } - put_page(page); + folio_put(folio); } if (bit == BITMAP_CHUNK_BITS - 1) out++; @@ -160,7 +161,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, loff_t pos, size_t count) { const u64 *in = (u64 *)buf; - struct page *page; + struct folio *folio; unsigned long pfn, end_pfn; int bit; @@ -178,11 +179,11 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, for (; pfn < end_pfn; pfn++) { bit = pfn % BITMAP_CHUNK_BITS; if ((*in >> bit) & 1) { - page = page_idle_get_page(pfn); - if (page) { - page_idle_clear_pte_refs(page); - set_page_idle(page); - put_page(page); + folio = page_idle_get_folio(pfn); + if (folio) { + page_idle_clear_pte_refs(folio); + folio_set_idle(folio); + folio_put(folio); } } if (bit == BITMAP_CHUNK_BITS - 1) From 5e012bba019afa6aca74df19751783a47d16ebf7 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:44 +0800 Subject: [PATCH 096/505] mm/damon: introduce damon_get_folio() Introduce damon_get_folio(), and the temporary wrapper function damon_get_page(), which help us to convert damon related functions to use folios, and it will be dropped once the conversion is completed. Link: https://lkml.kernel.org/r/20221230070849.63358-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 18 +++++++++++------- mm/damon/ops-common.h | 9 ++++++++- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 75409601f934..1294a256a87c 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -16,21 +16,25 @@ * Get an online page for a pfn if it's in the LRU list. Otherwise, returns * NULL. * - * The body of this function is stolen from the 'page_idle_get_page()'. We + * The body of this function is stolen from the 'page_idle_get_folio()'. We * steal rather than reuse it because the code is quite simple. 
*/ -struct page *damon_get_page(unsigned long pfn) +struct folio *damon_get_folio(unsigned long pfn) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; - if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + if (!page || PageTail(page)) return NULL; - if (unlikely(!PageLRU(page))) { - put_page(page); - page = NULL; + folio = page_folio(page); + if (!folio_test_lru(folio) || !folio_try_get(folio)) + return NULL; + if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) { + folio_put(folio); + folio = NULL; } - return page; + return folio; } void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 8d82d3722204..65f290f0a9d6 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -7,7 +7,14 @@ #include -struct page *damon_get_page(unsigned long pfn); +struct folio *damon_get_folio(unsigned long pfn); +static inline struct page *damon_get_page(unsigned long pfn) +{ + struct folio *folio = damon_get_folio(pfn); + + /* when folio is NULL, return &(0->page) mean return NULL */ + return &folio->page; +} void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); From 70e314c9ab4faf810fc088503a37fb3b126d09e2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:45 +0800 Subject: [PATCH 097/505] mm/damon: convert damon_ptep/pmdp_mkold() to use a folio With damon_get_folio(), let's convert damon_ptep_mkold() and damon_pmdp_mkold() to use a folio. Link: https://lkml.kernel.org/r/20221230070849.63358-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 1294a256a87c..cc63cf953636 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -40,9 +40,9 @@ struct folio *damon_get_folio(unsigned long pfn) void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) { bool referenced = false; - struct page *page = damon_get_page(pte_pfn(*pte)); + struct folio *folio = damon_get_folio(pte_pfn(*pte)); - if (!page) + if (!folio) return; if (pte_young(*pte)) { @@ -56,19 +56,19 @@ void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) #endif /* CONFIG_MMU_NOTIFIER */ if (referenced) - set_page_young(page); + folio_set_young(folio); - set_page_idle(page); - put_page(page); + folio_set_idle(folio); + folio_put(folio); } void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool referenced = false; - struct page *page = damon_get_page(pmd_pfn(*pmd)); + struct folio *folio = damon_get_folio(pmd_pfn(*pmd)); - if (!page) + if (!folio) return; if (pmd_young(*pmd)) { @@ -82,10 +82,10 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) #endif /* CONFIG_MMU_NOTIFIER */ if (referenced) - set_page_young(page); + folio_set_young(folio); - set_page_idle(page); - put_page(page); + folio_set_idle(folio); + folio_put(folio); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ } From 07bb1fbaa2bbd2a387bcc1171d8aee1a96178b45 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:46 +0800 Subject: [PATCH 098/505] mm/damon/paddr: convert damon_pa_*() to use a folio 
With damon_get_folio(), let's convert all the damon_pa_*() to use a folio. Link: https://lkml.kernel.org/r/20221230070849.63358-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 58 ++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 6334c99e5152..99d4c357ef2b 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -33,17 +33,15 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, static void damon_pa_mkold(unsigned long paddr) { - struct folio *folio; - struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct rmap_walk_control rwc = { .rmap_one = __damon_pa_mkold, .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; - if (!page) + if (!folio) return; - folio = page_folio(page); if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { folio_set_idle(folio); @@ -122,8 +120,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) { - struct folio *folio; - struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct damon_pa_access_chk_result result = { .page_sz = PAGE_SIZE, .accessed = false, @@ -135,9 +132,8 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) }; bool need_lock; - if (!page) + if (!folio) return false; - folio = page_folio(page); if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { if (folio_test_idle(folio)) @@ -203,18 +199,18 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) } static bool __damos_pa_filter_out(struct damos_filter *filter, - struct page *page) + struct folio *folio) { bool matched = false; struct mem_cgroup *memcg; switch (filter->type) { case DAMOS_FILTER_TYPE_ANON: - matched = PageAnon(page); + matched = folio_test_anon(folio); break; case DAMOS_FILTER_TYPE_MEMCG: rcu_read_lock(); - memcg = page_memcg_check(page); + memcg = folio_memcg_check(folio); if (!memcg) matched = false; else @@ -231,12 +227,12 @@ static bool __damos_pa_filter_out(struct damos_filter *filter, /* * damos_pa_filter_out - Return true if the page should be filtered out. 
*/ -static bool damos_pa_filter_out(struct damos *scheme, struct page *page) +static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) { struct damos_filter *filter; damos_for_each_filter(filter, scheme) { - if (__damos_pa_filter_out(filter, page)) + if (__damos_pa_filter_out(filter, folio)) return true; } return false; @@ -245,33 +241,33 @@ static bool damos_pa_filter_out(struct damos *scheme, struct page *page) static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) { unsigned long addr, applied; - LIST_HEAD(page_list); + LIST_HEAD(folio_list); for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct page *page = damon_get_page(PHYS_PFN(addr)); + struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - if (!page) + if (!folio) continue; - if (damos_pa_filter_out(s, page)) { - put_page(page); + if (damos_pa_filter_out(s, folio)) { + folio_put(folio); continue; } - ClearPageReferenced(page); - test_and_clear_page_young(page); - if (isolate_lru_page(page)) { - put_page(page); + folio_clear_referenced(folio); + folio_test_clear_young(folio); + if (folio_isolate_lru(folio)) { + folio_put(folio); continue; } - if (PageUnevictable(page)) { - putback_lru_page(page); + if (folio_test_unevictable(folio)) { + folio_putback_lru(folio); } else { - list_add(&page->lru, &page_list); - put_page(page); + list_add(&folio->lru, &folio_list); + folio_put(folio); } } - applied = reclaim_pages(&page_list); + applied = reclaim_pages(&folio_list); cond_resched(); return applied * PAGE_SIZE; } @@ -282,14 +278,12 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( unsigned long addr, applied = 0; for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct page *page = damon_get_page(PHYS_PFN(addr)); - struct folio *folio; + struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - if (!page) + if (!folio) continue; - folio = page_folio(page); - if (damos_pa_filter_out(s, &folio->page)) { + if (damos_pa_filter_out(s, folio)) { folio_put(folio); continue; } From dc1b78665b37ec50fc142a7f1a8fa4f626cd6a58 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:47 +0800 Subject: [PATCH 099/505] mm/damon/vaddr: convert damon_young_pmd_entry() to use a folio With damon_get_folio(), let's convert damon_young_pmd_entry() to use a folio. 
Link: https://lkml.kernel.org/r/20221230070849.63358-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 15f03df66db6..29227b7a6032 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -431,7 +431,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, { pte_t *pte; spinlock_t *ptl; - struct page *page; + struct folio *folio; struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -446,16 +446,16 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, spin_unlock(ptl); goto regular_page; } - page = damon_get_page(pmd_pfn(*pmd)); - if (!page) + folio = damon_get_folio(pmd_pfn(*pmd)); + if (!folio) goto huge_out; - if (pmd_young(*pmd) || !page_is_idle(page) || + if (pmd_young(*pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { *priv->page_sz = HPAGE_PMD_SIZE; priv->young = true; } - put_page(page); + folio_put(folio); huge_out: spin_unlock(ptl); return 0; @@ -469,15 +469,15 @@ regular_page: pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte_present(*pte)) goto out; - page = damon_get_page(pte_pfn(*pte)); - if (!page) + folio = damon_get_folio(pte_pfn(*pte)); + if (!folio) goto out; - if (pte_young(*pte) || !page_is_idle(page) || + if (pte_young(*pte) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { *priv->page_sz = PAGE_SIZE; priv->young = true; } - put_page(page); + folio_put(folio); out: pte_unmap_unlock(pte, ptl); return 0; From 7824debb3d029e6a6252137fd10f3553cc8de22a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:48 +0800 Subject: [PATCH 100/505] mm/damon: remove unneeded damon_get_page() After all damon_get_page() callers are converted to damon_get_folio(), remove unneeded wrapper damon_get_page(). Link: https://lkml.kernel.org/r/20221230070849.63358-8-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/ops-common.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 65f290f0a9d6..14f4bc69f29b 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -8,13 +8,6 @@ #include struct folio *damon_get_folio(unsigned long pfn); -static inline struct page *damon_get_page(unsigned long pfn) -{ - struct folio *folio = damon_get_folio(pfn); - - /* when folio is NULL, return &(0->page) mean return NULL */ - return &folio->page; -} void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); From 6b7cea90c82e104c1151fec1f3ee216997fda652 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Dec 2022 15:08:49 +0800 Subject: [PATCH 101/505] mm/damon/vaddr: convert hugetlb related functions to use a folio Convert damon_hugetlb_mkold() and damon_young_hugetlb_entry() to use a folio. 
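One detail worth spelling out for the hugetlb case: unlike the pfn-based lookups earlier in the series there is no folio_try_get() here, because the caller walks a present huge pte under the huge pte lock, so the folio cannot disappear underneath it. The hypothetical helper below (illustration only, not part of the patch) shows why the old and new lookups are equivalent and why a plain folio_get() suffices:

static struct folio *hugetlb_pte_folio(pte_t entry)	/* illustrative only */
{
	struct page *page = pte_page(entry);              /* old style lookup */
	struct folio *folio = pfn_folio(pte_pfn(entry));  /* new style lookup */

	/* For a present hugetlb entry both resolve to the same folio. */
	VM_WARN_ON(page_folio(page) != folio);

	/*
	 * The entry is present and the caller holds the huge pte lock, so
	 * the folio is pinned by the mapping itself; an unconditional
	 * folio_get() is enough, with no try/recheck dance needed.
	 */
	folio_get(folio);
	return folio;
}
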
Link: https://lkml.kernel.org/r/20221230070849.63358-9-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 29227b7a6032..9d92c5eb3a1f 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -335,9 +335,9 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, { bool referenced = false; pte_t entry = huge_ptep_get(pte); - struct page *page = pte_page(entry); + struct folio *folio = pfn_folio(pte_pfn(entry)); - get_page(page); + folio_get(folio); if (pte_young(entry)) { referenced = true; @@ -352,10 +352,10 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, #endif /* CONFIG_MMU_NOTIFIER */ if (referenced) - set_page_young(page); + folio_set_young(folio); - set_page_idle(page); - put_page(page); + folio_set_idle(folio); + folio_put(folio); } static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, @@ -490,7 +490,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, { struct damon_young_walk_private *priv = walk->private; struct hstate *h = hstate_vma(walk->vma); - struct page *page; + struct folio *folio; spinlock_t *ptl; pte_t entry; @@ -499,16 +499,16 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, if (!pte_present(entry)) goto out; - page = pte_page(entry); - get_page(page); + folio = pfn_folio(pte_pfn(entry)); + folio_get(folio); - if (pte_young(entry) || !page_is_idle(page) || + if (pte_young(entry) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { *priv->page_sz = huge_page_size(h); priv->young = true; } - put_page(page); + folio_put(folio); out: spin_unlock(ptl); From a79390f5d6a78647fd70856bd42b22d994de0ba2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Jan 2023 17:52:06 -0500 Subject: [PATCH 102/505] mm/mprotect: use long for page accountings and retval Switch to use type "long" for page accountings and retval across the whole procedure of change_protection(). The change should have shrinked the possible maximum page number to be half comparing to previous (ULONG_MAX / 2), but it shouldn't overflow on any system either because the maximum possible pages touched by change protection should be ULONG_MAX / PAGE_SIZE. Two reasons to switch from "unsigned long" to "long": 1. It suites better on count_vm_numa_events(), whose 2nd parameter takes a long type. 2. It paves way for returning negative (error) values in the future. Currently the only caller that consumes this retval is change_prot_numa(), where the unsigned long was converted to an int. Since at it, touching up the numa code to also take a long, so it'll avoid any possible overflow too during the int-size convertion. 
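As a sanity check on the "it shouldn't overflow on any system" claim: the page count change_protection() can ever return is bounded by the address-space size divided by the page size, which always fits in a signed long. A hypothetical compile-time assertion (not part of the patch) captures the arithmetic:

/*
 * 64-bit, 4 KiB pages: ULONG_MAX / PAGE_SIZE == 2^52, LONG_MAX == 2^63 - 1.
 * 32-bit, 4 KiB pages: ULONG_MAX / PAGE_SIZE == 2^20, LONG_MAX == 2^31 - 1.
 * Either way the count fits, leaving the negative range free for errnos.
 */
static_assert(ULONG_MAX / PAGE_SIZE <= LONG_MAX);
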
Link: https://lkml.kernel.org/r/20230104225207.1066932-3-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Mike Kravetz Acked-by: James Houghton Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Muchun Song Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- include/linux/mm.h | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 26 +++++++++++++------------- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b6b10101bea7..e3aa336df900 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -248,7 +248,7 @@ void hugetlb_vma_lock_release(struct kref *kref); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); @@ -437,7 +437,7 @@ static inline void move_hugetlb_state(struct folio *old_folio, { } -static inline unsigned long hugetlb_change_protection( +static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 329ed67edd76..4ac5ea4b584c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2132,7 +2132,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma } bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); -extern unsigned long change_protection(struct mmu_gather *tlb, +extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a82f41024167..a0d6d0980064 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6615,7 +6615,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? 
i : err; } -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { @@ -6624,7 +6624,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); - unsigned long pages = 0, psize = huge_page_size(h); + long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d3558248a0f0..a86b8f15e2f0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -631,7 +631,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; - int nr_updated; + long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); diff --git a/mm/mprotect.c b/mm/mprotect.c index 71358e45a742..0af22ab59ea8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -80,13 +80,13 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, return pte_dirty(pte); } -static unsigned long change_pte_range(struct mmu_gather *tlb, +static long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; - unsigned long pages = 0; + long pages = 0; int target_node = NUMA_NO_NODE; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; @@ -353,13 +353,13 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) } \ } while (0) -static inline unsigned long change_pmd_range(struct mmu_gather *tlb, +static inline long change_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; - unsigned long pages = 0; + long pages = 0; unsigned long nr_huge_updates = 0; struct mmu_notifier_range range; @@ -367,7 +367,7 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { - unsigned long this_pages; + long this_pages; next = pmd_addr_end(addr, end); @@ -437,13 +437,13 @@ next: return pages; } -static inline unsigned long change_pud_range(struct mmu_gather *tlb, +static inline long change_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; - unsigned long pages = 0; + long pages = 0; pud = pud_offset(p4d, addr); do { @@ -458,13 +458,13 @@ static inline unsigned long change_pud_range(struct mmu_gather *tlb, return pages; } -static inline unsigned long change_p4d_range(struct mmu_gather *tlb, +static inline long change_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; - unsigned long pages = 0; + long pages = 0; p4d = p4d_offset(pgd, addr); do { @@ -479,14 +479,14 @@ static inline unsigned long change_p4d_range(struct mmu_gather *tlb, return pages; } -static unsigned long change_protection_range(struct mmu_gather *tlb, +static long change_protection_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = 
vma->vm_mm; pgd_t *pgd; unsigned long next; - unsigned long pages = 0; + long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); @@ -505,12 +505,12 @@ static unsigned long change_protection_range(struct mmu_gather *tlb, return pages; } -unsigned long change_protection(struct mmu_gather *tlb, +long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags) { pgprot_t newprot = vma->vm_page_prot; - unsigned long pages; + long pages; BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); From d1751118c88673fe5a948ad82277898e9e284c55 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Jan 2023 17:52:07 -0500 Subject: [PATCH 103/505] mm/uffd: detect pgtable allocation failures Before this patch, when there's any pgtable allocation issues happened during change_protection(), the error will be ignored from the syscall. For shmem, there will be an error dumped into the host dmesg. Two issues with that: (1) Doing a trace dump when allocation fails is not anything close to grace. (2) The user should be notified with any kind of such error, so the user can trap it and decide what to do next, either by retrying, or stop the process properly, or anything else. For userfault users, this will change the API of UFFDIO_WRITEPROTECT when pgtable allocation failure happened. It should not normally break anyone, though. If it breaks, then in good ways. One man-page update will be on the way to introduce the new -ENOMEM for UFFDIO_WRITEPROTECT. Not marking stable so we keep the old behavior on the 5.19-till-now kernels. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20230104225207.1066932-4-peterx@redhat.com Signed-off-by: Peter Xu Reported-by: James Houghton Acked-by: James Houghton Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 2 +- mm/hugetlb.c | 6 ++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 63 +++++++++++++++++++++++------------ mm/userfaultfd.c | 16 ++++++--- 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 9df0b9a762cc..3767f18114ef 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -73,7 +73,7 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); -extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, +extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a0d6d0980064..6fe65f14d33b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6660,8 +6660,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma, * pre-allocations to install pte markers. */ ptep = huge_pte_alloc(mm, vma, address, psize); - if (!ptep) + if (!ptep) { + pages = -ENOMEM; break; + } } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { @@ -6751,7 +6753,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); - return pages << h->order; + return pages > 0 ? 
(pages << h->order) : pages; } /* Return true if reservation was successful, false otherwise. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a86b8f15e2f0..85a34f1f3ab8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,7 +636,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); - if (nr_updated) + if (nr_updated > 0) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); tlb_finish_mmu(&tlb); diff --git a/mm/mprotect.c b/mm/mprotect.c index 0af22ab59ea8..92fc6f3fa512 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -330,28 +330,34 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) /* * If wr-protecting the range for file-backed, populate pgtable for the case * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() - * failed it means no memory, we don't have a better option but stop. + * failed we treat it the same way as pgtable allocation failures during + * page faults by kicking OOM and returning error. */ #define change_pmd_prepare(vma, pmd, cp_flags) \ - do { \ + ({ \ + long err = 0; \ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ - if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \ - break; \ + if (pte_alloc(vma->vm_mm, pmd)) \ + err = -ENOMEM; \ } \ - } while (0) + err; \ + }) + /* * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to * have separate change_pmd_prepare() because pte_alloc() returns 0 on success, * while {pmd|pud|p4d}_alloc() returns the valid pointer on success. */ #define change_prepare(vma, high, low, addr, cp_flags) \ - do { \ + ({ \ + long err = 0; \ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ - if (WARN_ON_ONCE(p == NULL)) \ - break; \ + if (p == NULL) \ + err = -ENOMEM; \ } \ - } while (0) + err; \ + }) static inline long change_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, @@ -367,11 +373,15 @@ static inline long change_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { - long this_pages; + long ret; next = pmd_addr_end(addr, end); - change_pmd_prepare(vma, pmd, cp_flags); + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } /* * Automatic NUMA balancing walks the tables with mmap_lock * held for read. It's possible a parallel update to occur @@ -401,7 +411,11 @@ static inline long change_pmd_range(struct mmu_gather *tlb, * cleared; make sure pmd populated if * necessary, then fall-through to pte level. 
*/ - change_pmd_prepare(vma, pmd, cp_flags); + ret = change_pmd_prepare(vma, pmd, cp_flags); + if (ret) { + pages = ret; + break; + } } else { /* * change_huge_pmd() does not defer TLB flushes, @@ -422,9 +436,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb, } /* fall through, the trans huge pmd just split */ } - this_pages = change_pte_range(tlb, vma, pmd, addr, next, - newprot, cp_flags); - pages += this_pages; + pages += change_pte_range(tlb, vma, pmd, addr, next, + newprot, cp_flags); next: cond_resched(); } while (pmd++, addr = next, addr != end); @@ -443,12 +456,14 @@ static inline long change_pud_range(struct mmu_gather *tlb, { pud_t *pud; unsigned long next; - long pages = 0; + long pages = 0, ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - change_prepare(vma, pud, pmd, addr, cp_flags); + ret = change_prepare(vma, pud, pmd, addr, cp_flags); + if (ret) + return ret; if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, @@ -464,12 +479,14 @@ static inline long change_p4d_range(struct mmu_gather *tlb, { p4d_t *p4d; unsigned long next; - long pages = 0; + long pages = 0, ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - change_prepare(vma, p4d, pud, addr, cp_flags); + ret = change_prepare(vma, p4d, pud, addr, cp_flags); + if (ret) + return ret; if (p4d_none_or_clear_bad(p4d)) continue; pages += change_pud_range(tlb, vma, p4d, addr, next, newprot, @@ -486,14 +503,18 @@ static long change_protection_range(struct mmu_gather *tlb, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; - long pages = 0; + long pages = 0, ret; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); tlb_start_vma(tlb, vma); do { next = pgd_addr_end(addr, end); - change_prepare(vma, pgd, p4d, addr, cp_flags); + ret = change_prepare(vma, pgd, p4d, addr, cp_flags); + if (ret) { + pages = ret; + break; + } if (pgd_none_or_clear_bad(pgd)) continue; pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 65ad172add27..53c3d916ff66 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -710,11 +710,12 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, mmap_changing, 0); } -void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, +long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { unsigned int mm_cp_flags; struct mmu_gather tlb; + long ret; if (enable_wp) mm_cp_flags = MM_CP_UFFD_WP; @@ -730,8 +731,10 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); + ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); tlb_finish_mmu(&tlb); + + return ret; } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, @@ -740,7 +743,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, { struct vm_area_struct *dst_vma; unsigned long page_mask; - int err; + long err; /* * Sanitize the command parameters: @@ -779,9 +782,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, goto out_unlock; } - uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + + /* Return 0 on 
success, <0 on failures */ + if (err > 0) + err = 0; - err = 0; out_unlock: mmap_read_unlock(dst_mm); return err; From f78dfc7b77d5c3527d0f895bef693f711802de5a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 4 Jan 2023 14:29:44 -0800 Subject: [PATCH 104/505] workingset: fix confusion around eviction vs refault container Refault decisions are made based on the lruvec where the page was evicted, as that determined its LRU order while it was alive. Stats and workingset aging must then occur on the lruvec of the new page, as that's the node and cgroup that experience the refault and that's the lruvec whose nonresident info ages out by a new resident page. Those lruvecs could be different when a page is shared between cgroups, or the refaulting page is allocated on a different node. There are currently two mix-ups: 1. When swap is available, the resident anon set must be considered when comparing the refault distance. The comparison is made against the right anon set, but the check for swap is not. When pages get evicted from a cgroup with swap, and refault in one without, this can incorrectly consider a hot refault as cold - and vice versa. Fix that by using the eviction cgroup for the swap check. 2. The stats and workingset age are updated against the wrong lruvec altogether: the right cgroup but the wrong NUMA node. When a page refaults on a different NUMA node, this will have confusing stats and distort the workingset age on a different lruvec - again possibly resulting in hot/cold misclassifications down the line. Fix the swap check and the refault pgdat to address both concerns. This was found during code review. It hasn't caused notable issues in production, suggesting that those refault-migrations are relatively rare in practice. Link: https://lkml.kernel.org/r/20230104222944.2380117-1-nphamcs@gmail.com Signed-off-by: Johannes Weiner Co-developed-by: Nhat Pham Signed-off-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/workingset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/workingset.c b/mm/workingset.c index fd666584515c..f194d13beabb 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -457,6 +457,7 @@ void workingset_refault(struct folio *folio, void *shadow) */ nr = folio_nr_pages(folio); memcg = folio_memcg(folio); + pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); @@ -474,7 +475,7 @@ void workingset_refault(struct folio *folio, void *shadow) workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); } - if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { + if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); if (file) { From fc5744881eabcc73ed24a4229034f0fbdeb3f46f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 4 Jan 2023 21:18:05 +0200 Subject: [PATCH 105/505] mm/page_alloc: invert logic for early page initialisation checks Rename early_page_uninitialised() to early_page_initialised() and invert its logic to make the code more readable. 
Link: https://lkml.kernel.org/r/20230104191805.2535864-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5668c1a2de49..5514d84cc712 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -443,15 +443,15 @@ static inline bool deferred_pages_enabled(void) return static_branch_unlikely(&deferred_pages); } -/* Returns true if the struct page for the pfn is uninitialised */ -static inline bool __meminit early_page_uninitialised(unsigned long pfn) +/* Returns true if the struct page for the pfn is initialised */ +static inline bool __meminit early_page_initialised(unsigned long pfn) { int nid = early_pfn_to_nid(pfn); if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) - return true; + return false; - return false; + return true; } /* @@ -498,9 +498,9 @@ static inline bool deferred_pages_enabled(void) return false; } -static inline bool early_page_uninitialised(unsigned long pfn) +static inline bool early_page_initialised(unsigned long pfn) { - return false; + return true; } static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) @@ -1643,7 +1643,7 @@ static void __meminit init_reserved_page(unsigned long pfn) pg_data_t *pgdat; int nid, zid; - if (!early_page_uninitialised(pfn)) + if (early_page_initialised(pfn)) return; nid = early_pfn_to_nid(pfn); @@ -1806,7 +1806,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (early_page_uninitialised(pfn)) + if (!early_page_initialised(pfn)) return; if (!kmsan_memblock_free_pages(page, order)) { /* KMSAN will take care of these pages. */ From 541e06b772c1aaffb3b6a245ccface36d7107af2 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Thu, 5 Jan 2023 16:05:34 +0000 Subject: [PATCH 106/505] maple_tree: remove GFP_ZERO from kmem_cache_alloc() and kmem_cache_alloc_bulk() Preallocations are common in the VMA code to avoid allocating under certain locking conditions. The preallocations must also cover the worst-case scenario. Removing the GFP_ZERO flag from the kmem_cache_alloc() (and bulk variant) calls will reduce the amount of time spent zeroing memory that may not be used. Only zero out the necessary area to keep track of the allocations in the maple state. Zero the entire node prior to using it in the tree. This required internal changes to node counting on allocation, so the test code is also updated. 
This restores some micro-benchmark performance: up to +9% in mmtests mmap1 by my testing +10% to +20% in mmap, mmapaddr, mmapmany tests reported by Red Hat Link: https://bugzilla.redhat.com/show_bug.cgi?id=2149636 Link: https://lkml.kernel.org/r/20230105160427.2988454-1-Liam.Howlett@oracle.com Signed-off-by: Liam Howlett Reported-by: Jirka Hladky Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- lib/maple_tree.c | 80 +++++++++++++++++--------------- tools/testing/radix-tree/maple.c | 18 +++---- 2 files changed, 52 insertions(+), 46 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 94f0053ec3e0..8db3c336d19f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -149,13 +149,12 @@ struct maple_subtree_state { /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { - return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); + return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { - return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, - nodes); + return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) @@ -1125,9 +1124,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); + unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. */ - if (unlikely(!total)) + if (WARN_ON(!total)) return NULL; if (total == 1) { @@ -1137,27 +1137,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) goto single_node; } - if (!node->node_count) { + if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; - node->slot[0] = NULL; mas->alloc->total = node->total - 1; ret = node; goto new_head; } - node->total--; - ret = node->slot[node->node_count]; - node->slot[node->node_count--] = NULL; + ret = node->slot[--node->node_count]; + node->slot[node->node_count] = NULL; single_node: new_head: - ret->total = 0; - ret->node_count = 0; - if (ret->request_count) { - mas_set_alloc_req(mas, ret->request_count + 1); - ret->request_count = 0; + if (req) { + req++; + mas_set_alloc_req(mas, req); } + + memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } @@ -1176,21 +1174,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) unsigned long count; unsigned int requested = mas_alloc_req(mas); - memset(reuse, 0, sizeof(*reuse)); count = mas_allocated(mas); - if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { - if (head->slot[0]) - head->node_count++; - head->slot[head->node_count] = reuse; + reuse->request_count = 0; + reuse->node_count = 0; + if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { + head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->total = 1; if ((head) && !((unsigned long)head & 0x1)) { - head->request_count = 0; reuse->slot[0] = head; + reuse->node_count = 1; reuse->total += head->total; } @@ -1209,7 +1206,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); - unsigned long success = allocated; unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; @@ -1225,24 +1221,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) WARN_ON(!allocated); } - if (!allocated || mas->alloc->node_count == 
MAPLE_ALLOC_SLOTS - 1) { + if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; - if (allocated) + if (allocated) { node->slot[0] = mas->alloc; + node->node_count = 1; + } else { + node->node_count = 0; + } - success++; mas->alloc = node; + node->total = ++allocated; requested--; } node = mas->alloc; + node->request_count = 0; while (requested) { max_req = MAPLE_ALLOC_SLOTS; - if (node->slot[0]) { - unsigned int offset = node->node_count + 1; + if (node->node_count) { + unsigned int offset = node->node_count; slots = (void **)&node->slot[offset]; max_req -= offset; @@ -1256,15 +1257,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) goto nomem_bulk; node->node_count += count; - /* zero indexed. */ - if (slots == (void **)&node->slot) - node->node_count--; - - success += count; + allocated += count; node = node->slot[0]; + node->node_count = 0; + node->request_count = 0; requested -= count; } - mas->alloc->total = success; + mas->alloc->total = allocated; return; nomem_bulk: @@ -1273,7 +1272,7 @@ nomem_bulk: nomem_one: mas_set_alloc_req(mas, requested); if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) - mas->alloc->total = success; + mas->alloc->total = allocated; mas_set_err(mas, -ENOMEM); } @@ -5734,6 +5733,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; + unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, @@ -5756,14 +5756,20 @@ void mas_destroy(struct ma_state *mas) } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); - while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { + total = mas_allocated(mas); + while (total) { node = mas->alloc; mas->alloc = node->slot[0]; - if (node->node_count > 0) - mt_free_bulk(node->node_count, - (void __rcu **)&node->slot[1]); + if (node->node_count > 1) { + size_t count = node->node_count - 1; + + mt_free_bulk(count, (void __rcu **)&node->slot[1]); + total -= count; + } kmem_cache_free(maple_node_cache, node); + total--; } + mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 81fa7ec2e66a..1f36bc1c5d36 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -173,11 +173,11 @@ static noinline void check_new_node(struct maple_tree *mt) if (!MAPLE_32BIT) { if (i >= 35) - e = i - 35; + e = i - 34; else if (i >= 5) - e = i - 5; + e = i - 4; else if (i >= 2) - e = i - 2; + e = i - 1; } else { if (i >= 4) e = i - 4; @@ -305,17 +305,17 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); mn = mas_pop_node(&mas); /* get the next node. 
*/ MT_BUG_ON(mt, mn == NULL); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); /* Check the limit of pop/push/pop */ mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ @@ -323,14 +323,14 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); MT_BUG_ON(mt, mas_alloc_req(&mas)); - MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas.alloc->node_count != 1); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas.alloc->node_count != 1); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); From 9eefefd835e451d340f5e95bc14ffd68b9b99268 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Thu, 5 Jan 2023 13:04:24 +0100 Subject: [PATCH 107/505] mm: remove an ambiguous sentence from kmap_local_folio() kdocs In the kdocs of kmap_local_folio() there is a an ambiguous sentence which suggests to use this API "only when really necessary". On the contrary, since kmap() and kmap_atomic() are deprecated, both kmap_local_folio(), as well as kmap_local_page(), must be preferred to the previous ones. Therefore, remove the above-mentioned sentence exactly how it has previously been done for the kmap_local_page() kdocs in commit 72f1c55adf70 ("highmem: delete a sentence from kmap_local_page() kdocs"). Link: https://lkml.kernel.org/r/20230105120424.30055-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Ira Weiny Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/highmem.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 44242268f53b..daeb0d8e753a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -119,9 +119,8 @@ static inline void *kmap_local_page(struct page *page); * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * - * While it is significantly faster than kmap() for the higmem case it - * comes with restrictions about the pointer validity. Only use when really - * necessary. + * While it is significantly faster than kmap() for the highmem case it + * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across From 1f8549fce525bc95df40ea3ddbfc6e8e719d188d Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Thu, 5 Jan 2023 13:13:05 +0100 Subject: [PATCH 108/505] mm: fix spelling mistake in highmem.h Substitute "higmem" with "highmem" in highmem.h. Link: https://lkml.kernel.org/r/20230105121305.30714-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. 
De Francesco Suggested-by: "Matthew Wilcox (Oracle)" Reviewed-by: Ira Weiny Signed-off-by: Andrew Morton --- include/linux/highmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index daeb0d8e753a..d7097b8158f2 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -86,8 +86,8 @@ static inline void kmap_flush_unused(void); * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * - * While it is significantly faster than kmap() for the higmem case it - * comes with restrictions about the pointer validity. + * While kmap_local_page() is significantly faster than kmap() for the highmem + * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across From dee2ad120571f38433211098cd6b95a59bdfc8c7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Jan 2023 15:49:05 +0100 Subject: [PATCH 109/505] selftests/vm: cow: add COW tests for collapsing of PTE-mapped anon THP Currently, anonymous PTE-mapped THPs cannot be collapsed in-place: collapsing (e.g., via MADV_COLLAPSE) implies allocating a fresh THP and mapping that new THP via a PMD: as it's a fresh anon THP, it will get the exclusive flag set on the head page and everybody is happy. However, if the kernel would ever support in-place collapse of anonymous THPs (replacing a page table mapping each sub-page of a THP via PTEs with a single PMD mapping the complete THP), exclusivity information stored for each sub-page would have to be collapsed accordingly: (1) All PTEs map !exclusive anon sub-pages: the in-place collapsed THP must not have the exclusive flag set on the head page mapped by the PMD. This is the easiest case to handle ("simply don't set any exclusive flags"). (2) All PTEs map exclusive anon sub-pages: when collapsing, we have to clear the exclusive flag from all tail pages and only leave the exclusive flag set for the head page. Otherwise, fork() after collapse would not clear the exclusive flags from the tail pages and we'd be in trouble once PTE-mapping the shared THP when writing to shared tail pages that still have the exclusive flag set. This would effectively revert what the PTE-mapping code does when propagating the exclusive flag to all sub-pages. (3) PTEs map a mixture of exclusive and !exclusive anon sub-pages (can happen e.g., due to MADV_DONTFORK before fork()). We must not collapse the THP in-place, otherwise bad things may happen: the exclusive flags of sub-pages would get ignored and the exclusive flag of the head page would get used instead. Now that we have MADV_COLLAPSE in place to trigger collapsing a THP, let's add some test cases that would bail out early, if we'd voluntarily/accidentally unlock in-place collapse for anon THPs and forget about taking proper care of exclusive flags. 
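Condensed, the core of each test boils down to roughly the following sequence (a sketch only, assuming mem points at a THP-aligned, THP-backed anonymous range of thpsize bytes; the actual selftest below adds error handling, the fork()/pipe synchronization with the child, the shared/unshared variants, and the same MADV_COLLAPSE fallback definition):

#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* fallback for older headers, as in the selftest */
#endif

/* Sketch: PTE-map an anon THP, then request an in-place collapse. */
static int pte_map_then_collapse(char *mem, size_t pagesize, size_t thpsize)
{
	/* Temporarily mapping one subpage R/O forces the THP to be PTE-mapped. */
	if (mprotect(mem + pagesize, pagesize, PROT_READ))
		return -1;
	if (mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE))
		return -1;
	/* Ask the kernel to map the range via a single PMD again. */
	return madvise(mem, thpsize, MADV_COLLAPSE);
}
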
Running the test on a kernel with MADV_COLLAPSE support: # [INFO] Anonymous THP tests # [RUN] Basic COW after fork() when collapsing before fork() ok 169 No leak from parent into child # [RUN] Basic COW after fork() when collapsing after fork() (fully shared) ok 170 # SKIP MADV_COLLAPSE failed: Invalid argument # [RUN] Basic COW after fork() when collapsing after fork() (lower shared) ok 171 No leak from parent into child # [RUN] Basic COW after fork() when collapsing after fork() (upper shared) ok 172 No leak from parent into child For now, MADV_COLLAPSE always seems to fail if all PTEs map shared sub-pages. Link: https://lkml.kernel.org/r/20230104144905.460075-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Shuah Khan Cc: Hugh Dickins Cc: Peter Xu Cc: Vlastimil Babka Cc: Nadav Amit Cc: Zach O'Keefe Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/cow.c | 228 +++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c index 26f6ea3079e2..16216d893d96 100644 --- a/tools/testing/selftests/vm/cow.c +++ b/tools/testing/selftests/vm/cow.c @@ -30,6 +30,10 @@ #include "../kselftest.h" #include "vm_util.h" +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + static size_t pagesize; static int pagemap_fd; static size_t thpsize; @@ -1178,6 +1182,228 @@ static int tests_per_anon_test_case(void) return tests; } +enum anon_thp_collapse_test { + ANON_THP_COLLAPSE_UNSHARED, + ANON_THP_COLLAPSE_FULLY_SHARED, + ANON_THP_COLLAPSE_LOWER_SHARED, + ANON_THP_COLLAPSE_UPPER_SHARED, +}; + +static void do_test_anon_thp_collapse(char *mem, size_t size, + enum anon_thp_collapse_test test) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + /* + * Trigger PTE-mapping the THP by temporarily mapping a single subpage + * R/O, such that we can try collapsing it later. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + /* Collapse before actually COW-sharing the page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* COW-share the full PTE-mapped THP. */ + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* Don't COW-share the upper part of the THP. */ + ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + /* Don't COW-share the lower part of the THP. 
*/ + ret = madvise(mem, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + case ANON_THP_COLLAPSE_FULLY_SHARED: + exit(child_memcmp_fn(mem, size, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + exit(child_memcmp_fn(mem + size / 2, size / 2, + &comm_pipes)); + break; + default: + assert(false); + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* + * Revert MADV_DONTFORK such that we merge the VMAs and are + * able to actually collapse. + */ + ret = madvise(mem, size, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + /* FALLTHROUGH */ + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* Collapse before anyone modified the COW-shared page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_anon_thp_collapse_unshared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); +} + +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); +} + +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); +} + +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); +} + +/* + * Test cases that are specific to anonymous THP: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_thp_test_cases[] = { + /* + * Basic COW test for fork() without any GUP when collapsing a THP + * before fork(). + * + * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place + * collapse") might easily get COW handling wrong when not collapsing + * exclusivity information properly. + */ + { + "Basic COW after fork() when collapsing before fork()", + test_anon_thp_collapse_unshared, + }, + /* Basic COW test, but collapse after COW-sharing a full THP. */ + { + "Basic COW after fork() when collapsing after fork() (fully shared)", + test_anon_thp_collapse_fully_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the lower half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (lower shared)", + test_anon_thp_collapse_lower_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the upper half of a + * THP. 
+ */ + { + "Basic COW after fork() when collapsing after fork() (upper shared)", + test_anon_thp_collapse_upper_shared, + }, +}; + +static void run_anon_thp_test_cases(void) +{ + int i; + + if (!thpsize) + return; + + ksft_print_msg("[INFO] Anonymous THP tests\n"); + + for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { + struct test_case const *test_case = &anon_thp_test_cases[i]; + + ksft_print_msg("[RUN] %s\n", test_case->desc); + do_run_with_thp(test_case->fn, THP_RUN_PMD); + } +} + +static int tests_per_anon_thp_test_case(void) +{ + return thpsize ? 1 : 0; +} + typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); static void test_cow(char *mem, const char *smem, size_t size) @@ -1518,6 +1744,7 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + + ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() + ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case()); gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); @@ -1526,6 +1753,7 @@ int main(int argc, char **argv) ksft_exit_fail_msg("opening pagemap failed\n"); run_anon_test_cases(); + run_anon_thp_test_cases(); run_non_anon_test_cases(); err = ksft_get_fail_cnt(); From bb94429096d090f5e209624d08919a7962d6c4b7 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 4 Jan 2023 14:06:04 +0800 Subject: [PATCH 110/505] mm/slab: add is_kmalloc_cache() helper function commit 6edf2576a6cc ("mm/slub: enable debugging memory wasting of kmalloc") introduces 'SLAB_KMALLOC' bit specifying whether a kmem_cache is a kmalloc cache for slab/slub (slob doesn't have dedicated kmalloc caches). Add a helper inline function for other components like kasan to simplify code. Link: https://lkml.kernel.org/r/20230104060605.930910-1-feng.tang@intel.com Signed-off-by: Feng Tang Acked-by: Vlastimil Babka Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: David Rientjes Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christoph Lameter Cc: Dmitry Vyukov Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/slab.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/slab.h b/mm/slab.h index 7cc432969945..63fb4c00d529 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -323,6 +323,14 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, } #endif +static inline bool is_kmalloc_cache(struct kmem_cache *s) +{ +#ifndef CONFIG_SLOB + return (s->flags & SLAB_KMALLOC); +#else + return false; +#endif +} /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ From bbc61844b4645d54c147a82654ac974bb7be85de Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 4 Jan 2023 14:06:05 +0800 Subject: [PATCH 111/505] mm/kasan: simplify and refine kasan_cache code struct 'kasan_cache' has a member 'is_kmalloc' indicating whether its host kmem_cache is a kmalloc cache. With newly introduced is_kmalloc_cache() helper, 'is_kmalloc' and its related function can be replaced and removed. Also 'kasan_cache' is only needed by KASAN generic mode, and not by SW/HW tag modes, so refine its protection macro accordingly, suggested by Andrey Konoval. 
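The resulting call-site change is small but removes the separate bookkeeping entirely; e.g., as in the mm/kasan/common.c hunk below:

-	if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc)
+	if (kasan_stack_collection_enabled() && is_kmalloc_cache(cache))

and, with kasan_cache_create_kmalloc() gone, struct kasan_cache only needs to be defined for CONFIG_KASAN_GENERIC builds.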
Link: https://lkml.kernel.org/r/20230104060605.930910-2-feng.tang@intel.com Signed-off-by: Feng Tang Reviewed-by: Andrey Konovalov Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Christoph Lameter Cc: Dmitry Vyukov Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 22 +++++----------------- include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- mm/kasan/common.c | 9 ++------- mm/slab_common.c | 1 - 5 files changed, 9 insertions(+), 27 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5ebbaf672009..f7ef70661ce2 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -96,15 +96,6 @@ static inline bool kasan_has_integrated_init(void) } #ifdef CONFIG_KASAN - -struct kasan_cache { -#ifdef CONFIG_KASAN_GENERIC - int alloc_meta_offset; - int free_meta_offset; -#endif - bool is_kmalloc; -}; - void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { @@ -129,13 +120,6 @@ static __always_inline bool kasan_unpoison_pages(struct page *page, return false; } -void __kasan_cache_create_kmalloc(struct kmem_cache *cache); -static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) -{ - if (kasan_enabled()) - __kasan_cache_create_kmalloc(cache); -} - void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { @@ -255,7 +239,6 @@ static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, { return false; } -static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -306,6 +289,11 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC +struct kasan_cache { + int alloc_meta_offset; + int free_meta_offset; +}; + size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object); slab_flags_t kasan_never_merge(void); void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 5834bad8ad78..a61e7d55d0d3 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -72,7 +72,7 @@ struct kmem_cache { int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info; #endif diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index aa0ee1678d29..f6df03f934e5 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -136,7 +136,7 @@ struct kmem_cache { unsigned int *random_seq; #endif -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info; #endif diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 1d0008e1c420..6b8e9c848573 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -122,11 +122,6 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) KASAN_PAGE_FREE, init); } -void __kasan_cache_create_kmalloc(struct kmem_cache *cache) -{ - cache->kasan_info.is_kmalloc = true; -} - void __kasan_poison_slab(struct slab *slab) { struct page *page = slab_page(slab); @@ -326,7 +321,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, kasan_unpoison(tagged_object, 
cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. */ - if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) + if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache)) kasan_save_alloc_info(cache, tagged_object, flags); return tagged_object; @@ -372,7 +367,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * Save alloc info (if possible) for kmalloc() allocations. * This also rewrites the alloc info when called from kasan_krealloc(). */ - if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) + if (kasan_stack_collection_enabled() && is_kmalloc_cache(cache)) kasan_save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 1cba98acc486..bf4e777cfe90 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -670,7 +670,6 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset, usersize); - kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; return s; From e9adcfecf572fcfaa9f8525904cf49c709974f73 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 3 Jan 2023 16:27:32 -0800 Subject: [PATCH 112/505] mm: remove zap_page_range and create zap_vma_pages zap_page_range was originally designed to unmap pages within an address range that could span multiple vmas. While working on [1], it was discovered that all callers of zap_page_range pass a range entirely within a single vma. In addition, the mmu notification call within zap_page range does not correctly handle ranges that span multiple vmas. When crossing a vma boundary, a new mmu_notifier_range_init/end call pair with the new vma should be made. Instead of fixing zap_page_range, do the following: - Create a new routine zap_vma_pages() that will remove all pages within the passed vma. Most users of zap_page_range pass the entire vma and can use this new routine. - For callers of zap_page_range not passing the entire vma, instead call zap_page_range_single(). - Remove zap_page_range. 
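For reference, the new helper is a thin inline wrapper (see the include/linux/mm.h hunk below):

static inline void zap_vma_pages(struct vm_area_struct *vma)
{
	zap_page_range_single(vma, vma->vm_start,
			      vma->vm_end - vma->vm_start, NULL);
}

so the common whole-VMA callers reduce to a one-line change, roughly:

-	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+	zap_vma_pages(vma);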
[1] https://lore.kernel.org/linux-mm/20221114235507.294320-2-mike.kravetz@oracle.com/ Link: https://lkml.kernel.org/r/20230104002732.232573-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: Peter Xu Acked-by: Michal Hocko Acked-by: Peter Xu Acked-by: Heiko Carstens [s390] Reviewed-by: Christoph Hellwig Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dave Hansen Cc: David Hildenbrand Cc: Eric Dumazet Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nadav Amit Cc: Palmer Dabbelt Cc: Rik van Riel Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/kernel/vdso.c | 6 ++--- arch/powerpc/kernel/vdso.c | 4 +--- arch/powerpc/platforms/book3s/vas-api.c | 2 +- arch/powerpc/platforms/pseries/vas.c | 3 +-- arch/riscv/kernel/vdso.c | 6 ++--- arch/s390/kernel/vdso.c | 4 +--- arch/s390/mm/gmap.c | 2 +- arch/x86/entry/vdso/vma.c | 4 +--- drivers/android/binder_alloc.c | 2 +- include/linux/mm.h | 7 ++++-- mm/memory.c | 30 ------------------------- mm/page-writeback.c | 2 +- net/ipv4/tcp.c | 7 +++--- 13 files changed, 21 insertions(+), 58 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e59a32aa0c49..0119dc91abb5 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -138,13 +138,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); #ifdef CONFIG_COMPAT_VDSO if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); #endif } diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 507f8228f983..7a2ff9010f17 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -120,10 +120,8 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (vma_is_special_mapping(vma, &vvar_spec)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); } mmap_read_unlock(mm); diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index eb5bed333750..9580e8e12165 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -414,7 +414,7 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) /* * When the LPAR lost credits due to core removal or during * migration, invalidate the existing mapping for the current - * paste addresses and set windows in-active (zap_page_range in + * paste addresses and set windows in-active (zap_vma_pages in * reconfig_close_windows()). * New mapping will be done later after migration or new credits * available. So continue to receive faults if the user space diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 4ad6e510d405..559112312810 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -760,8 +760,7 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, * is done before the original mmap() and after the ioctl. 
*/ if (vma) - zap_page_range(vma, vma->vm_start, - vma->vm_end - vma->vm_start); + zap_vma_pages(vma); mmap_write_unlock(task_ref->mm); mutex_unlock(&task_ref->mmap_mutex); diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index e410275918ac..5c30212d8d1c 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -124,13 +124,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (vma_is_special_mapping(vma, vdso_info.dm)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); #ifdef CONFIG_COMPAT if (vma_is_special_mapping(vma, compat_vdso_info.dm)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); #endif } diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index ff7bf4432229..bbaefd84f15e 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -59,11 +59,9 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (!vma_is_special_mapping(vma, &vvar_mapping)) continue; - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); break; } mmap_read_unlock(mm); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 74e1d873dce0..69af6cdf1a2a 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -722,7 +722,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) if (is_vm_hugetlb_page(vma)) continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); + zap_page_range_single(vma, vmaddr, size, NULL); } mmap_read_unlock(gmap->mm); } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8f3f9b9e53c..ec5e4d2048cb 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -113,10 +113,8 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { - unsigned long size = vma->vm_end - vma->vm_start; - if (vma_is_special_mapping(vma, &vvar_mapping)) - zap_page_range(vma, vma->vm_start, size); + zap_vma_pages(vma); } mmap_read_unlock(mm); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 4ad42b0f75cd..55a3c3c2409f 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1019,7 +1019,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(vma, page_addr, PAGE_SIZE); + zap_page_range_single(vma, page_addr, PAGE_SIZE, NULL); trace_binder_unmap_user_end(alloc, index); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ac5ea4b584c..eb5bfc77c2c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1977,10 +1977,13 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details); +static inline void zap_vma_pages(struct vm_area_struct *vma) +{ + zap_page_range_single(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); +} void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *start_vma, 
unsigned long start, unsigned long end); diff --git a/mm/memory.c b/mm/memory.c index 1598051a2a24..b0dda866ffe6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1693,36 +1693,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, mmu_notifier_invalidate_range_end(&range); } -/** - * zap_page_range - remove user pages in a given range - * @vma: vm_area_struct holding the applicable pages - * @start: starting address of pages to zap - * @size: number of bytes to zap - * - * Caller must protect the VMA list - */ -void zap_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long size) -{ - struct maple_tree *mt = &vma->vm_mm->mm_mt; - unsigned long end = start + size; - struct mmu_notifier_range range; - struct mmu_gather tlb; - MA_STATE(mas, mt, vma->vm_end, vma->vm_end); - - lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - start, start + size); - tlb_gather_mmu(&tlb, vma->vm_mm); - update_hiwater_rss(vma->vm_mm); - mmu_notifier_invalidate_range_start(&range); - do { - unmap_single_vma(&tlb, vma, start, range.end, NULL); - } while ((vma = mas_find(&mas, end - 1)) != NULL); - mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); -} - /** * zap_page_range_single - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 337cafe9978c..e91f94b3438b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2690,7 +2690,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * * The caller must hold lock_page_memcg(). Most callers have the folio * locked. A few have the folio blocked from truncation through other - * means (eg zap_page_range() has it mapped and is holding the page table + * means (eg zap_vma_pages() has it mapped and is holding the page table * lock). This can also be called from mark_buffer_dirty(), which I * cannot prove is always protected against truncate. */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c567d5e8053e..f713c0422f0f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2092,7 +2092,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ - zap_page_range(vma, *address, maybe_zap_len); + zap_page_range_single(vma, *address, maybe_zap_len, NULL); err = 0; } @@ -2100,7 +2100,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, unsigned long leftover_pages = pages_remaining; int bytes_mapped; - /* We called zap_page_range, try to reinsert. */ + /* We called zap_page_range_single, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); @@ -2234,7 +2234,8 @@ static int tcp_zerocopy_receive(struct sock *sk, total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) - zap_page_range(vma, address, total_bytes_to_map); + zap_page_range_single(vma, address, total_bytes_to_map, + NULL); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { From 183986209935b51d50de819eeae512a155be3568 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:50 +0000 Subject: [PATCH 113/505] MAINTAINERS: add types to akpm/mm git trees entries Patch series "mm: trivial fixups". 
This patchset is for trivial fixups of mm stuff on MAINTAINERS, tools/ selftests, and docs. This patch (of 5): Each SCM tree entry of MAINTAINERS file should have both type and location, but akpm/mm git tree entries of 'MEMORY MANAGEMENT' and 'VMALLOC' sections of the file don't have the type. Add the type. Link: https://lkml.kernel.org/r/20230103180754.129637-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 123216b76534..6871e24c2de5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13474,7 +13474,7 @@ M: Andrew Morton L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org -T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new F: include/linux/gfp.h F: include/linux/gfp_types.h @@ -13492,7 +13492,7 @@ R: Christoph Hellwig L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org -T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/linux/vmalloc.h F: mm/vmalloc.c From 060deca404ba7c2f499fcee793956c502c60e193 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:51 +0000 Subject: [PATCH 114/505] MAINTAINERS/MEMORY MANAGEMENT: add tools/vm/ as managed files 'tools/vm/' directory should be a part of memory management subsystem, but MAINTAINERS file doesn't mark the directory so. Add one more 'F:' entry for the directory to 'MEMORY MANAGEMENT' section. Link: https://lkml.kernel.org/r/20230103180754.129637-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6871e24c2de5..c05f95aa7af1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13484,6 +13484,7 @@ F: include/linux/mmzone.h F: include/linux/pagewalk.h F: mm/ F: tools/testing/selftests/vm/ +F: tools/vm/ VMALLOC M: Andrew Morton From 799fb82aa132fa3a3886b7872997a5a84e820062 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:52 +0000 Subject: [PATCH 115/505] tools/vm: rename tools/vm to tools/mm Rename tools/vm to tools/mm for being more consistent with the code and documentation directories, and won't be confused with virtual machines. 
Link: https://lkml.kernel.org/r/20230103180754.129637-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/idle_page_tracking.rst | 2 +- Documentation/admin-guide/mm/pagemap.rst | 4 ++-- Documentation/mm/page_owner.rst | 2 +- Documentation/mm/slub.rst | 2 +- Documentation/translations/zh_CN/mm/page_owner.rst | 2 +- MAINTAINERS | 2 +- mm/Kconfig.debug | 2 +- mm/memory-failure.c | 2 +- tools/{vm => mm}/.gitignore | 0 tools/{vm => mm}/Makefile | 0 tools/{vm => mm}/page-types.c | 0 tools/{vm => mm}/page_owner_sort.c | 0 tools/{vm => mm}/slabinfo-gnuplot.sh | 0 tools/{vm => mm}/slabinfo.c | 0 14 files changed, 9 insertions(+), 9 deletions(-) rename tools/{vm => mm}/.gitignore (100%) rename tools/{vm => mm}/Makefile (100%) rename tools/{vm => mm}/page-types.c (100%) rename tools/{vm => mm}/page_owner_sort.c (100%) rename tools/{vm => mm}/slabinfo-gnuplot.sh (100%) rename tools/{vm => mm}/slabinfo.c (100%) diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst index df9394fb39c2..19492064278c 100644 --- a/Documentation/admin-guide/mm/idle_page_tracking.rst +++ b/Documentation/admin-guide/mm/idle_page_tracking.rst @@ -65,7 +65,7 @@ workload one should: are not reclaimable, he or she can filter them out using ``/proc/kpageflags``. -The page-types tool in the tools/vm directory can be used to assist in this. +The page-types tool in the tools/mm directory can be used to assist in this. If the tool is run initially with the appropriate option, it will mark all the queried pages as idle. Subsequent runs of the tool can then show which pages have their idle flag cleared in the interim. diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 6e2e416af783..ceb5da3172ba 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -46,7 +46,7 @@ There are four components to pagemap: * ``/proc/kpagecount``. This file contains a 64-bit count of the number of times each page is mapped, indexed by PFN. -The page-types tool in the tools/vm directory can be used to query the +The page-types tool in the tools/mm directory can be used to query the number of times a page is mapped. * ``/proc/kpageflags``. This file contains a 64-bit set of flags for each @@ -173,7 +173,7 @@ LRU related page flags 14 - SWAPBACKED The page is backed by swap/RAM. -The page-types tool in the tools/vm directory can be used to query the +The page-types tool in the tools/mm directory can be used to query the above flags. Using pagemap to do something useful diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index 127514955a5e..5df26c0a0c1f 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -61,7 +61,7 @@ Usage 1) Build user-space helper:: - cd tools/vm + cd tools/mm make page_owner_sort 2) Enable page owner: add "page_owner=on" to boot cmdline. diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst index 7f652216dabe..3ffa7eded251 100644 --- a/Documentation/mm/slub.rst +++ b/Documentation/mm/slub.rst @@ -21,7 +21,7 @@ slabs that have data in them. See "slabinfo -h" for more options when running the command. ``slabinfo`` can be compiled with :: - gcc -o slabinfo tools/vm/slabinfo.c + gcc -o slabinfo tools/mm/slabinfo.c Some of the modes of operation of ``slabinfo`` require that slub debugging be enabled on the command line. F.e. 
no tracking information will be diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst index 21a6a0837d42..4d3b2c33e4ef 100644 --- a/Documentation/translations/zh_CN/mm/page_owner.rst +++ b/Documentation/translations/zh_CN/mm/page_owner.rst @@ -62,7 +62,7 @@ page owner在默认情况下是禁用的。所以,如果你想使用它,你 1) 构建用户空间的帮助:: - cd tools/vm + cd tools/mm make page_owner_sort 2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline. diff --git a/MAINTAINERS b/MAINTAINERS index c05f95aa7af1..c726adfd1f0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13483,8 +13483,8 @@ F: include/linux/mm.h F: include/linux/mmzone.h F: include/linux/pagewalk.h F: mm/ +F: tools/mm/ F: tools/testing/selftests/vm/ -F: tools/vm/ VMALLOC M: Andrew Morton diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index fca699ad1fb0..d62f48131952 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -90,7 +90,7 @@ config PAGE_OWNER help to find bare alloc_page(s) leaks. Even if you include this feature on your build, it is disabled in default. You should pass "page_owner=on" to boot parameter in order to enable it. Eats - a fair amount of memory if enabled. See tools/vm/page_owner_sort.c + a fair amount of memory if enabled. See tools/mm/page_owner_sort.c for user-space helper. If unsure, say N. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c77a9e37e27e..6bf07345ea2c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -24,7 +24,7 @@ * - You have a test that can be added to mce-test * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ * - The case actually shows up as a frequent (top 10) page state in - * tools/vm/page-types when running a real workload. + * tools/mm/page-types when running a real workload. * * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back diff --git a/tools/vm/.gitignore b/tools/mm/.gitignore similarity index 100% rename from tools/vm/.gitignore rename to tools/mm/.gitignore diff --git a/tools/vm/Makefile b/tools/mm/Makefile similarity index 100% rename from tools/vm/Makefile rename to tools/mm/Makefile diff --git a/tools/vm/page-types.c b/tools/mm/page-types.c similarity index 100% rename from tools/vm/page-types.c rename to tools/mm/page-types.c diff --git a/tools/vm/page_owner_sort.c b/tools/mm/page_owner_sort.c similarity index 100% rename from tools/vm/page_owner_sort.c rename to tools/mm/page_owner_sort.c diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/mm/slabinfo-gnuplot.sh similarity index 100% rename from tools/vm/slabinfo-gnuplot.sh rename to tools/mm/slabinfo-gnuplot.sh diff --git a/tools/vm/slabinfo.c b/tools/mm/slabinfo.c similarity index 100% rename from tools/vm/slabinfo.c rename to tools/mm/slabinfo.c From baa489fabd01596d5426d6e112b34ba5fb59ab82 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:53 +0000 Subject: [PATCH 116/505] selftests/vm: rename selftests/vm to selftests/mm Rename selftets/vm to selftests/mm for being more consistent with the code, documentation, and tools directories, and won't be confused with virtual machines. 
[sj@kernel.org: convert missing vm->mm changes] Link: https://lkml.kernel.org/r/20230107230643.252273-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230103180754.129637-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/hugetlbpage.rst | 6 +++--- Documentation/core-api/pin_user_pages.rst | 2 +- MAINTAINERS | 4 ++-- mm/Kconfig | 2 +- tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/kselftest_deps.sh | 6 +++--- tools/testing/selftests/{vm => mm}/.gitignore | 0 tools/testing/selftests/{vm => mm}/Makefile | 4 ++-- .../testing/selftests/{vm => mm}/charge_reserved_hugetlb.sh | 0 tools/testing/selftests/{vm => mm}/check_config.sh | 0 tools/testing/selftests/{vm => mm}/compaction_test.c | 0 tools/testing/selftests/{vm => mm}/config | 0 tools/testing/selftests/{vm => mm}/cow.c | 0 tools/testing/selftests/{vm => mm}/gup_test.c | 0 tools/testing/selftests/{vm => mm}/hmm-tests.c | 0 tools/testing/selftests/{vm => mm}/hugepage-mmap.c | 0 tools/testing/selftests/{vm => mm}/hugepage-mremap.c | 0 tools/testing/selftests/{vm => mm}/hugepage-shm.c | 0 tools/testing/selftests/{vm => mm}/hugepage-vmemmap.c | 0 tools/testing/selftests/{vm => mm}/hugetlb-madvise.c | 0 .../selftests/{vm => mm}/hugetlb_reparenting_test.sh | 0 tools/testing/selftests/{vm => mm}/khugepaged.c | 0 tools/testing/selftests/{vm => mm}/ksm_functional_tests.c | 0 tools/testing/selftests/{vm => mm}/ksm_tests.c | 0 tools/testing/selftests/{vm => mm}/madv_populate.c | 0 tools/testing/selftests/{vm => mm}/map_fixed_noreplace.c | 0 tools/testing/selftests/{vm => mm}/map_hugetlb.c | 0 tools/testing/selftests/{vm => mm}/map_populate.c | 0 tools/testing/selftests/{vm => mm}/memfd_secret.c | 0 tools/testing/selftests/{vm => mm}/migration.c | 0 tools/testing/selftests/{vm => mm}/mlock-random-test.c | 0 tools/testing/selftests/{vm => mm}/mlock2-tests.c | 0 tools/testing/selftests/{vm => mm}/mlock2.h | 0 tools/testing/selftests/{vm => mm}/mrelease_test.c | 0 tools/testing/selftests/{vm => mm}/mremap_dontunmap.c | 0 tools/testing/selftests/{vm => mm}/mremap_test.c | 0 tools/testing/selftests/{vm => mm}/on-fault-limit.c | 0 tools/testing/selftests/{vm => mm}/pkey-helpers.h | 0 tools/testing/selftests/{vm => mm}/pkey-powerpc.h | 0 tools/testing/selftests/{vm => mm}/pkey-x86.h | 0 tools/testing/selftests/{vm => mm}/protection_keys.c | 0 tools/testing/selftests/{vm => mm}/run_vmtests.sh | 0 tools/testing/selftests/{vm => mm}/settings | 0 tools/testing/selftests/{vm => mm}/soft-dirty.c | 0 tools/testing/selftests/{vm => mm}/split_huge_page_test.c | 0 tools/testing/selftests/{vm => mm}/test_hmm.sh | 0 tools/testing/selftests/{vm => mm}/test_vmalloc.sh | 0 tools/testing/selftests/{vm => mm}/thuge-gen.c | 0 tools/testing/selftests/{vm => mm}/transhuge-stress.c | 0 tools/testing/selftests/{vm => mm}/userfaultfd.c | 0 tools/testing/selftests/{vm => mm}/util.h | 0 tools/testing/selftests/{vm => mm}/va_128TBswitch.c | 0 tools/testing/selftests/{vm => mm}/va_128TBswitch.sh | 0 tools/testing/selftests/{vm => mm}/virtual_address_range.c | 0 tools/testing/selftests/{vm => mm}/vm_util.c | 0 tools/testing/selftests/{vm => mm}/vm_util.h | 0 tools/testing/selftests/{vm => mm}/write_hugetlb_memory.sh | 0 tools/testing/selftests/{vm => mm}/write_to_hugetlbfs.c | 0 58 files changed, 13 insertions(+), 13 deletions(-) rename tools/testing/selftests/{vm => mm}/.gitignore (100%) rename tools/testing/selftests/{vm => mm}/Makefile (98%) rename tools/testing/selftests/{vm 
=> mm}/charge_reserved_hugetlb.sh (100%) rename tools/testing/selftests/{vm => mm}/check_config.sh (100%) rename tools/testing/selftests/{vm => mm}/compaction_test.c (100%) rename tools/testing/selftests/{vm => mm}/config (100%) rename tools/testing/selftests/{vm => mm}/cow.c (100%) rename tools/testing/selftests/{vm => mm}/gup_test.c (100%) rename tools/testing/selftests/{vm => mm}/hmm-tests.c (100%) rename tools/testing/selftests/{vm => mm}/hugepage-mmap.c (100%) rename tools/testing/selftests/{vm => mm}/hugepage-mremap.c (100%) rename tools/testing/selftests/{vm => mm}/hugepage-shm.c (100%) rename tools/testing/selftests/{vm => mm}/hugepage-vmemmap.c (100%) rename tools/testing/selftests/{vm => mm}/hugetlb-madvise.c (100%) rename tools/testing/selftests/{vm => mm}/hugetlb_reparenting_test.sh (100%) rename tools/testing/selftests/{vm => mm}/khugepaged.c (100%) rename tools/testing/selftests/{vm => mm}/ksm_functional_tests.c (100%) rename tools/testing/selftests/{vm => mm}/ksm_tests.c (100%) rename tools/testing/selftests/{vm => mm}/madv_populate.c (100%) rename tools/testing/selftests/{vm => mm}/map_fixed_noreplace.c (100%) rename tools/testing/selftests/{vm => mm}/map_hugetlb.c (100%) rename tools/testing/selftests/{vm => mm}/map_populate.c (100%) rename tools/testing/selftests/{vm => mm}/memfd_secret.c (100%) rename tools/testing/selftests/{vm => mm}/migration.c (100%) rename tools/testing/selftests/{vm => mm}/mlock-random-test.c (100%) rename tools/testing/selftests/{vm => mm}/mlock2-tests.c (100%) rename tools/testing/selftests/{vm => mm}/mlock2.h (100%) rename tools/testing/selftests/{vm => mm}/mrelease_test.c (100%) rename tools/testing/selftests/{vm => mm}/mremap_dontunmap.c (100%) rename tools/testing/selftests/{vm => mm}/mremap_test.c (100%) rename tools/testing/selftests/{vm => mm}/on-fault-limit.c (100%) rename tools/testing/selftests/{vm => mm}/pkey-helpers.h (100%) rename tools/testing/selftests/{vm => mm}/pkey-powerpc.h (100%) rename tools/testing/selftests/{vm => mm}/pkey-x86.h (100%) rename tools/testing/selftests/{vm => mm}/protection_keys.c (100%) rename tools/testing/selftests/{vm => mm}/run_vmtests.sh (100%) mode change 100755 => 100644 rename tools/testing/selftests/{vm => mm}/settings (100%) rename tools/testing/selftests/{vm => mm}/soft-dirty.c (100%) rename tools/testing/selftests/{vm => mm}/split_huge_page_test.c (100%) rename tools/testing/selftests/{vm => mm}/test_hmm.sh (100%) mode change 100755 => 100644 rename tools/testing/selftests/{vm => mm}/test_vmalloc.sh (100%) mode change 100755 => 100644 rename tools/testing/selftests/{vm => mm}/thuge-gen.c (100%) rename tools/testing/selftests/{vm => mm}/transhuge-stress.c (100%) rename tools/testing/selftests/{vm => mm}/userfaultfd.c (100%) rename tools/testing/selftests/{vm => mm}/util.h (100%) rename tools/testing/selftests/{vm => mm}/va_128TBswitch.c (100%) rename tools/testing/selftests/{vm => mm}/va_128TBswitch.sh (100%) mode change 100755 => 100644 rename tools/testing/selftests/{vm => mm}/virtual_address_range.c (100%) rename tools/testing/selftests/{vm => mm}/vm_util.c (100%) rename tools/testing/selftests/{vm => mm}/vm_util.h (100%) rename tools/testing/selftests/{vm => mm}/write_hugetlb_memory.sh (100%) rename tools/testing/selftests/{vm => mm}/write_to_hugetlbfs.c (100%) diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index 19f27c0d92e0..a969a2c742b2 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ 
b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -461,13 +461,13 @@ Examples .. _map_hugetlb: ``map_hugetlb`` - see tools/testing/selftests/vm/map_hugetlb.c + see tools/testing/selftests/mm/map_hugetlb.c ``hugepage-shm`` - see tools/testing/selftests/vm/hugepage-shm.c + see tools/testing/selftests/mm/hugepage-shm.c ``hugepage-mmap`` - see tools/testing/selftests/vm/hugepage-mmap.c + see tools/testing/selftests/mm/hugepage-mmap.c The `libhugetlbfs`_ library provides a wide range of userspace tools to help with huge page usability, environment setup, and control. diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index b18416f4500f..facafbdecb95 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -221,7 +221,7 @@ Unit testing ============ This file:: - tools/testing/selftests/vm/gup_test.c + tools/testing/selftests/mm/gup_test.c has the following new calls to exercise the new pin*() wrapper functions: diff --git a/MAINTAINERS b/MAINTAINERS index c726adfd1f0d..8ac1472bea34 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9466,7 +9466,7 @@ F: Documentation/mm/hmm.rst F: include/linux/hmm* F: lib/test_hmm* F: mm/hmm* -F: tools/testing/selftests/vm/*hmm* +F: tools/testing/selftests/mm/*hmm* HOST AP DRIVER M: Jouni Malinen @@ -13484,7 +13484,7 @@ F: include/linux/mmzone.h F: include/linux/pagewalk.h F: mm/ F: tools/mm/ -F: tools/testing/selftests/vm/ +F: tools/testing/selftests/mm/ VMALLOC M: Andrew Morton diff --git a/mm/Kconfig b/mm/Kconfig index ff7b209dec05..39df30dcabe3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1073,7 +1073,7 @@ config GUP_TEST pin_user_pages*(), or pinned via get_user_pages*(), as specified by other command line arguments. - See tools/testing/selftests/vm/gup_test.c + See tools/testing/selftests/mm/gup_test.c comment "GUP_TEST needs to have DEBUG_FS enabled" depends on !GUP_TEST && !DEBUG_FS diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 41b649452560..56a29f2de8e6 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -85,7 +85,7 @@ TARGETS += tmpfs TARGETS += tpm2 TARGETS += user TARGETS += vDSO -TARGETS += vm +TARGETS += mm TARGETS += x86 TARGETS += zram #Please keep the TARGETS list alphabetically sorted diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh index 7424a1f5babc..4bc14d9e8ff1 100755 --- a/tools/testing/selftests/kselftest_deps.sh +++ b/tools/testing/selftests/kselftest_deps.sh @@ -12,9 +12,9 @@ usage() echo -e "Usage: $0 -[p] [test_name]\n" echo -e "\tkselftest_deps.sh [-p] gcc" -echo -e "\tkselftest_deps.sh [-p] gcc vm" +echo -e "\tkselftest_deps.sh [-p] gcc mm" echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc" -echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc vm\n" +echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc mm\n" echo "- Should be run in selftests directory in the kernel repo." echo "- Checks if Kselftests can be built/cross-built on a system." echo "- Parses all test/sub-test Makefile to find library dependencies." @@ -120,7 +120,7 @@ l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \ # Level 2 # Some tests have multiple valid LDLIBS lines for individual sub-tests # that need dependency checks. 
Find them and append them to the tests -# e.g: vm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread +# e.g: mm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread # Filter out VAR_LDLIBS to discard the following: # memfd/Makefile:$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS) # Append space at the end of the list to append more tests. diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/mm/.gitignore similarity index 100% rename from tools/testing/selftests/vm/.gitignore rename to tools/testing/selftests/mm/.gitignore diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/mm/Makefile similarity index 98% rename from tools/testing/selftests/vm/Makefile rename to tools/testing/selftests/mm/Makefile index 89c14e41bd43..6a4b639b2b2b 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -# Makefile for vm selftests +# Makefile for mm selftests -LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h +LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h include local_config.mk diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh similarity index 100% rename from tools/testing/selftests/vm/charge_reserved_hugetlb.sh rename to tools/testing/selftests/mm/charge_reserved_hugetlb.sh diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/mm/check_config.sh similarity index 100% rename from tools/testing/selftests/vm/check_config.sh rename to tools/testing/selftests/mm/check_config.sh diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c similarity index 100% rename from tools/testing/selftests/vm/compaction_test.c rename to tools/testing/selftests/mm/compaction_test.c diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/mm/config similarity index 100% rename from tools/testing/selftests/vm/config rename to tools/testing/selftests/mm/config diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/mm/cow.c similarity index 100% rename from tools/testing/selftests/vm/cow.c rename to tools/testing/selftests/mm/cow.c diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/mm/gup_test.c similarity index 100% rename from tools/testing/selftests/vm/gup_test.c rename to tools/testing/selftests/mm/gup_test.c diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c similarity index 100% rename from tools/testing/selftests/vm/hmm-tests.c rename to tools/testing/selftests/mm/hmm-tests.c diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c similarity index 100% rename from tools/testing/selftests/vm/hugepage-mmap.c rename to tools/testing/selftests/mm/hugepage-mmap.c diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c similarity index 100% rename from tools/testing/selftests/vm/hugepage-mremap.c rename to tools/testing/selftests/mm/hugepage-mremap.c diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c similarity index 100% rename from tools/testing/selftests/vm/hugepage-shm.c rename to tools/testing/selftests/mm/hugepage-shm.c diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c similarity index 100% rename from 
tools/testing/selftests/vm/hugepage-vmemmap.c rename to tools/testing/selftests/mm/hugepage-vmemmap.c diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c similarity index 100% rename from tools/testing/selftests/vm/hugetlb-madvise.c rename to tools/testing/selftests/mm/hugetlb-madvise.c diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh similarity index 100% rename from tools/testing/selftests/vm/hugetlb_reparenting_test.sh rename to tools/testing/selftests/mm/hugetlb_reparenting_test.sh diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c similarity index 100% rename from tools/testing/selftests/vm/khugepaged.c rename to tools/testing/selftests/mm/khugepaged.c diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c similarity index 100% rename from tools/testing/selftests/vm/ksm_functional_tests.c rename to tools/testing/selftests/mm/ksm_functional_tests.c diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c similarity index 100% rename from tools/testing/selftests/vm/ksm_tests.c rename to tools/testing/selftests/mm/ksm_tests.c diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c similarity index 100% rename from tools/testing/selftests/vm/madv_populate.c rename to tools/testing/selftests/mm/madv_populate.c diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c similarity index 100% rename from tools/testing/selftests/vm/map_fixed_noreplace.c rename to tools/testing/selftests/mm/map_fixed_noreplace.c diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c similarity index 100% rename from tools/testing/selftests/vm/map_hugetlb.c rename to tools/testing/selftests/mm/map_hugetlb.c diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/mm/map_populate.c similarity index 100% rename from tools/testing/selftests/vm/map_populate.c rename to tools/testing/selftests/mm/map_populate.c diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c similarity index 100% rename from tools/testing/selftests/vm/memfd_secret.c rename to tools/testing/selftests/mm/memfd_secret.c diff --git a/tools/testing/selftests/vm/migration.c b/tools/testing/selftests/mm/migration.c similarity index 100% rename from tools/testing/selftests/vm/migration.c rename to tools/testing/selftests/mm/migration.c diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c similarity index 100% rename from tools/testing/selftests/vm/mlock-random-test.c rename to tools/testing/selftests/mm/mlock-random-test.c diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c similarity index 100% rename from tools/testing/selftests/vm/mlock2-tests.c rename to tools/testing/selftests/mm/mlock2-tests.c diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/mm/mlock2.h similarity index 100% rename from tools/testing/selftests/vm/mlock2.h rename to tools/testing/selftests/mm/mlock2.h diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c similarity index 100% rename from tools/testing/selftests/vm/mrelease_test.c rename 
to tools/testing/selftests/mm/mrelease_test.c diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c similarity index 100% rename from tools/testing/selftests/vm/mremap_dontunmap.c rename to tools/testing/selftests/mm/mremap_dontunmap.c diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c similarity index 100% rename from tools/testing/selftests/vm/mremap_test.c rename to tools/testing/selftests/mm/mremap_test.c diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c similarity index 100% rename from tools/testing/selftests/vm/on-fault-limit.c rename to tools/testing/selftests/mm/on-fault-limit.c diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h similarity index 100% rename from tools/testing/selftests/vm/pkey-helpers.h rename to tools/testing/selftests/mm/pkey-helpers.h diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h similarity index 100% rename from tools/testing/selftests/vm/pkey-powerpc.h rename to tools/testing/selftests/mm/pkey-powerpc.h diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h similarity index 100% rename from tools/testing/selftests/vm/pkey-x86.h rename to tools/testing/selftests/mm/pkey-x86.h diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c similarity index 100% rename from tools/testing/selftests/vm/protection_keys.c rename to tools/testing/selftests/mm/protection_keys.c diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh old mode 100755 new mode 100644 similarity index 100% rename from tools/testing/selftests/vm/run_vmtests.sh rename to tools/testing/selftests/mm/run_vmtests.sh diff --git a/tools/testing/selftests/vm/settings b/tools/testing/selftests/mm/settings similarity index 100% rename from tools/testing/selftests/vm/settings rename to tools/testing/selftests/mm/settings diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c similarity index 100% rename from tools/testing/selftests/vm/soft-dirty.c rename to tools/testing/selftests/mm/soft-dirty.c diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c similarity index 100% rename from tools/testing/selftests/vm/split_huge_page_test.c rename to tools/testing/selftests/mm/split_huge_page_test.c diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/mm/test_hmm.sh old mode 100755 new mode 100644 similarity index 100% rename from tools/testing/selftests/vm/test_hmm.sh rename to tools/testing/selftests/mm/test_hmm.sh diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh old mode 100755 new mode 100644 similarity index 100% rename from tools/testing/selftests/vm/test_vmalloc.sh rename to tools/testing/selftests/mm/test_vmalloc.sh diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c similarity index 100% rename from tools/testing/selftests/vm/thuge-gen.c rename to tools/testing/selftests/mm/thuge-gen.c diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c similarity index 100% rename from tools/testing/selftests/vm/transhuge-stress.c rename to 
tools/testing/selftests/mm/transhuge-stress.c diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c similarity index 100% rename from tools/testing/selftests/vm/userfaultfd.c rename to tools/testing/selftests/mm/userfaultfd.c diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/mm/util.h similarity index 100% rename from tools/testing/selftests/vm/util.h rename to tools/testing/selftests/mm/util.h diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/mm/va_128TBswitch.c similarity index 100% rename from tools/testing/selftests/vm/va_128TBswitch.c rename to tools/testing/selftests/mm/va_128TBswitch.c diff --git a/tools/testing/selftests/vm/va_128TBswitch.sh b/tools/testing/selftests/mm/va_128TBswitch.sh old mode 100755 new mode 100644 similarity index 100% rename from tools/testing/selftests/vm/va_128TBswitch.sh rename to tools/testing/selftests/mm/va_128TBswitch.sh diff --git a/tools/testing/selftests/vm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c similarity index 100% rename from tools/testing/selftests/vm/virtual_address_range.c rename to tools/testing/selftests/mm/virtual_address_range.c diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/mm/vm_util.c similarity index 100% rename from tools/testing/selftests/vm/vm_util.c rename to tools/testing/selftests/mm/vm_util.c diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/mm/vm_util.h similarity index 100% rename from tools/testing/selftests/vm/vm_util.h rename to tools/testing/selftests/mm/vm_util.h diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh similarity index 100% rename from tools/testing/selftests/vm/write_hugetlb_memory.sh rename to tools/testing/selftests/mm/write_hugetlb_memory.sh diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c similarity index 100% rename from tools/testing/selftests/vm/write_to_hugetlbfs.c rename to tools/testing/selftests/mm/write_to_hugetlbfs.c From 6c364edc194e104566f4de72f7a2af5b8fc17110 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 3 Jan 2023 18:07:54 +0000 Subject: [PATCH 117/505] Docs/admin-guide/mm/numaperf: increase depth of subsections Each section of numaperf.rst has zero depth, and therefore be exposed to the index of admin-guide/mm. Especially 'See Also' section on the index makes the document weird. Hide the sections from the index by giving the document a title and increasing the depth of each section. [sj@kernel.org: change title to fix duplicate label warning] Link: https://lkml.kernel.org/r/20230106194927.152663-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230103180754.129637-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/numaperf.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/numaperf.rst b/Documentation/admin-guide/mm/numaperf.rst index 166697325947..544a6d16c801 100644 --- a/Documentation/admin-guide/mm/numaperf.rst +++ b/Documentation/admin-guide/mm/numaperf.rst @@ -1,6 +1,9 @@ .. 
_numaperf: -============= +======================= +NUMA Memory Performance +======================= + NUMA Locality ============= @@ -61,7 +64,6 @@ that are CPUs and hence suitable for generic task scheduling, and IO initiators such as GPUs and NICs. Unlike access class 0, only nodes containing CPUs are considered. -================ NUMA Performance ================ @@ -96,7 +98,6 @@ for the platform. Access class 1 takes the same form but only includes values for CPU to memory activity. -========== NUMA Cache ========== @@ -170,7 +171,6 @@ The "size" is the number of bytes provided by this cache level. The "write_policy" will be 0 for write-back, and non-zero for write-through caching. -======== See Also ======== From 4b89a37d54a0b5ed6b2e5a9afc44a15a22e563f5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 3 Jan 2023 11:44:30 +0100 Subject: [PATCH 118/505] fs: don't allocate blocks beyond EOF from __mpage_writepage When __mpage_writepage() is called for a page beyond EOF, it will go and allocate all blocks underlying the page. This is not only unnecessary but this way blocks can get leaked (e.g. if a page beyond EOF is marked dirty but in the end write fails and i_size is not extended). Link: https://lkml.kernel.org/r/20230103104430.27749-1-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Al Viro Signed-off-by: Andrew Morton --- fs/mpage.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/mpage.c b/fs/mpage.c index d36a95473f77..b8e7975159bc 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -524,6 +524,12 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, */ BUG_ON(!PageUptodate(page)); block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); + /* + * Whole page beyond EOF? Skip allocating blocks to avoid leaking + * space. + */ + if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) + goto page_is_mapped; last_block = (i_size - 1) >> blkbits; map_bh.b_page = page; for (page_block = 0; page_block < blocks_per_page; ) { From df32de1433412621b92daf1b3369ac053214031e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 3 Jan 2023 12:01:19 +0900 Subject: [PATCH 119/505] zram: correctly handle all next_arg() cases When supplied buffer does not have assignment sign next_arg() sets `val` pointer to NULL, so we cannot dereference it. Add a NULL pointer test to handle `param` case, in addition to `*val` test, which handles cases when param has no value assigned to it: `param=`. 
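For clarity, the loop shape after this change is, simplified from the two hunks below:

    while (*args) {
        args = next_arg(args, &param, &val);

        /*
         * next_arg() leaves val == NULL when the token carries no '='
         * at all, and points val at an empty string for "param=".
         * Reject both before looking at the value.
         */
        if (!val || !*val)
            return -EINVAL;

        /* ... strcmp(param, ...) and use val ... */
    }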
Link: https://lkml.kernel.org/r/20230103030119.1496358-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7becd5448791..5d1088a645e3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1140,7 +1140,7 @@ static ssize_t recomp_algorithm_store(struct device *dev, while (*args) { args = next_arg(args, ¶m, &val); - if (!*val) + if (!val || !*val) return -EINVAL; if (!strcmp(param, "algo")) { @@ -1824,7 +1824,7 @@ static ssize_t recompress_store(struct device *dev, while (*args) { args = next_arg(args, ¶m, &val); - if (!*val) + if (!val || !*val) return -EINVAL; if (!strcmp(param, "type")) { From da0618c146ca0e1412173a8a229dd737a73b1a4f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 2 Jan 2023 16:11:26 +0000 Subject: [PATCH 120/505] selftest/vm: add mremap expand merge offset test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test to assert that we can mremap() and expand a mapping starting from an offset within an existing mapping. We unmap the last page in a 3 page mapping to ensure that the remap should always succeed, before remapping from the 2nd page. This is additionally a regression test for the issue solved in "mm, mremap: fix mremap() expanding vma with addr inside vma" and confirmed to fail prior to the change and pass after it. Finally, this patch updates the existing mremap expand merge test to check error conditions and reduce code duplication between the two tests. [lstoakes@gmail.com: increment num_expand_tests so test doesn't complain about unexpected tests being run] Link: https://lkml.kernel.org/r/8ff3ba3cadc0b6c1b2688ae5c851bf73aa062d57.1673701836.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/02b117a8ffd52acc01dc66c2fb39754f08d92c0e.1672675824.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Jakub Matěna Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 123 ++++++++++++++++++----- 1 file changed, 98 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 9496346973d4..5c3773de9f0f 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -118,6 +118,33 @@ static unsigned long long get_mmap_min_addr(void) return addr; } +/* + * Using /proc/self/maps, assert that the specified address range is contained + * within a single mapping. + */ +static bool is_range_mapped(FILE *maps_fp, void *start, void *end) +{ + char *line = NULL; + size_t len = 0; + bool success = false; + + rewind(maps_fp); + + while (getline(&line, &len, maps_fp) != -1) { + char *first = strtok(line, "- "); + void *first_val = (void *)strtol(first, NULL, 16); + char *second = strtok(NULL, "- "); + void *second_val = (void *) strtol(second, NULL, 16); + + if (first_val <= start && second_val >= end) { + success = true; + break; + } + } + + return success; +} + /* * This test validates that merge is called when expanding a mapping. * Mapping containing three pages is created, middle page is unmapped @@ -125,41 +152,76 @@ static unsigned long long get_mmap_min_addr(void) * it fills the created hole. 
The two parts should merge creating * single mapping with three pages. */ -static void mremap_expand_merge(unsigned long page_size) +static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size) { char *test_name = "mremap expand merge"; - FILE *fp; - char *line = NULL; - size_t len = 0; bool success = false; - char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + char *remap, *start; + + start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + ksft_print_msg("mmap failed: %s\n", strerror(errno)); + goto out; + } munmap(start + page_size, page_size); - mremap(start, page_size, 2 * page_size, 0); - - fp = fopen("/proc/self/maps", "r"); - if (fp == NULL) { - ksft_test_result_fail("%s\n", test_name); - return; + remap = mremap(start, page_size, 2 * page_size, 0); + if (remap == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + munmap(start, page_size); + munmap(start + 2 * page_size, page_size); + goto out; } - while (getline(&line, &len, fp) != -1) { - char *first = strtok(line, "- "); - void *first_val = (void *)strtol(first, NULL, 16); - char *second = strtok(NULL, "- "); - void *second_val = (void *) strtol(second, NULL, 16); + success = is_range_mapped(maps_fp, start, start + 3 * page_size); + munmap(start, 3 * page_size); - if (first_val == start && second_val == start + 3 * page_size) { - success = true; - break; - } - } +out: + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + +/* + * Similar to mremap_expand_merge() except instead of removing the middle page, + * we remove the last then attempt to remap offset from the second page. This + * should result in the mapping being restored to its former state. + */ +static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size) +{ + + char *test_name = "mremap expand merge offset"; + bool success = false; + char *remap, *start; + + start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + ksft_print_msg("mmap failed: %s\n", strerror(errno)); + goto out; + } + + /* Unmap final page to ensure we have space to expand. 
*/ + munmap(start + 2 * page_size, page_size); + remap = mremap(start + page_size, page_size, 2 * page_size, 0); + if (remap == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + munmap(start, 2 * page_size); + goto out; + } + + success = is_range_mapped(maps_fp, start, start + 3 * page_size); + munmap(start, 3 * page_size); + +out: if (success) ksft_test_result_pass("%s\n", test_name); else ksft_test_result_fail("%s\n", test_name); - fclose(fp); } /* @@ -380,11 +442,12 @@ int main(int argc, char **argv) int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; - int num_expand_tests = 1; + int num_expand_tests = 2; struct test test_cases[MAX_TEST]; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; time_t t; + FILE *maps_fp; pattern_seed = (unsigned int) time(&t); @@ -458,7 +521,17 @@ int main(int argc, char **argv) run_mremap_test_case(test_cases[i], &failures, threshold_mb, pattern_seed); - mremap_expand_merge(page_size); + maps_fp = fopen("/proc/self/maps", "r"); + + if (maps_fp == NULL) { + ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } + + mremap_expand_merge(maps_fp, page_size); + mremap_expand_merge_offset(maps_fp, page_size); + + fclose(maps_fp); if (run_perf_tests) { ksft_print_msg("\n%s\n", From fc4f4be9b5271e43eeb4c675d190fa9734de9ea3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Jan 2023 17:08:54 +0100 Subject: [PATCH 121/505] mm/nommu: factor out check for NOMMU shared mappings into is_nommu_shared_mapping() Patch series "mm/nommu: don't use VM_MAYSHARE for MAP_PRIVATE mappings". Trying to reduce the confusion around VM_SHARED and VM_MAYSHARE first requires !CONFIG_MMU to stop using VM_MAYSHARE for MAP_PRIVATE mappings. CONFIG_MMU only sets VM_MAYSHARE for MAP_SHARED mappings. This paves the way for further VM_MAYSHARE and VM_SHARED cleanups: for example, renaming VM_MAYSHARED to VM_MAP_SHARED to make it cleaner what is actually means. Let's first get the weird case out of the way and not use VM_MAYSHARE in MAP_PRIVATE mappings, using a new VM_MAYOVERLAY flag instead. This patch (of 3): We want to stop using VM_MAYSHARE in private mappings to pave the way for clarifying the semantics of VM_MAYSHARE vs. VM_SHARED and reduce the confusion. While CONFIG_MMU uses VM_MAYSHARE to represent MAP_SHARED, !CONFIG_MMU also sets VM_MAYSHARE for selected R/O private file mappings that are an effective overlay of a file mapping. Let's factor out all relevant VM_MAYSHARE checks in !CONFIG_MMU code into is_nommu_shared_mapping() first. Note that whenever VM_SHARED is set, VM_MAYSHARE must be set as well (unless there is a serious BUG). So there is not need to test for VM_SHARED manually. No functional change intended. 
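The conversion applied at each !CONFIG_MMU call site follows one simple pattern (illustrative only; variable names differ per caller):

    /* before: open-coded, sometimes also testing VM_SHARED */
    if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE))
        ...

    /* after: rely on VM_SHARED implying VM_MAYSHARE */
    if (is_nommu_shared_mapping(vma->vm_flags))
        ...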
Link: https://lkml.kernel.org/r/20230102160856.500584-1-david@redhat.com Link: https://lkml.kernel.org/r/20230102160856.500584-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Arnd Bergmann Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: Nicolas Pitre Cc: Pavel Begunkov Signed-off-by: Andrew Morton --- drivers/char/mem.c | 2 +- fs/cramfs/inode.c | 2 +- fs/proc/task_nommu.c | 2 +- fs/ramfs/file-nommu.c | 2 +- fs/romfs/mmap-nommu.c | 2 +- include/linux/mm.h | 15 +++++++++++++++ io_uring/io_uring.c | 2 +- mm/nommu.c | 11 ++++++----- 8 files changed, 27 insertions(+), 11 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 83bf2a4dcb57..ffb101d349f0 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -343,7 +343,7 @@ static unsigned zero_mmap_capabilities(struct file *file) /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_struct *vma) { - return vma->vm_flags & VM_MAYSHARE; + return is_nommu_shared_mapping(vma->vm_flags); } #else diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 61ccf7722fc3..50e4e060db68 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -437,7 +437,7 @@ bailout: static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; + return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS; } static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 2fd06f52b6a4..0ec35072a8e5 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -38,7 +38,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) } if (atomic_read(&mm->mm_count) > 1 || - vma->vm_flags & VM_MAYSHARE) { + is_nommu_shared_mapping(vma->vm_flags)) { sbytes += size; } else { bytes += size; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index cb240eac5036..cd4537692751 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -264,7 +264,7 @@ out: */ static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { - if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) + if (!is_nommu_shared_mapping(vma->vm_flags)) return -ENOSYS; file_accessed(file); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index 2c4a23113fb5..4578dc45e50a 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file, */ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; + return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS; } static unsigned romfs_mmap_capabilities(struct file *file) diff --git a/include/linux/mm.h b/include/linux/mm.h index eb5bfc77c2c2..791bac40bf8e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1347,6 +1347,21 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +#ifndef CONFIG_MMU +static inline bool is_nommu_shared_mapping(vm_flags_t flags) +{ + /* + * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected + * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of + * a file mapping. R/O MAP_PRIVATE mappings might still modify + * underlying memory if ptrace is active, so this is only possible if + * ptrace does not apply. Note that there is no mprotect() to upgrade + * write permissions later. 
+ */ + return flags & VM_MAYSHARE; +} +#endif + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2ac1cd8d23ea..3a934f733136 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3206,7 +3206,7 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; + return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; } static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) diff --git a/mm/nommu.c b/mm/nommu.c index 5b83938ecb67..1671ebbecb8d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -958,9 +958,10 @@ static int do_mmap_private(struct vm_area_struct *vma, */ if (capabilities & NOMMU_MAP_DIRECT) { ret = call_mmap(vma->vm_file, vma); + /* shouldn't return success if we're not sharing */ + if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags))) + ret = -ENOSYS; if (ret == 0) { - /* shouldn't return success if we're not sharing */ - BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; return 0; } @@ -1106,7 +1107,7 @@ unsigned long do_mmap(struct file *file, * these cases, sharing is handled in the driver or filesystem rather * than here */ - if (vm_flags & VM_MAYSHARE) { + if (is_nommu_shared_mapping(vm_flags)) { struct vm_region *pregion; unsigned long pglen, rpglen, pgend, rpgend, start; @@ -1116,7 +1117,7 @@ unsigned long do_mmap(struct file *file, for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { pregion = rb_entry(rb, struct vm_region, vm_rb); - if (!(pregion->vm_flags & VM_MAYSHARE)) + if (!is_nommu_shared_mapping(pregion->vm_flags)) continue; /* search for overlapping mappings on the same file */ @@ -1600,7 +1601,7 @@ static unsigned long do_mremap(unsigned long addr, if (vma->vm_end != vma->vm_start + old_len) return (unsigned long) -EFAULT; - if (vma->vm_flags & VM_MAYSHARE) + if (is_nommu_shared_mapping(vma->vm_flags)) return (unsigned long) -EPERM; if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) From b6b7a8faf05c709cd9f63d3b7d9c66bd91bc3b0d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Jan 2023 17:08:55 +0100 Subject: [PATCH 122/505] mm/nommu: don't use VM_MAYSHARE for MAP_PRIVATE mappings Let's stop using VM_MAYSHARE for MAP_PRIVATE mappings and use VM_MAYOVERLAY instead. Rewrite determine_vm_flags() to make the whole logic easier to digest, and to cleanly separate MAP_PRIVATE vs. MAP_SHARED. No functional change intended. 
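To summarize the rewritten logic, a simplified sketch (the NOMMU_MAP_DIRECT/NOMMU_VMFLAGS capability handling is only shown in the real hunk below):

    if (!file) {
        /* MAP_ANONYMOUS: ordinary private memory, no fork() on nommu */
        vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
    } else if (flags & MAP_PRIVATE) {
        /* MAP_PRIVATE file mapping */
        if (!(prot & PROT_WRITE) && !current->ptrace)
            /* R/O and not traced: may overlay the file mapping */
            vm_flags |= VM_MAYOVERLAY;
    } else {
        /* MAP_SHARED file mapping */
        vm_flags |= VM_SHARED | VM_MAYSHARE;
    }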
Link: https://lkml.kernel.org/r/20230102160856.500584-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: Nicolas Pitre Cc: Pavel Begunkov Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++- mm/nommu.c | 53 +++++++++++++++++++++++++++------------------- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 791bac40bf8e..8a8563359946 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -276,7 +276,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ +#else /* CONFIG_MMU */ +#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ +#define VM_UFFD_MISSING 0 +#endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ @@ -1358,7 +1363,7 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later. */ - return flags & VM_MAYSHARE; + return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } #endif diff --git a/mm/nommu.c b/mm/nommu.c index 1671ebbecb8d..df1711acdf5b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -892,28 +892,35 @@ static unsigned long determine_vm_flags(struct file *file, unsigned long vm_flags; vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); - /* vm_flags |= mm->def_flags; */ - if (!(capabilities & NOMMU_MAP_DIRECT)) { - /* attempt to share read-only copies of mapped file chunks */ + if (!file) { + /* + * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because + * there is no fork(). + */ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - if (file && !(prot & PROT_WRITE)) - vm_flags |= VM_MAYSHARE; - } else { - /* overlay a shareable mapping on the backing device or inode - * if possible - used for chardevs, ramfs/tmpfs/shmfs and - * romfs/cramfs */ - vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); - if (flags & MAP_SHARED) - vm_flags |= VM_SHARED; - } + } else if (flags & MAP_PRIVATE) { + /* MAP_PRIVATE file mapping */ + if (capabilities & NOMMU_MAP_DIRECT) + vm_flags |= (capabilities & NOMMU_VMFLAGS); + else + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - /* refuse to let anyone share private mappings with this process if - * it's being traced - otherwise breakpoints set in it may interfere - * with another untraced process - */ - if ((flags & MAP_PRIVATE) && current->ptrace) - vm_flags &= ~VM_MAYSHARE; + if (!(prot & PROT_WRITE) && !current->ptrace) + /* + * R/O private file mapping which cannot be used to + * modify memory, especially also not via active ptrace + * (e.g., set breakpoints) or later by upgrading + * permissions (no mprotect()). We can try overlaying + * the file mapping, which will work e.g., on chardevs, + * ramfs/tmpfs/shmfs and romfs/cramf. + */ + vm_flags |= VM_MAYOVERLAY; + } else { + /* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. 
*/ + vm_flags |= VM_SHARED | VM_MAYSHARE | + (capabilities & NOMMU_VMFLAGS); + } return vm_flags; } @@ -952,9 +959,11 @@ static int do_mmap_private(struct vm_area_struct *vma, void *base; int ret, order; - /* invoke the file's mapping function so that it can keep track of - * shared mappings on devices or memory - * - VM_MAYSHARE will be set if it may attempt to share + /* + * Invoke the file's mapping function so that it can keep track of + * shared mappings on devices or memory. VM_MAYOVERLAY will be set if + * it may attempt to share, which will make is_nommu_shared_mapping() + * happy. */ if (capabilities & NOMMU_MAP_DIRECT) { ret = call_mmap(vma->vm_file, vma); From 997931ce02b72f15c40e742ee035ce5643ba574f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Jan 2023 17:08:56 +0100 Subject: [PATCH 123/505] drivers/misc/open-dice: don't touch VM_MAYSHARE A MAP_SHARED mapping always has VM_MAYSHARE set, and writable (VM_MAYWRITE) MAP_SHARED mappings have VM_SHARED set as well. To identify a MAP_SHARED mapping, it's sufficient to look at VM_MAYSHARE. We cannot have VM_MAYSHARE|VM_WRITE mappings without having VM_SHARED set. Consequently, current code will never actually end up clearing VM_MAYSHARE and that code is confusing, because nobody is supposed to mess with VM_MAYWRITE. Let's clean it up and restructure the code. No functional change intended. Link: https://lkml.kernel.org/r/20230102160856.500584-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: Nicolas Pitre Cc: Pavel Begunkov Signed-off-by: Andrew Morton --- drivers/misc/open-dice.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c index c61be3404c6f..9dda47b3fd70 100644 --- a/drivers/misc/open-dice.c +++ b/drivers/misc/open-dice.c @@ -90,15 +90,13 @@ static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma) { struct open_dice_drvdata *drvdata = to_open_dice_drvdata(filp); - /* Do not allow userspace to modify the underlying data. */ - if ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED)) - return -EPERM; - - /* Ensure userspace cannot acquire VM_WRITE + VM_SHARED later. */ - if (vma->vm_flags & VM_WRITE) - vma->vm_flags &= ~VM_MAYSHARE; - else if (vma->vm_flags & VM_SHARED) + if (vma->vm_flags & VM_MAYSHARE) { + /* Do not allow userspace to modify the underlying data. */ + if (vma->vm_flags & VM_WRITE) + return -EPERM; + /* Ensure userspace cannot acquire VM_WRITE later. */ vma->vm_flags &= ~VM_MAYWRITE; + } /* Create write-combine mapping so all clients observe a wipe. */ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); From 8788f6781486769d9598dcaedc3fe0eb12fc3e59 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 30 Dec 2022 14:52:51 -0700 Subject: [PATCH 124/505] mm: add vma_has_recency() Add vma_has_recency() to indicate whether a VMA may exhibit temporal locality that the LRU algorithm relies on. This function returns false for VMAs marked by VM_SEQ_READ or VM_RAND_READ. While the former flag indicates linear access, i.e., a special case of spatial locality, both flags indicate a lack of temporal locality, i.e., the reuse of an area within a relatively small duration. "Recency" is chosen over "locality" to avoid confusion between temporal and spatial localities. Before this patch, the active/inactive LRU only ignored the accessed bit from VMAs marked by VM_SEQ_READ. 
After this patch, the active/inactive LRU and MGLRU share the same logic: they both ignore the accessed bit if vma_has_recency() returns false. For the active/inactive LRU, the following fio test showed a [6, 8]% increase in IOPS when randomly accessing mapped files under memory pressure. kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo) kb=$((kb - 8*1024*1024)) modprobe brd rd_nr=1 rd_size=$kb dd if=/dev/zero of=/dev/ram0 bs=1M mkfs.ext4 /dev/ram0 mount /dev/ram0 /mnt/ swapoff -a fio --name=test --directory=/mnt/ --ioengine=mmap --numjobs=8 \ --size=8G --rw=randrw --time_based --runtime=10m \ --group_reporting The discussion that led to this patch is here [1]. Additional test results are available in that thread. [1] https://lore.kernel.org/r/Y31s%2FK8T85jh05wH@google.com/ Link: https://lkml.kernel.org/r/20221230215252.2628425-1-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alexander Viro Cc: Andrea Righi Cc: Johannes Weiner Cc: Michael Larabel Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 8 ++++++++ mm/memory.c | 7 +++---- mm/rmap.c | 42 +++++++++++++++++---------------------- mm/vmscan.c | 5 ++++- 4 files changed, 33 insertions(+), 29 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index acf03147fff8..4abebf2615a3 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -594,4 +594,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, #endif } +static inline bool vma_has_recency(struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) + return false; + + return true; +} + #endif diff --git a/mm/memory.c b/mm/memory.c index b0dda866ffe6..90f8f72777c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1402,8 +1402,7 @@ again: force_flush = 1; } } - if (pte_young(ptent) && - likely(!(vma->vm_flags & VM_SEQ_READ))) + if (pte_young(ptent) && likely(vma_has_recency(vma))) mark_page_accessed(page); } rss[mm_counter(page)]--; @@ -5115,8 +5114,8 @@ static inline void mm_account_fault(struct pt_regs *regs, #ifdef CONFIG_LRU_GEN static void lru_gen_enter_fault(struct vm_area_struct *vma) { - /* the LRU algorithm doesn't apply to sequential or random reads */ - current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); + /* the LRU algorithm only applies to accesses with recency */ + current->in_lru_fault = vma_has_recency(vma); } static void lru_gen_exit_fault(void) diff --git a/mm/rmap.c b/mm/rmap.c index 32e48b1c5847..ab74e0547a52 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { - if (lru_gen_enabled() && pte_young(*pvmw.pte) && - !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { + if (lru_gen_enabled() && pte_young(*pvmw.pte)) { lru_gen_look_around(&pvmw); referenced++; } if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - /* - * Don't treat a reference through - * a sequentially read mapping as such. - * If the folio has been used in another mapping, - * we will catch it; if this other mapping is - * already gone, the unmap path will have set - * the referenced flag or activated the folio. 
- */ - if (likely(!(vma->vm_flags & VM_SEQ_READ))) - referenced++; - } + pvmw.pte)) + referenced++; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) @@ -875,7 +864,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) struct folio_referenced_arg *pra = arg; struct mem_cgroup *memcg = pra->memcg; - if (!mm_match_cgroup(vma->vm_mm, memcg)) + /* + * Ignore references from this mapping if it has no recency. If the + * folio has been used in another mapping, we will catch it; if this + * other mapping is already gone, the unmap path will have set the + * referenced flag or activated the folio in zap_pte_range(). + */ + if (!vma_has_recency(vma)) + return true; + + /* + * If we are reclaiming on behalf of a cgroup, skip counting on behalf + * of references from different cgroups. + */ + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) return true; return false; @@ -906,6 +908,7 @@ int folio_referenced(struct folio *folio, int is_locked, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, .try_lock = true, + .invalid_vma = invalid_folio_referenced_vma, }; *vm_flags = 0; @@ -921,15 +924,6 @@ int folio_referenced(struct folio *folio, int is_locked, return 1; } - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg) { - rwc.invalid_vma = invalid_folio_referenced_vma; - } - rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; diff --git a/mm/vmscan.c b/mm/vmscan.c index 7c3fd900a89d..fe30b8c43f92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3794,7 +3794,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal if (is_vm_hugetlb_page(vma)) return true; - if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) + if (!vma_has_recency(vma)) + return true; + + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) return true; if (vma == get_gate_vma(vma->vm_mm)) From 17e810229cb3068b692fa078bd9b3a6527e0866a Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 30 Dec 2022 14:52:52 -0700 Subject: [PATCH 125/505] mm: support POSIX_FADV_NOREUSE This patch adds POSIX_FADV_NOREUSE to vma_has_recency() so that the LRU algorithm can ignore access to mapped files marked by this flag. The advantages of POSIX_FADV_NOREUSE are: 1. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not alter the default readahead behavior. 2. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not split VMAs and therefore does not take mmap_lock. 3. Unlike MADV_COLD, setting it has a negligible cost, regardless of how many pages it affects. Its limitations are: 1. Like POSIX_FADV_RANDOM and POSIX_FADV_SEQUENTIAL, it currently does not support range. IOW, its scope is the entire file. 2. It currently does not ignore access through file descriptors. Specifically, for the active/inactive LRU, given a file page shared by two users and one of them having set POSIX_FADV_NOREUSE on the file, this page will be activated upon the second user accessing it. This corner case can be covered by checking POSIX_FADV_NOREUSE before calling folio_mark_accessed() on the read path. But it is considered not worth the effort. There have been a few attempts to support POSIX_FADV_NOREUSE, e.g., [1]. This time the goal is to fill a niche: a few desktop applications, e.g., large file transferring and video encoding/decoding, want fast file streaming with mmap() rather than direct IO. 
Among those applications, an SVT-AV1 regression was reported when running with MGLRU [2]. The following test can reproduce that regression. kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo) kb=$((kb - 8*1024*1024)) modprobe brd rd_nr=1 rd_size=$kb dd if=/dev/zero of=/dev/ram0 bs=1M mkfs.ext4 /dev/ram0 mount /dev/ram0 /mnt/ swapoff -a fallocate -l 8G /mnt/swapfile mkswap /mnt/swapfile swapon /mnt/swapfile wget http://ultravideo.cs.tut.fi/video/Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z 7z e -o/mnt/ Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z SvtAv1EncApp --preset 12 -w 3840 -h 2160 \ -i /mnt/Bosphorus_3840x2160.y4m For MGLRU, the following change showed a [9-11]% increase in FPS, which makes it on par with the active/inactive LRU. patch Source/App/EncApp/EbAppMain.c < #include 35d35 < #include /* _O_BINARY */ 117a118 > posix_fadvise(config->mmap.fd, 0, 0, POSIX_FADV_NOREUSE); EOF [1] https://lore.kernel.org/r/1308923350-7932-1-git-send-email-andrea@betterlinux.com/ [2] https://openbenchmarking.org/result/2209259-PTS-MGLRU8GB57 Link: https://lkml.kernel.org/r/20221230215252.2628425-2-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alexander Viro Cc: Andrea Righi Cc: Johannes Weiner Cc: Michael Larabel Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 ++ include/linux/mm_inline.h | 3 +++ mm/fadvise.c | 5 ++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index c1769a2c5d70..d353c262d669 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) +#define FMODE_NOREUSE ((__force fmode_t)0x800000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 4abebf2615a3..26dcbda07e92 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -599,6 +599,9 @@ static inline bool vma_has_recency(struct vm_area_struct *vma) if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; + if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) + return false; + return true; } diff --git a/mm/fadvise.c b/mm/fadvise.c index bf04fec87f35..fb7c5f43fd2a 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) case POSIX_FADV_NORMAL: file->f_ra.ra_pages = bdi->ra_pages; spin_lock(&file->f_lock); - file->f_mode &= ~FMODE_RANDOM; + file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE); spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: @@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: + spin_lock(&file->f_lock); + file->f_mode |= FMODE_NOREUSE; + spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: __filemap_fdatawrite_range(mapping, offset, endbyte, From 02d65d6fb1aae151570c8bfd1bd77a8153d2e607 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 6 Jan 2023 15:52:51 -0600 Subject: [PATCH 126/505] mm: introduce folio_is_pfmemalloc Add a folio equivalent for page_is_pfmemalloc. This removes two instances of page_is_pfmemalloc(folio_page(folio, 0)) so the folio can be used directly. 
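Both conversions follow the same pattern; taken from the slab hunk below:

    /* before */
    if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0)))
        slab_set_pfmemalloc(slab);

    /* after: test the folio directly */
    if (sk_memalloc_socks() && folio_is_pfmemalloc(folio))
        slab_set_pfmemalloc(slab);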
Link: https://lkml.kernel.org/r/20230106215251.599222-1-sidhartha.kumar@oracle.com Suggested-by: Matthew Wilcox Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: SeongJae Park Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 +++++++++++++++ mm/slab.c | 2 +- mm/slub.c | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a8563359946..76c97cb8ee9a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1926,6 +1926,21 @@ static inline bool page_is_pfmemalloc(const struct page *page) return (uintptr_t)page->lru.next & BIT(1); } +/* + * Return true only if the folio has been allocated with + * ALLOC_NO_WATERMARKS and the low watermark was not + * met implying that the system is under some pressure. + */ +static inline bool folio_is_pfmemalloc(const struct folio *folio) +{ + /* + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. + */ + return (uintptr_t)folio->lru.next & BIT(1); +} + /* * Only to be called by the page allocator on a freshly allocated * page. diff --git a/mm/slab.c b/mm/slab.c index 7a269db050ee..b77be9c6d6b1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1373,7 +1373,7 @@ static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, /* Make the flag visible before any changes to folio->mapping */ smp_wmb(); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0))) + if (sk_memalloc_socks() && folio_is_pfmemalloc(folio)) slab_set_pfmemalloc(slab); return slab; diff --git a/mm/slub.c b/mm/slub.c index 13459c69095a..67020074ecb4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1859,7 +1859,7 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, __folio_set_slab(folio); /* Make the flag visible before any changes to folio->mapping */ smp_wmb(); - if (page_is_pfmemalloc(folio_page(folio, 0))) + if (folio_is_pfmemalloc(folio)) slab_set_pfmemalloc(slab); return slab; From 61d3d5108eb621d2a097c41f6cc83bf63b1b6c03 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 6 Jan 2023 14:59:00 +0100 Subject: [PATCH 127/505] mm: remove PageMovable export The only in-kernel users that need PageMovable() to be exported are z3fold and zsmalloc and they are only using it for dubious debugging functionality. So remove those usages and the export so that no driver code accidentally thinks that they are allowed to use this symbol. 
Link: https://lkml.kernel.org/r/20230106135900.3763622-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Reviewed-by: Sergey Senozhatsky Reviewed-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Christoph Hellwig Acked-by: Minchan Kim Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/compaction.c | 1 - mm/z3fold.c | 2 -- mm/zsmalloc.c | 3 --- 3 files changed, 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index ca1603524bbe..62a61de44658 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -122,7 +122,6 @@ bool PageMovable(struct page *page) return false; } -EXPORT_SYMBOL(PageMovable); void __SetPageMovable(struct page *page, const struct movable_operations *mops) { diff --git a/mm/z3fold.c b/mm/z3fold.c index a4de0c317ac7..0cef845d397b 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) struct z3fold_header *zhdr; struct z3fold_pool *pool; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); if (test_bit(PAGE_HEADLESS, &page->private)) @@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page, struct z3fold_header *zhdr, *new_zhdr; struct z3fold_pool *pool; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9445bee6b014..6aafacd664fc 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1973,7 +1973,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. */ - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); @@ -2005,7 +2004,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, if (mode == MIGRATE_SYNC_NO_COPY) return -EINVAL; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); /* The page is locked, so this pointer must remain valid */ @@ -2070,7 +2068,6 @@ static void zs_page_putback(struct page *page) { struct zspage *zspage; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); From fc8c7d2380ab7d6aa1ddef1f69169ef9a15596eb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:30 +0000 Subject: [PATCH 128/505] mm/damon/vaddr: rename 'damon_young_walk_private->page_sz' to 'folio_sz' Patch series "mm/damon/{v,p}addr: misc fixups for folio usage". DAMON's monitoring operations set for the virtual and the physical address spaces use folio now, but some code is not reflecting the fact. Further cleanup the code for folio usage. This patch (of 6): DAMON's virtual address space monitoring operations set is using folio now. Rename 'damon_pa_access_chk_result->page_sz' to reflect the fact. 
Link: https://lkml.kernel.org/r/20230109213335.62525-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230109213335.62525-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 9d92c5eb3a1f..d6cb1fca1769 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -422,7 +422,8 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) } struct damon_young_walk_private { - unsigned long *page_sz; + /* size of the folio for the access checked virtual memory address */ + unsigned long *folio_sz; bool young; }; @@ -452,7 +453,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, if (pmd_young(*pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = HPAGE_PMD_SIZE; + *priv->folio_sz = HPAGE_PMD_SIZE; priv->young = true; } folio_put(folio); @@ -474,7 +475,7 @@ regular_page: goto out; if (pte_young(*pte) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = PAGE_SIZE; + *priv->folio_sz = PAGE_SIZE; priv->young = true; } folio_put(folio); @@ -504,7 +505,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, if (pte_young(entry) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->page_sz = huge_page_size(h); + *priv->folio_sz = huge_page_size(h); priv->young = true; } @@ -524,10 +525,10 @@ static const struct mm_walk_ops damon_young_ops = { }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, - unsigned long *page_sz) + unsigned long *folio_sz) { struct damon_young_walk_private arg = { - .page_sz = page_sz, + .folio_sz = folio_sz, .young = false, }; @@ -547,18 +548,18 @@ static void __damon_va_check_access(struct mm_struct *mm, struct damon_region *r, bool same_target) { static unsigned long last_addr; - static unsigned long last_page_sz = PAGE_SIZE; + static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ - if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) == - ALIGN_DOWN(r->sampling_addr, last_page_sz))) { + if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == + ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { if (last_accessed) r->nr_accesses++; return; } - last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); + last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz); if (last_accessed) r->nr_accesses++; From 18fd73dbe5c39707b51552d622235e5c41e3d869 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:31 +0000 Subject: [PATCH 129/505] mm/damon/vaddr: support folio of neither HPAGE_PMD_SIZE nor PAGE_SIZE DAMON virtual address space monitoring operations set treats folios having non-HPAGE_PMD_SIZE size as having PAGE_SIZE size. Use the exact size of the folio. 
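Concretely (simplified from the hunk below), the base-page path now records the real folio size instead of assuming a single page:

    /* before: any PTE-mapped folio was recorded as PAGE_SIZE */
    *priv->folio_sz = PAGE_SIZE;

    /* after: e.g. an order-2 folio is recorded as 4 * PAGE_SIZE */
    *priv->folio_sz = folio_size(folio);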
Link: https://lkml.kernel.org/r/20230109213335.62525-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index d6cb1fca1769..c7b192006fe6 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -475,7 +475,7 @@ regular_page: goto out; if (pte_young(*pte) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) { - *priv->folio_sz = PAGE_SIZE; + *priv->folio_sz = folio_size(folio); priv->young = true; } folio_put(folio); From 7477d7560cb2c756d6a8ab165d9ed52537df54e7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:32 +0000 Subject: [PATCH 130/505] mm/damon/vaddr: record appropriate folio size when the access is not found DAMON virtual address spaces monitoring operations set doesn't set folio size of the access checked address if access is not found. It could result in unnecessary and inefficient repeated check. Appropriately set the size regardless of access check result. Link: https://lkml.kernel.org/r/20230109213335.62525-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index c7b192006fe6..1fec16d7263e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -452,10 +452,9 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, goto huge_out; if (pmd_young(*pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, - addr)) { - *priv->folio_sz = HPAGE_PMD_SIZE; + addr)) priv->young = true; - } + *priv->folio_sz = HPAGE_PMD_SIZE; folio_put(folio); huge_out: spin_unlock(ptl); @@ -474,10 +473,9 @@ regular_page: if (!folio) goto out; if (pte_young(*pte) || !folio_test_idle(folio) || - mmu_notifier_test_young(walk->mm, addr)) { - *priv->folio_sz = folio_size(folio); + mmu_notifier_test_young(walk->mm, addr)) priv->young = true; - } + *priv->folio_sz = folio_size(folio); folio_put(folio); out: pte_unmap_unlock(pte, ptl); @@ -504,10 +502,9 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, folio_get(folio); if (pte_young(entry) || !folio_test_idle(folio) || - mmu_notifier_test_young(walk->mm, addr)) { - *priv->folio_sz = huge_page_size(h); + mmu_notifier_test_young(walk->mm, addr)) priv->young = true; - } + *priv->folio_sz = huge_page_size(h); folio_put(folio); From af40e35a992ff5691166badd52a4fa2f940c2cea Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:33 +0000 Subject: [PATCH 131/505] mm/damon/paddr: rename 'damon_pa_access_chk_result->page_sz' to 'folio_sz' DAMON's physical address space monitoring operations set is using folio now. Rename 'damon_pa_access_chk_result->page_sz' to reflect the fact. 
Link: https://lkml.kernel.org/r/20230109213335.62525-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 99d4c357ef2b..65c1e0f91535 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -80,7 +80,8 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } struct damon_pa_access_chk_result { - unsigned long page_sz; + /* size of the folio for the access checked physical memory address */ + unsigned long folio_sz; bool accessed; }; @@ -91,7 +92,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); result->accessed = false; - result->page_sz = PAGE_SIZE; + result->folio_sz = PAGE_SIZE; while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { @@ -103,7 +104,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, result->accessed = pmd_young(*pvmw.pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); - result->page_sz = HPAGE_PMD_SIZE; + result->folio_sz = HPAGE_PMD_SIZE; #else WARN_ON_ONCE(1); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -118,11 +119,11 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, return !result->accessed; } -static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) +static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct damon_pa_access_chk_result result = { - .page_sz = PAGE_SIZE, + .folio_sz = PAGE_SIZE, .accessed = false, }; struct rmap_walk_control rwc = { @@ -157,25 +158,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) folio_put(folio); out: - *page_sz = result.page_sz; + *folio_sz = result.folio_sz; return result.accessed; } static void __damon_pa_check_access(struct damon_region *r) { static unsigned long last_addr; - static unsigned long last_page_sz = PAGE_SIZE; + static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ - if (ALIGN_DOWN(last_addr, last_page_sz) == - ALIGN_DOWN(r->sampling_addr, last_page_sz)) { + if (ALIGN_DOWN(last_addr, last_folio_sz) == + ALIGN_DOWN(r->sampling_addr, last_folio_sz)) { if (last_accessed) r->nr_accesses++; return; } - last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz); + last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz); if (last_accessed) r->nr_accesses++; From 397b0c3a584b4176b4956b519ea3f1402d61fc4e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:34 +0000 Subject: [PATCH 132/505] mm/damon/paddr: remove folio_sz field from damon_pa_access_chk_result DAMON physical address space monitoring operations set gets and saves size of the folio for a given physical address inside rmap walks, but it can be directly caluclated outside of the walks. Remove the 'folio_sz' field from 'damon_pa_access_chk_result struct' and calculate the size directly from outside of the walks. 
Link: https://lkml.kernel.org/r/20230109213335.62525-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 65c1e0f91535..b51606519bbd 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -80,8 +80,6 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } struct damon_pa_access_chk_result { - /* size of the folio for the access checked physical memory address */ - unsigned long folio_sz; bool accessed; }; @@ -92,7 +90,6 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); result->accessed = false; - result->folio_sz = PAGE_SIZE; while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { @@ -104,7 +101,6 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, result->accessed = pmd_young(*pvmw.pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); - result->folio_sz = HPAGE_PMD_SIZE; #else WARN_ON_ONCE(1); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -123,7 +119,6 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct damon_pa_access_chk_result result = { - .folio_sz = PAGE_SIZE, .accessed = false, }; struct rmap_walk_control rwc = { @@ -158,7 +153,7 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) folio_put(folio); out: - *folio_sz = result.folio_sz; + *folio_sz = folio_size(folio); return result.accessed; } From b0c0e744e8a471f1c710197faaab0b81c461f8c0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Jan 2023 21:33:35 +0000 Subject: [PATCH 133/505] mm/damon/paddr: remove damon_pa_access_chk_result struct 'damon_pa_access_chk_result' struct contains only one field. Use a variable instead. 
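The resulting pattern, as in the hunk below, simply threads the bool through the generic void *arg of rmap_walk_control:

    bool accessed = false;
    struct rmap_walk_control rwc = {
        .arg = &accessed,
        .rmap_one = __damon_pa_young,
        .anon_lock = folio_lock_anon_vma_read,
    };
    ...
    /* and in __damon_pa_young(): */
    bool *accessed = arg;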
Link: https://lkml.kernel.org/r/20230109213335.62525-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index b51606519bbd..b4df9b9bcc0a 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -79,50 +79,44 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } } -struct damon_pa_access_chk_result { - bool accessed; -}; - static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct damon_pa_access_chk_result *result = arg; + bool *accessed = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); - result->accessed = false; + *accessed = false; while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - result->accessed = pte_young(*pvmw.pte) || + *accessed = pte_young(*pvmw.pte) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - result->accessed = pmd_young(*pvmw.pmd) || + *accessed = pmd_young(*pvmw.pmd) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); #else WARN_ON_ONCE(1); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ } - if (result->accessed) { + if (*accessed) { page_vma_mapped_walk_done(&pvmw); break; } } /* If accessed, stop walking */ - return !result->accessed; + return *accessed == false; } static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); - struct damon_pa_access_chk_result result = { - .accessed = false, - }; + bool accessed = false; struct rmap_walk_control rwc = { - .arg = &result, + .arg = &accessed, .rmap_one = __damon_pa_young, .anon_lock = folio_lock_anon_vma_read, }; @@ -133,9 +127,9 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { if (folio_test_idle(folio)) - result.accessed = false; + accessed = false; else - result.accessed = true; + accessed = true; folio_put(folio); goto out; } @@ -154,7 +148,7 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) out: *folio_sz = folio_size(folio); - return result.accessed; + return accessed; } static void __damon_pa_check_access(struct damon_region *r) From c4876ff68716e5372224d17045b47610d667a0ee Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Mon, 9 Jan 2023 17:43:32 +0000 Subject: [PATCH 134/505] mm/debug: use valid physical memory for pmd/pud tests The page table debug tests need a physical address to validate low-level page table manipulation with. The memory at this address is not actually touched, it just encoded in the page table entries at various levels during the tests only. Since the memory is not used, the code just picks the physical address of the start_kernel symbol. This value is then truncated to get a properly aligned address that is to be used for various tests. Because of the truncation, the address might not actually exist, or might not describe a complete huge page. That's not a problem for most tests, but the arch-specific code may check for attribute validity and consistency. The x86 version of {pud,pmd}_set_huge actually validates the MTRRs for the PMD/PUD range. This may fail with an address derived from start_kernel, depending on where the kernel was loaded and what the physical memory layout of the system is. 
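As a concrete illustration of the truncation problem (all numbers below are
hypothetical and only show the arithmetic on x86-64, where PUD_SIZE is 1 GiB):

	/* Hypothetical example, not taken from a real system. */
	phys_addr_t phys = 0x5a80f000;		/* __pa_symbol(&start_kernel) */
	phys_addr_t pud  = phys & PUD_MASK;	/* 0x40000000 */
	/*
	 * 0x40000000..0x7fffffff is not guaranteed to be plain RAM with a
	 * uniform MTRR type, so pud_set_huge() may legitimately refuse it.
	 */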
This then leads to false negatives for the {pud,pmd}_set_huge tests. Avoid this by finding a properly aligned memory range that exists and is usable. If such a range is not found, skip the tests that needed it. [fvdl@google.com: v3] Link: https://lkml.kernel.org/r/20230110181208.1633879-1-fvdl@google.com Link: https://lkml.kernel.org/r/20230109174332.329366-1-fvdl@google.com Fixes: 399145f9eb6c ("mm/debug: add tests validating architecture page table helpers") Signed-off-by: Frank van der Linden Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 102 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 83 insertions(+), 19 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index c631ade3f1d2..bb3328f46126 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,7 @@ struct pgtable_debug_args { unsigned long pmd_pfn; unsigned long pte_pfn; + unsigned long fixed_alignment; unsigned long fixed_pgd_pfn; unsigned long fixed_p4d_pfn; unsigned long fixed_pud_pfn; @@ -430,7 +432,8 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!arch_vmap_pmd_supported(args->page_prot)) + if (!arch_vmap_pmd_supported(args->page_prot) || + args->fixed_alignment < PMD_SIZE) return; pr_debug("Validating PMD huge\n"); @@ -449,7 +452,8 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!arch_vmap_pud_supported(args->page_prot)) + if (!arch_vmap_pud_supported(args->page_prot) || + args->fixed_alignment < PUD_SIZE) return; pr_debug("Validating PUD huge\n"); @@ -1077,10 +1081,85 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) return page; } +/* + * Check if a physical memory range described by contains + * an area that is of size psize, and aligned to psize. + * + * Don't use address 0, an all-zeroes physical address might mask bugs, and + * it's not used on x86. + */ +static void __init phys_align_check(phys_addr_t pstart, + phys_addr_t pend, unsigned long psize, + phys_addr_t *physp, unsigned long *alignp) +{ + phys_addr_t aligned_start, aligned_end; + + if (pstart == 0) + pstart = PAGE_SIZE; + + aligned_start = ALIGN(pstart, psize); + aligned_end = aligned_start + psize; + + if (aligned_end > aligned_start && aligned_end <= pend) { + *alignp = psize; + *physp = aligned_start; + } +} + +static void __init init_fixed_pfns(struct pgtable_debug_args *args) +{ + u64 idx; + phys_addr_t phys, pstart, pend; + + /* + * Initialize the fixed pfns. To do this, try to find a + * valid physical range, preferably aligned to PUD_SIZE, + * but settling for aligned to PMD_SIZE as a fallback. If + * neither of those is found, use the physical address of + * the start_kernel symbol. + * + * The memory doesn't need to be allocated, it just needs to exist + * as usable memory. It won't be touched. + * + * The alignment is recorded, and can be checked to see if we + * can run the tests that require an actual valid physical + * address range on some architectures ({pmd,pud}_huge_test + * on x86). 
+ */ + + phys = __pa_symbol(&start_kernel); + args->fixed_alignment = PAGE_SIZE; + + for_each_mem_range(idx, &pstart, &pend) { + /* First check for a PUD-aligned area */ + phys_align_check(pstart, pend, PUD_SIZE, &phys, + &args->fixed_alignment); + + /* If a PUD-aligned area is found, we're done */ + if (args->fixed_alignment == PUD_SIZE) + break; + + /* + * If no PMD-aligned area found yet, check for one, + * but continue the loop to look for a PUD-aligned area. + */ + if (args->fixed_alignment < PMD_SIZE) + phys_align_check(pstart, pend, PMD_SIZE, &phys, + &args->fixed_alignment); + } + + args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK); + args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK); + args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK); + args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK); + args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK); + WARN_ON(!pfn_valid(args->fixed_pte_pfn)); +} + + static int __init init_args(struct pgtable_debug_args *args) { struct page *page = NULL; - phys_addr_t phys; int ret = 0; /* @@ -1160,22 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args) args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp)); WARN_ON(!args->start_ptep); - /* - * PFN for mapping at PTE level is determined from a standard kernel - * text symbol. But pfns for higher page table levels are derived by - * masking lower bits of this real pfn. These derived pfns might not - * exist on the platform but that does not really matter as pfn_pxx() - * helpers will still create appropriate entries for the test. This - * helps avoid large memory block allocations to be used for mapping - * at higher page table levels in some of the tests. - */ - phys = __pa_symbol(&start_kernel); - args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK); - args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK); - args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK); - args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK); - args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK); - WARN_ON(!pfn_valid(args->fixed_pte_pfn)); + init_fixed_pfns(args); /* * Allocate (huge) pages because some of the tests need to access From f4d9139f1394cbe2de158ab8771fea4e587004d4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 Jan 2023 18:12:55 +0100 Subject: [PATCH 135/505] selftests/mm: define MADV_PAGEOUT to fix compilation issues If MADV_PAGEOUT is not defined (e.g., on AlmaLinux 8), compilation will fail. Let's fix that like khugepaged.c does by conditionally defining MADV_PAGEOUT. Link: https://lkml.kernel.org/r/20230109171255.488749-1-david@redhat.com Fixes: 69c66add5663 ("selftests/vm: anon_cow: test COW handling of anonymous memory") Signed-off-by: David Hildenbrand Reported-by: Mirsad Goran Todorovac Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 16216d893d96..0eb2e8180aa5 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -30,6 +30,9 @@ #include "../kselftest.h" #include "vm_util.h" +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif #ifndef MADV_COLLAPSE #define MADV_COLLAPSE 25 #endif From e8dfc854eef20ac7663996f61837299887f380fc Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 7 Dec 2022 10:10:09 -0800 Subject: [PATCH 136/505] ext4: convert mext_page_double_lock() to mext_folio_double_lock() Convert mext_page_double_lock() to use folios. 
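The core of the conversion is replacing grab_cache_page_write_begin() with a
direct __filemap_get_folio() call using equivalent flags, so a locked folio is
returned instead of a page (a sketch of the pattern; the flags match those used
in the patch below):

	unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
	struct folio *folio = __filemap_get_folio(mapping, index, fgp_flags,
						  mapping_gfp_mask(mapping));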
This change saves 146 bytes of kernel text. It also removes 6 calls to compound_head() and 2 calls to folio_file_page(). Link: https://lkml.kernel.org/r/20221207181009.4016-1-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/move_extent.c | 46 +++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 8dbb87edf24c..2de9829aed63 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -110,22 +110,23 @@ out: } /** - * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 + * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure - * @index1: page index - * @index2: page index - * @page: result page vector + * @index1: folio index + * @index2: folio index + * @folio: result folio vector * - * Grab two locked pages for inode's by inode order + * Grab two locked folio for inode's by inode order */ static int -mext_page_double_lock(struct inode *inode1, struct inode *inode2, - pgoff_t index1, pgoff_t index2, struct page *page[2]) +mext_folio_double_lock(struct inode *inode1, struct inode *inode2, + pgoff_t index1, pgoff_t index2, struct folio *folio[2]) { struct address_space *mapping[2]; unsigned int flags; + unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { @@ -138,28 +139,30 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, } flags = memalloc_nofs_save(); - page[0] = grab_cache_page_write_begin(mapping[0], index1); - if (!page[0]) { + folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, + mapping_gfp_mask(mapping[0])); + if (!folio[0]) { memalloc_nofs_restore(flags); return -ENOMEM; } - page[1] = grab_cache_page_write_begin(mapping[1], index2); + folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, + mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); - if (!page[1]) { - unlock_page(page[0]); - put_page(page[0]); + if (!folio[1]) { + folio_unlock(folio[0]); + folio_put(folio[0]); return -ENOMEM; } /* - * grab_cache_page_write_begin() may not wait on page's writeback if + * __filemap_get_folio() may not wait on folio's writeback if * BDI not demand that. 
But it is reasonable to be very conservative - * here and explicitly wait on page's writeback + * here and explicitly wait on folio's writeback */ - wait_on_page_writeback(page[0]); - wait_on_page_writeback(page[1]); + folio_wait_writeback(folio[0]); + folio_wait_writeback(folio[1]); if (inode1 > inode2) - swap(page[0], page[1]); + swap(folio[0], folio[1]); return 0; } @@ -252,7 +255,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); - struct page *pagep[2] = {NULL, NULL}; struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; @@ -303,8 +305,8 @@ again: replaced_size = data_size; - *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, - donor_page_offset, pagep); + *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, + donor_page_offset, folio); if (unlikely(*err < 0)) goto stop_journal; /* @@ -314,8 +316,6 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - folio[0] = page_folio(pagep[0]); - folio[1] = page_folio(pagep[1]); VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); From b6f00c9190c8e694c9b2b38e7cc63964f7a99195 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Mon, 9 Jan 2023 19:46:55 +0800 Subject: [PATCH 137/505] mm/damon/sysfs-schemes: use strscpy() to instead of strncpy() The implementation of strscpy() is more robust and safer. That's now the recommended way to copy NUL-terminated strings. Link: https://lkml.kernel.org/r/202301091946553770006@zte.com.cn Signed-off-by: Xu Panda Signed-off-by: Yang Yang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index f0dabe3e2dc0..86edca66aab1 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -353,8 +353,7 @@ static ssize_t memcg_path_store(struct kobject *kobj, if (!path) return -ENOMEM; - strncpy(path, buf, count); - path[count] = '\0'; + strscpy(path, buf, count + 1); filter->memcg_path = path; return count; } From d526643f155c431e8dfef643195f2d636d4e4bb5 Mon Sep 17 00:00:00 2001 From: Alexander Pantyukhin Date: Sun, 8 Jan 2023 15:50:23 +0500 Subject: [PATCH 138/505] tools:cgroup:memcg_shrinker remove redundant import Remove redundant import of the sys module. Also use the sort function instead of sorted. It sorts the direct array without create the new one in memory. 
Link: https://lkml.kernel.org/r/20230108105023.4289-1-apantykhin@gmail.com
Signed-off-by: Alexander Pantyukhin
Cc: Roman Gushchin
Signed-off-by: Andrew Morton
---
 tools/cgroup/memcg_shrinker.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/cgroup/memcg_shrinker.py b/tools/cgroup/memcg_shrinker.py
index 706ab27666a4..e81c3017ada9 100644
--- a/tools/cgroup/memcg_shrinker.py
+++ b/tools/cgroup/memcg_shrinker.py
@@ -5,7 +5,6 @@
 import os
 import argparse
-import sys

 def scan_cgroups(cgroup_root):
@@ -44,7 +43,7 @@ def main():
     cgroups = scan_cgroups("/sys/fs/cgroup/")
     shrinkers = scan_shrinkers("/sys/kernel/debug/shrinker/")
-    shrinkers = sorted(shrinkers, reverse = True, key = lambda x: x[0])
+    shrinkers.sort(reverse = True, key = lambda x: x[0])

     n = 0
     for s in shrinkers:

From 9a3f21fe5cb9f5654ccad7ba712d868f7de66e39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?=
Date: Mon, 9 Jan 2023 12:42:51 +0100
Subject: [PATCH 139/505] selftests: vm: enable cross-compilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Selftests vm builds break when doing cross-compilation. The Makefile
MACHINE variable incorrectly picks up the host machine architecture.

If the CROSS_COMPILE variable is set, dig out the target architecture
from CROSS_COMPILE, instead of calling uname.

Link: https://lkml.kernel.org/r/20230109114251.3349638-1-bjorn@kernel.org
Signed-off-by: Björn Töpel
Cc: Shuah Khan
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/mm/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 6a4b639b2b2b..0a44d77f8437 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -5,7 +5,11 @@ LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h

 include local_config.mk

+ifeq ($(CROSS_COMPILE),)
 uname_M := $(shell uname -m 2>/dev/null || echo not)
+else
+uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+')
+endif
 MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')

 # Without this, failed build products remain, with up-to-date timestamps,

From 92b64bd01fe99325ba0f51125bcb991f1566eadc Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco"
Date: Wed, 7 Dec 2022 23:53:08 +0100
Subject: [PATCH 140/505] mm/highmem: add notes about conversions from
 kmap{,_atomic}()

kmap() and kmap_atomic() have been deprecated. kmap_local_page() should
always be used in new code and the call sites of the two deprecated
functions should be converted. This latter task can lead to errors if it
is not carried out with the necessary attention to the context around and
between the maps and unmaps.

Therefore, add further information to the Highmem documentation to make
it clearer that (1) kmap() and kmap_atomic() must no longer be called in
new code and (2) developers doing conversions from kmap() and
kmap_atomic() are expected to take care of the context around and between
the maps and unmaps, in order to not break the code.

Relevant parts of this patch have been taken from messages exchanged
privately with Ira Weiny (thanks!).

[fmdefrancesco@gmail.com: merge two sentences into one, per Bagas]
Link: https://lkml.kernel.org/r/20230119123945.10471-1-fmdefrancesco@gmail.com
Link: https://lkml.kernel.org/r/20221207225308.8290-1-fmdefrancesco@gmail.com
Signed-off-by: Fabio M.
De Francesco Cc: Ira Weiny Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- Documentation/mm/highmem.rst | 41 +++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst index 0f731d9196b0..e691a06fb337 100644 --- a/Documentation/mm/highmem.rst +++ b/Documentation/mm/highmem.rst @@ -57,7 +57,8 @@ list shows them in order of preference of use. It can be invoked from any context (including interrupts) but the mappings can only be used in the context which acquired them. - This function should be preferred, where feasible, over all the others. + This function should always be used, whereas kmap_atomic() and kmap() have + been deprecated. These mappings are thread-local and CPU-local, meaning that the mapping can only be accessed from within this thread and the thread is bound to the @@ -100,10 +101,21 @@ list shows them in order of preference of use. (included in the "Functions" section) for details on how to manage nested mappings. -* kmap_atomic(). This permits a very short duration mapping of a single - page. Since the mapping is restricted to the CPU that issued it, it - performs well, but the issuing task is therefore required to stay on that - CPU until it has finished, lest some other task displace its mappings. +* kmap_atomic(). This function has been deprecated; use kmap_local_page(). + + NOTE: Conversions to kmap_local_page() must take care to follow the mapping + restrictions imposed on kmap_local_page(). Furthermore, the code between + calls to kmap_atomic() and kunmap_atomic() may implicitly depend on the side + effects of atomic mappings, i.e. disabling page faults or preemption, or both. + In that case, explicit calls to pagefault_disable() or preempt_disable() or + both must be made in conjunction with the use of kmap_local_page(). + + [Legacy documentation] + + This permits a very short duration mapping of a single page. Since the + mapping is restricted to the CPU that issued it, it performs well, but + the issuing task is therefore required to stay on that CPU until it has + finished, lest some other task displace its mappings. kmap_atomic() may also be used by interrupt contexts, since it does not sleep and the callers too may not sleep until after kunmap_atomic() is @@ -115,11 +127,20 @@ list shows them in order of preference of use. It is assumed that k[un]map_atomic() won't fail. -* kmap(). This should be used to make short duration mapping of a single - page with no restrictions on preemption or migration. It comes with an - overhead as mapping space is restricted and protected by a global lock - for synchronization. When mapping is no longer needed, the address that - the page was mapped to must be released with kunmap(). +* kmap(). This function has been deprecated; use kmap_local_page(). + + NOTE: Conversions to kmap_local_page() must take care to follow the mapping + restrictions imposed on kmap_local_page(). In particular, it is necessary to + make sure that the kernel virtual memory pointer is only valid in the thread + that obtained it. + + [Legacy documentation] + + This should be used to make short duration mapping of a single page with no + restrictions on preemption or migration. It comes with an overhead as mapping + space is restricted and protected by a global lock for synchronization. 
When + mapping is no longer needed, the address that the page was mapped to must be + released with kunmap(). Mapping changes must be propagated across all the CPUs. kmap() also requires global TLB invalidation when the kmap's pool wraps and it might From fb6f026b833a71f4701e12b43800e46d7351f7a2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:53 +0000 Subject: [PATCH 141/505] mm/damon/core: update kernel-doc comments for DAMOS action supports of each DAMON operations set Patch series "mm/damon: trivial fixups". This patchset contains patches for trivial fixups of DAMON's documentation, MAINTAINERS section, and selftests. This patch (of 8): Supports of each DAMOS action are up to DAMON operations set implementation in use, but not well mentioned on the kernel-doc comments. Add the comment. Link: https://lkml.kernel.org/r/20230110190400.119388-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230110190400.119388-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7907918ad2e0..3fa96d7c9fe4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,6 +91,12 @@ struct damon_target { * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions + * + * The support of each action is up to running &struct damon_operations. + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except + * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR + * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum + * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. */ enum damos_action { DAMOS_WILLNEED, From 55901e89d2864b5ef9961892470eedf29279d412 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:54 +0000 Subject: [PATCH 142/505] mm/damon/core: update kernel-doc comments for DAMOS filters supports of each DAMON operations set Supports of each DAMOS filter type are up to DAMON operations set implementation in use, but not well mentioned on the kernel-doc comments. Add the comment. Link: https://lkml.kernel.org/r/20230110190400.119388-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3fa96d7c9fe4..dfb245bb3053 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -227,6 +227,11 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @NR_DAMOS_FILTER_TYPES: Number of filter types. + * + * The support of each filter type is up to running &struct damon_operations. + * &enum DAMON_OPS_PADDR is supporting all filter types, while + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any + * filter types. */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, From 86834644e3c9301ccd28df1293c37306a6332f3b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:55 +0000 Subject: [PATCH 143/505] Docs/mm/damon/index: mention DAMOS on the intro What DAMON aims to do is not only access monitoring but efficient and effective access-aware system operations. And DAMon-based Operation Schemes (DAMOS) is the important feature of DAMON for the goal. 
Make the intro of DAMON documentation to emphasize the goal and mention DAMOS. Link: https://lkml.kernel.org/r/20230110190400.119388-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/mm/damon/index.rst | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index 48c0bbff98b2..2983699c12ea 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -4,8 +4,9 @@ DAMON: Data Access MONitor ========================== -DAMON is a data access monitoring framework subsystem for the Linux kernel. -The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it +DAMON is a Linux kernel subsystem that provides a framework for data access +monitoring and the monitoring results based system operations. The core +monitoring mechanisms of DAMON (refer to :doc:`design` for the detail) make it - *accurate* (the monitoring output is useful enough for DRAM level memory management; It might not appropriate for CPU Cache levels, though), @@ -14,12 +15,16 @@ The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it - *scalable* (the upper-bound of the overhead is in constant range regardless of the size of target workloads). -Using this framework, therefore, the kernel's memory management mechanisms can -make advanced decisions. Experimental memory management optimization works -that incurring high data accesses monitoring overhead could implemented again. -In user space, meanwhile, users who have some special workloads can write -personalized applications for better understanding and optimizations of their -workloads and systems. +Using this framework, therefore, the kernel can operate system in an +access-aware fashion. Because the features are also exposed to the user space, +users who have special information about their workloads can write personalized +applications for better understanding and optimizations of their workloads and +systems. + +For easier development of such systems, DAMON provides a feature called DAMOS +(DAMon-based Operation Schemes) in addition to the monitoring. Using the +feature, DAMON users in both kernel and user spaces can do access-aware system +operations with no code but simple configurations. .. toctree:: :maxdepth: 2 From 9a47c411ccddf843812dbbff707bd45e3bc83f40 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:56 +0000 Subject: [PATCH 144/505] Docs/admin-guide/mm/damon/usage: update DAMOS actions/filters supports of each DAMON operations set Supports of each DAMOS action and filters are up to DAMON operations set implementation, but it's not mentioned in detail on the documentation. Update the information on the usage document. Link: https://lkml.kernel.org/r/20230110190400.119388-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 41 +++++++++++++------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 3d82ca6a17ff..9237d6a25897 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -279,14 +279,25 @@ The ``action`` file is for setting and getting what action you want to apply to memory regions having specific access pattern of the interest. 
The keywords that can be written to and read from the file and their meaning are as below. - - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD`` - - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` +Note that support of each action depends on the running DAMON operations set +`implementation `. + + - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. + - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. - ``lru_prio``: Prioritize the region on its LRU lists. + Supported by ``paddr`` operations set. - ``lru_deprio``: Deprioritize the region on its LRU lists. - - ``stat``: Do nothing but count the statistics + Supported by ``paddr`` operations set. + - ``stat``: Do nothing but count the statistics. + Supported by all operations sets. schemes//access_pattern/ --------------------------- @@ -388,8 +399,8 @@ pages of all memory cgroups except ``/having_care_already``.:: echo /having_care_already > 1/memcg_path echo N > 1/matching -Note that filters could be ignored depend on the running DAMON operations set -`implementation `. +Note that filters are currently supported only when ``paddr`` +`implementation ` is being used. .. _sysfs_schemes_stats: @@ -618,11 +629,15 @@ The ```` is a predefined integer for memory management actions, which DAMON will apply to the regions having the target access pattern. The supported numbers and their meanings are as below. - - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - 1: Call ``madvise()`` for the region with ``MADV_COLD`` - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if + ``target`` is ``paddr``. + - 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if + ``target`` is ``paddr``. + - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Ignored if + ``target`` is ``paddr``. + - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if + ``target`` is ``paddr``. - 5: Do nothing but count the statistics Quota From e7366f3a2ed0f554354a1d7fe8bf5d98bab5247e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:57 +0000 Subject: [PATCH 145/505] Docs/mm/damon: add a maintainer-profile for DAMON Document the basic policies and expectations for DAMON development. 
Link: https://lkml.kernel.org/r/20230110190400.119388-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/mm/damon/index.rst | 1 + Documentation/mm/damon/maintainer-profile.rst | 62 +++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 Documentation/mm/damon/maintainer-profile.rst diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index 2983699c12ea..5e0a50583500 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -32,3 +32,4 @@ operations with no code but simple configurations. faq design api + maintainer-profile diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst new file mode 100644 index 000000000000..24a202f03de8 --- /dev/null +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -0,0 +1,62 @@ +.. SPDX-License-Identifier: GPL-2.0 + +DAMON Maintainer Entry Profile +============================== + +The DAMON subsystem covers the files that listed in 'DATA ACCESS MONITOR' +section of 'MAINTAINERS' file. + +The mailing lists for the subsystem are damon@lists.linux.dev and +linux-mm@kvack.org. Patches should be made against the mm-unstable tree [1]_ +whenever possible and posted to the mailing lists. + +SCM Trees +--------- + +There are multiple Linux trees for DAMON development. Patches under +development or testing are queued in damon/next [2]_ by the DAMON maintainer. +Suffieicntly reviewed patches will be queued in mm-unstable [1]_ by the memory +management subsystem maintainer. After more sufficient tests, the patches will +be queued in mm-stable [3]_ , and finally pull-requested to the mainline by the +memory management subsystem maintainer. + +Note again the patches for review should be made against the mm-unstable +tree[1] whenever possible. damon/next is only for preview of others' works in +progress. + +Submit checklist addendum +------------------------- + +When making DAMON changes, you should do below. + +- Build changes related outputs including kernel and documents. +- Ensure the builds introduce no new errors or warnings. +- Run and ensure no new failures for DAMON selftests [4]_ and kunittests [5]_ . + +Further doing below and putting the results will be helpful. + +- Run damon-tests/corr [6]_ for normal changes. +- Run damon-tests/perf [7]_ for performance changes. + +Key cycle dates +--------------- + +Patches can be sent anytime. Key cycle dates of the mm-unstable[1] and +mm-stable[3] trees depend on the memory management subsystem maintainer. + +Review cadence +-------------- + +The DAMON maintainer does the work on the usual work hour (09:00 to 17:00, +Mon-Fri) in PST. The response to patches will occasionally be slow. Do not +hesitate to send a ping if you have not heard back within a week of sending a +patch. + + +.. [1] https://git.kernel.org/akpm/mm/h/mm-unstable +.. [2] https://git.kernel.org/sj/h/damon/next +.. [3] https://git.kernel.org/akpm/mm/h/mm-stable +.. [4] https://github.com/awslabs/damon-tests/blob/master/corr/run.sh#L49 +.. [5] https://github.com/awslabs/damon-tests/blob/master/corr/tests/kunit.sh +.. [6] https://github.com/awslabs/damon-tests/tree/master/corr +.. 
[7] https://github.com/awslabs/damon-tests/tree/master/perf From 2d2230efbcecda7747a2bee659eae474f504ef42 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:58 +0000 Subject: [PATCH 146/505] MAINTAINERS/DAMON: link maintainer profile, git trees, and website Add links to below DAMON development related resource to DAMON section in MAINTAINERS file. - The basic policies and expectations of DAMON development, - DAMON development trees, and - DAMON introduction website. Link: https://lkml.kernel.org/r/20230110190400.119388-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- MAINTAINERS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8ac1472bea34..b92a2a0cb36b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5790,6 +5790,11 @@ M: SeongJae Park L: damon@lists.linux.dev L: linux-mm@kvack.org S: Maintained +W: https://damonitor.github.io +P: Documentation/mm/damon/maintainer-profile.rst +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sj/linux.git damon/next F: Documentation/ABI/testing/sysfs-kernel-mm-damon F: Documentation/admin-guide/mm/damon/ F: Documentation/mm/damon/ From 16ddcb15497e11a2695c604357e77140010d3d51 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:03:59 +0000 Subject: [PATCH 147/505] selftests/damon/sysfs: hide expected write failures DAMON selftests for sysfs (sysfs.sh) tests if some writes to DAMON sysfs interface files fails as expected. It makes the test results noisy with the failure error message because it tests a number of such failures. Redirect the expected failure error messages to /dev/null to make the results clean. Link: https://lkml.kernel.org/r/20230110190400.119388-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index a00336ffdcad..bcd4734ca094 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -24,7 +24,7 @@ ensure_write_fail() content=$2 reason=$3 - if echo "$content" > "$file" + if (echo "$content" > "$file") 2> /dev/null then echo "writing $content to $file succeed ($fail_reason)" echo "expected failure because $reason" From 75cb348714f527ce2de3446202b76ce74808b668 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Jan 2023 19:04:00 +0000 Subject: [PATCH 148/505] selftests/damon/debugfs_rm_non_contexts: hide expected write error messages A selftest case for DAMON debugfs interface has a test for expected failure. To make the test output clean, hide the expected failure error message. 
Link: https://lkml.kernel.org/r/20230110190400.119388-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/debugfs_rm_non_contexts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh index 48b7af6b022c..f3ffeb1343cf 100644 --- a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh +++ b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh @@ -10,7 +10,7 @@ dmesg -C for file in "$DBGFS/"* do - echo "$(basename "$f")" > "$DBGFS/rm_contexts" + (echo "$(basename "$f")" > "$DBGFS/rm_contexts") &> /dev/null if dmesg | grep -q BUG then dmesg From c5d5546ea06512accc894cd19265c7041a6ac81a Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Tue, 10 Jan 2023 23:42:11 +0800 Subject: [PATCH 149/505] maple_tree: remove the parameter entry of mas_preallocate The parameter entry of mas_preallocate is not used, so drop it. Link: https://lkml.kernel.org/r/20230110154211.1758562-1-vernon2gm@gmail.com Signed-off-by: Vernon Yang Cc: Liam Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- lib/maple_tree.c | 3 +-- mm/mmap.c | 16 ++++++++-------- mm/nommu.c | 8 ++++---- tools/testing/radix-tree/maple.c | 32 ++++++++++++++++---------------- 5 files changed, 30 insertions(+), 31 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 815a27661517..a7bf58fd7cc6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -455,7 +455,7 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index baff62a012e1..5be99550e36d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5700,12 +5700,11 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state - * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. 
*/ -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +int mas_preallocate(struct ma_state *mas, gfp_t gfp) { int ret; diff --git a/mm/mmap.c b/mm/mmap.c index 425a9349e610..4fe29b8f99b0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -472,7 +472,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; if (vma->vm_file) { @@ -538,7 +538,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Only handles expanding */ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) goto nomem; vma_adjust_trans_huge(vma, start, end, 0); @@ -712,7 +712,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -1938,7 +1938,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -2019,7 +2019,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -2311,7 +2311,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) return -ENOMEM; mas->last = end - 1; @@ -2680,7 +2680,7 @@ cannot_expand: goto free_vma; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { error = -ENOMEM; if (file) goto close_and_free_vma; @@ -2953,7 +2953,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, GFP_KERNEL)) goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); diff --git a/mm/nommu.c b/mm/nommu.c index df1711acdf5b..0481922fe66e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -602,7 +602,7 @@ static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -633,7 +633,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) { MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -1091,7 +1091,7 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) goto error_maple_preallocate; region->vm_usage = 
1; @@ -1369,7 +1369,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) goto err_vma_dup; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (mas_preallocate(&mas, GFP_KERNEL)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); goto err_mas_preallocate; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 1f36bc1c5d36..958ee9bdb316 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35342,7 +35342,7 @@ static noinline void check_prealloc(struct maple_tree *mt) for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35351,18 +35351,18 @@ static noinline void check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35370,25 +35370,25 @@ static noinline void check_prealloc(struct maple_tree *mt) mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35397,12 +35397,12 @@ static noinline void check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35410,21 +35410,21 @@ static noinline void check_prealloc(struct 
maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35432,14 +35432,14 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); mas_destroy(&mas); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35447,7 +35447,7 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); From baabcfc93d3b5b14c52eb5c18d38627c32d2d82b Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Tue, 10 Jan 2023 22:53:53 +0800 Subject: [PATCH 150/505] mm/mmap: fix typo in comment Replace "parital" with "partial". Link: https://lkml.kernel.org/r/20230110145353.1658435-1-vernon2gm@gmail.com Signed-off-by: Vernon Yang Cc: Liam Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4fe29b8f99b0..0641e6e0016c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2889,7 +2889,7 @@ out: } /* - * brk_munmap() - Unmap a parital vma. + * brk_munmap() - Unmap a partial vma. * @mas: The maple tree state. * @vma: The vma to be modified * @newbrk: the start of the address to unmap From c6835e8d86bcd8313347e097da140057772307c0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:18 +0800 Subject: [PATCH 151/505] mm: compaction: remove redundant VM_BUG_ON() in compact_zone() Patch series "Some small improvements for compaction". When I did some compaction testing, I found some small room for improvement as well as some code cleanups. This patch (of 5): The compaction_suitable() will never return values other than COMPACT_SUCCESS, COMPACT_SKIPPED and COMPACT_CONTINUE, so after validation of COMPACT_SUCCESS and COMPACT_SKIPPED, we will never hit other unexpected case. Thus remove the redundant VM_BUG_ON() validation for the return values of compaction_suitable(). 
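For reference, the surrounding logic in compact_zone() then reads roughly as
follows (a sketch; the argument list is approximate, see the hunk below for the
actual change):

	ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
				  cc->highest_zoneidx);
	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
		return ret;
	/* Only COMPACT_CONTINUE can reach this point, no VM_BUG_ON() needed. */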
Link: https://lkml.kernel.org/r/cover.1673342761.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/740a2396d9b98154dba76e326cba5e798b640ead.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 62a61de44658..5e6f5e35748d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2313,9 +2313,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; - /* huh, compaction_suitable is returning something unexpected */ - VM_BUG_ON(ret != COMPACT_CONTINUE); - /* * Clear pageblock skip if there were failures recently and compaction * is about to be retried after being deferred. From 753ec50d976c28b08266dec3110905b377464eb1 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:19 +0800 Subject: [PATCH 152/505] mm: compaction: move list validation into compact_zone() Move the cc.freepages and cc.migratepages list validation into compact_zone() to remove some duplicate code. Link: https://lkml.kernel.org/r/15cf54f7d762e87b04ac3cc74536f7d1ebbcd8cd.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 5e6f5e35748d..f8e8addc8664 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2488,6 +2488,9 @@ out: trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret); + VM_BUG_ON(!list_empty(&cc->freepages)); + VM_BUG_ON(!list_empty(&cc->migratepages)); + return ret; } @@ -2526,9 +2529,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, ret = compact_zone(&cc, &capc); - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); - /* * Make sure we hide capture control first before we read the captured * page pointer, otherwise an interrupt could free and capture a page @@ -2659,9 +2659,6 @@ static void proactive_compact_node(pg_data_t *pgdat) cc.zone = zone; compact_zone(&cc, NULL); - - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); } } @@ -2689,9 +2686,6 @@ static void compact_node(int nid) cc.zone = zone; compact_zone(&cc, NULL); - - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); } } @@ -2868,9 +2862,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.total_migrate_scanned); count_compact_events(KCOMPACTD_FREE_SCANNED, cc.total_free_scanned); - - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); } /* From 1bfb7684db1233d9e3f3f26fbbc0c58d40ff65e7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:20 +0800 Subject: [PATCH 153/505] mm: compaction: count the migration scanned pages events for proactive compaction The proactive compaction will reuse per-node kcompactd threads, so we should also count the KCOMPACTD_MIGRATE_SCANNED and KCOMPACTD_FREE_SCANNED events for proactive compaction. 
Link: https://lkml.kernel.org/r/b7f1ece1adc17defa47e3667b5f9fd61f496517a.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index f8e8addc8664..62f6bb68c9cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2659,6 +2659,11 @@ static void proactive_compact_node(pg_data_t *pgdat) cc.zone = zone; compact_zone(&cc, NULL); + + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + cc.total_free_scanned); } } From 8fff8b6f8d0ef7620e06f3f4cfb912171aef6cd5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:21 +0800 Subject: [PATCH 154/505] mm: compaction: add missing kcompactd wakeup trace event Add missing kcompactd wakeup trace event for proactive compaction, meanwhile use order = -1 and the highest zone index of the pgdat for the kcompactd wakeup trace event by proactive compaction. Link: https://lkml.kernel.org/r/cbf8097a2d8a1b6800991f2a21575550d3613ce6.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 62f6bb68c9cb..0fd6c81a7809 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2730,6 +2730,8 @@ int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, continue; pgdat->proactive_compact_trigger = true; + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, -1, + pgdat->nr_zones - 1); wake_up_interruptible(&pgdat->kcompactd_wait); } } From 9e5522715e6941bcfdc08d066a79d6da0f8cec8e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 10 Jan 2023 21:36:22 +0800 Subject: [PATCH 155/505] mm: compaction: avoid fragmentation score calculation for empty zones There is no need to calculate the fragmentation score for empty zones. Link: https://lkml.kernel.org/r/100331ad9d274a9725e687b00d85d75d7e4a17c7.1673342761.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 0fd6c81a7809..b758b00a4885 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2025,6 +2025,8 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) struct zone *zone; zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; score += fragmentation_score_zone_weighted(zone); } From 7d4a8be0c4b2b7ffb367929d2b352651f083806b Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 10 Jan 2023 13:57:22 +1100 Subject: [PATCH 156/505] mm/mmu_notifier: remove unused mmu_notifier_range_update_to_read_only export mmu_notifier_range_update_to_read_only() was originally introduced in commit c6d23413f81b ("mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper") as an optimisation for device drivers that know a range has only been mapped read-only. However there are no users of this feature so remove it. As it is the only user of the struct mmu_notifier_range.vma field remove that also. 
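The visible change for callers is simply dropping the unused vma argument;
schematically (compare the hunks that follow):

	/* Before: a vma was passed in, but nothing ever consumed it. */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, end);

	/* After: the range is fully described by the mm and [start, end). */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end);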
Link: https://lkml.kernel.org/r/20230110025722.600912-1-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Mike Rapoport (IBM) Reviewed-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Mike Kravetz Cc: Ira Weiny Cc: Jerome Glisse Cc: John Hubbard Cc: Ralph Campbell Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- include/linux/mmu_notifier.h | 13 +++++-------- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 4 ++-- mm/hugetlb.c | 12 ++++++------ mm/khugepaged.c | 6 +++--- mm/ksm.c | 5 ++--- mm/madvise.c | 2 +- mm/mapping_dirty_helpers.c | 2 +- mm/memory.c | 10 +++++----- mm/migrate_device.c | 4 ++-- mm/mmu_notifier.c | 10 ---------- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- mm/oom_kill.c | 2 +- mm/rmap.c | 11 +++++------ 16 files changed, 37 insertions(+), 52 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index af1c49ae11b1..a44339a77a75 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1306,7 +1306,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, NULL, mm, 0, -1UL); + 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d6c06e140277..64a3e051c3c4 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -269,7 +269,6 @@ extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { - struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; @@ -514,12 +513,10 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, - struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) { - range->vma = vma; range->event = event; range->mm = mm; range->start = start; @@ -530,10 +527,10 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, - struct vm_area_struct *vma, struct mm_struct *mm, - unsigned long start, unsigned long end, void *owner) + struct mm_struct *mm, unsigned long start, + unsigned long end, void *owner) { - mmu_notifier_range_init(range, event, flags, vma, mm, start, end); + mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } @@ -659,9 +656,9 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, range->end = end; } -#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ +#define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) -#define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \ +#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d9e357b7e17c..29f36d2ae129 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, int err; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + 
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); if (new_page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7e68a36b4f7d..c13b1f67d14e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2020,7 +2020,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2282,7 +2282,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6fe65f14d33b..273a6522aa4c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4966,7 +4966,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int ret = 0; if (cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); @@ -5177,7 +5177,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, struct mmu_notifier_range range; bool shared_pmd = false; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* @@ -5391,7 +5391,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, struct mmu_notifier_range range; struct mmu_gather tlb; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); @@ -5597,7 +5597,7 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); @@ -6637,7 +6637,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, * range if PMD sharing is possible. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, - 0, vma, mm, start, end); + 0, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); @@ -7368,7 +7368,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. 
*/ - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 90acfea40c13..57164c15e076 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1040,8 +1040,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, anon_vma_lock_write(vma->anon_vma); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, - address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, + address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pte = pte_offset_map(pmd, address); @@ -1412,7 +1412,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v if (vma->anon_vma) lockdep_assert_held_write(&vma->anon_vma->root->rwsem); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd = pmdp_collapse_flush(vma, addr, pmdp); diff --git a/mm/ksm.c b/mm/ksm.c index dd02780c387f..cea0c4478220 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1057,8 +1057,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, - pvmw.address, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1164,7 +1163,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/madvise.c b/mm/madvise.c index e407d335e614..5296e78dccda 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -765,7 +765,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 175e424b9ab1..e1eb33f49059 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -191,7 +191,7 @@ static int wp_clean_pre_vma(unsigned long start, unsigned long end, wpwalk->tlbflush_end = start; mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0, - walk->vma, walk->mm, start, end); + walk->mm, start, end); mmu_notifier_invalidate_range_start(&wpwalk->range); flush_cache_range(walk->vma, start, end); diff --git a/mm/memory.c b/mm/memory.c index 90f8f72777c7..c6bacd58d032 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1266,7 +1266,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, src_vma, src_mm, addr, end); + 0, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as @@ -1683,7 +1683,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, }; MA_STATE(mas, mt, vma->vm_end, 
vma->vm_end); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { @@ -1709,7 +1709,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); if (is_vm_hugetlb_page(vma)) adjust_range_if_pmd_sharing_possible(vma, &range.start, @@ -3091,7 +3091,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -3561,7 +3561,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) return VM_FAULT_RETRY; - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 721b2365dbca..6c3740318a98 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -306,7 +306,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * private page mappings that won't be migrated. */ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, - migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->vma->vm_mm, migrate->start, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); @@ -733,7 +733,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, notified = true; mmu_notifier_range_init_owner(&range, - MMU_NOTIFY_MIGRATE, 0, migrate->vma, + MMU_NOTIFY_MIGRATE, 0, migrate->vma->vm_mm, addr, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index f45ff1b7626a..50c0dde1354f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1120,13 +1120,3 @@ void mmu_notifier_synchronize(void) synchronize_srcu(&srcu); } EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); - -bool -mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) -{ - if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) - return false; - /* Return true if the vma still have the read flag set. 
*/ - return range->vma->vm_flags & VM_READ; -} -EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); diff --git a/mm/mprotect.c b/mm/mprotect.c index 92fc6f3fa512..6ecdf0671b81 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -398,7 +398,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb, if (!range.start) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, - vma, vma->vm_mm, addr, end); + vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/mremap.c b/mm/mremap.c index 930f65c315c0..05f90f47e149 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -498,7 +498,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_addr, len); flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, old_addr, old_end); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1276e49b31b0..044e1eed720e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -542,7 +542,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, - vma, mm, vma->vm_start, + mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { diff --git a/mm/rmap.c b/mm/rmap.c index ab74e0547a52..6ccd42bbae93 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -944,9 +944,8 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) * We have to assume the worse case ie pmd for invalidation. Note that * the folio can not be freed from this function. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, vma, vma->vm_mm, address, - vma_address_end(pvmw)); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, + vma->vm_mm, address, vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(pvmw)) { @@ -1475,7 +1474,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * try_to_unmap() must hold a reference on the folio. */ range.end = vma_address_end(&pvmw); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* @@ -1850,7 +1849,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * try_to_unmap() must hold a reference on the page. */ range.end = vma_address_end(&pvmw); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* @@ -2180,7 +2179,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, swp_entry_t entry; pte_t swp_pte; - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, address + folio_size(folio)), args->owner); From 94688e8eb453e616098cb930e5f6fed4a6ea2dfa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:47 +0000 Subject: [PATCH 157/505] mm: remove folio_pincount_ptr() and head_compound_pincount() We can use folio->_pincount directly, since all users are guarded by tests of compound/large. 
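Every remaining user has the same shape: test folio_test_large() (or an equivalent compound check) first, then operate on folio->_pincount directly. A sketch of the pattern, as used in the try_grab_folio() conversion below:

	if (folio_test_large(folio))
		atomic_add(refs, &folio->_pincount);
	else
		folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1));
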
Link: https://lkml.kernel.org/r/20230111142915.1001531-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: John Hubbard Signed-off-by: Andrew Morton --- Documentation/core-api/pin_user_pages.rst | 25 +++++++++++------------ include/linux/mm.h | 14 ++----------- include/linux/mm_types.h | 5 ----- mm/debug.c | 4 ++-- mm/gup.c | 8 ++++---- mm/huge_memory.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 9 +++++--- 8 files changed, 30 insertions(+), 43 deletions(-) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index facafbdecb95..9fb0b1080d3b 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -55,18 +55,17 @@ flags the caller provides. The caller is required to pass in a non-null struct pages* array, and the function then pins pages by incrementing each by a special value: GUP_PIN_COUNTING_BIAS. -For compound pages, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, -an exact form of pin counting is achieved, by using the 2nd struct page -in the compound page. A new struct page field, compound_pincount, has -been added in order to support this. +For large folios, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, +the extra space available in the struct folio is used to store the +pincount directly. -This approach for compound pages avoids the counting upper limit problems that -are discussed below. Those limitations would have been aggravated severely by -huge pages, because each tail page adds a refcount to the head page. And in -fact, testing revealed that, without a separate compound_pincount field, -page overflows were seen in some huge page stress tests. +This approach for large folios avoids the counting upper limit problems +that are discussed below. Those limitations would have been aggravated +severely by huge pages, because each tail page adds a refcount to the +head page. And in fact, testing revealed that, without a separate pincount +field, refcount overflows were seen in some huge page stress tests. -This also means that huge pages and compound pages do not suffer +This also means that huge pages and large folios do not suffer from the false positives problem that is mentioned below.:: Function @@ -264,9 +263,9 @@ place.) Other diagnostics ================= -dump_page() has been enhanced slightly, to handle these new counting -fields, and to better report on compound pages in general. Specifically, -for compound pages, the exact (compound_pincount) pincount is reported. +dump_page() has been enhanced slightly to handle these new counting +fields, and to better report on large folios in general. Specifically, +for large folios, the exact pincount is reported. 
References ========== diff --git a/include/linux/mm.h b/include/linux/mm.h index 76c97cb8ee9a..6d3945207067 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,11 +1011,6 @@ static inline void folio_set_compound_dtor(struct folio *folio, void destroy_large_folio(struct folio *folio); -static inline int head_compound_pincount(struct page *head) -{ - return atomic_read(compound_pincount_ptr(head)); -} - static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; @@ -1641,11 +1636,6 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } -static inline atomic_t *folio_pincount_ptr(struct folio *folio) -{ - return &folio_page(folio, 1)->compound_pincount; -} - /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -1663,7 +1653,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) * expected to be able to deal gracefully with a false positive. * * For large folios, the result will be exactly correct. That's because - * we have more tracking data available: the compound_pincount is used + * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. @@ -1674,7 +1664,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_test_large(folio)) - return atomic_read(folio_pincount_ptr(folio)) > 0; + return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 10b6eb311ede..6ff1d7db00a7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -443,11 +443,6 @@ static inline atomic_t *subpages_mapcount_ptr(struct page *page) return &page[1].subpages_mapcount; } -static inline atomic_t *compound_pincount_ptr(struct page *page) -{ - return &page[1].compound_pincount; -} - /* * Used for sizing the vmemmap region on some architectures */ diff --git a/mm/debug.c b/mm/debug.c index 7f8e5f744e42..893c9dbf76ca 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,11 +94,11 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d pincount:%d\n", head, compound_order(head), head_compound_mapcount(head), head_subpages_mapcount(head), - head_compound_pincount(head)); + atomic_read(&folio->_pincount)); } #ifdef CONFIG_MEMCG diff --git a/mm/gup.c b/mm/gup.c index f45a3a5be53a..38ba1697dd61 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -111,7 +111,7 @@ retry: * FOLL_GET: folio's refcount will be incremented by @refs. * * FOLL_PIN on large folios: folio's refcount will be incremented by - * @refs, and its compound_pincount will be incremented by @refs. + * @refs, and its pincount will be incremented by @refs. * * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. @@ -157,7 +157,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) * try_get_folio() is left intact. 
*/ if (folio_test_large(folio)) - atomic_add(refs, folio_pincount_ptr(folio)); + atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); @@ -182,7 +182,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) if (flags & FOLL_PIN) { node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) - atomic_sub(refs, folio_pincount_ptr(folio)); + atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; } @@ -232,7 +232,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); - atomic_add(1, folio_pincount_ptr(folio)); + atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c13b1f67d14e..9570f03cdee4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2477,9 +2477,9 @@ static void __split_huge_page_tail(struct page *head, int tail, * of swap cache pages that store the swp_entry_t in tail pages. * Fix up and warn once if private is unexpectedly set. * - * What of 32-bit systems, on which head[1].compound_pincount overlays + * What of 32-bit systems, on which folio->_pincount overlays * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and - * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. + * pincount must be 0 for folio_ref_freeze() to have succeeded. */ if (!folio_test_swapcache(page_folio(head))) { VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 273a6522aa4c..15b2707c1600 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1476,7 +1476,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, atomic_set(folio_mapcount_ptr(folio), 0); atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { p = folio_page(folio, i); @@ -1998,7 +1998,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, } atomic_set(folio_mapcount_ptr(folio), -1); atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_pincount, 0); return true; out_error: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5514d84cc712..b224c2132ed1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -775,11 +775,13 @@ void free_compound_page(struct page *page) static void prep_compound_head(struct page *page, unsigned int order) { + struct folio *folio = (struct folio *)page; + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); atomic_set(subpages_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(&folio->_pincount, 0); } static void prep_compound_tail(struct page *head, int tail_idx) @@ -1291,6 +1293,7 @@ static inline bool free_page_is_bad(struct page *page) static int free_tail_pages_check(struct page *head_page, struct page *page) { + struct folio *folio = (struct folio *)head_page; int ret = 1; /* @@ -1314,8 +1317,8 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) bad_page(page, "nonzero subpages_mapcount"); goto out; } - if (unlikely(head_compound_pincount(head_page))) { - bad_page(page, "nonzero compound_pincount"); + if (unlikely(atomic_read(&folio->_pincount))) { + bad_page(page, "nonzero pincount"); goto out; } break; From 
eec20426d48bd7b63c69969a793943ed1a99b731 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:48 +0000 Subject: [PATCH 158/505] mm: convert head_subpages_mapcount() into folio_nr_pages_mapped() Calling this 'mapcount' is confusing since mapcount is usually the number of times something is mapped; instead this is the number of mapped pages. It's also better to enforce that this is a folio rather than a head page. Move folio_nr_pages_mapped() into mm/internal.h since this is not something we want device drivers or filesystems poking at. Get rid of folio_subpages_mapcount_ptr() and use folio->_nr_pages_mapped directly. Link: https://lkml.kernel.org/r/20230111142915.1001531-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++-------------------- include/linux/mm_types.h | 12 +++--------- mm/debug.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/internal.h | 18 ++++++++++++++++++ mm/rmap.c | 9 +++++---- 6 files changed, 32 insertions(+), 37 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6d3945207067..2bdd08a5b8b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -843,24 +843,6 @@ static inline int head_compound_mapcount(struct page *head) return atomic_read(compound_mapcount_ptr(head)) + 1; } -/* - * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, - * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit - * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently - * leaves subpages_mapcount at 0, but avoid surprise if it participates later. - */ -#define COMPOUND_MAPPED 0x800000 -#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1) - -/* - * Number of sub-pages mapped by PTE, does not include compound mapcount. - * Must be called only on head of compound page. - */ -static inline int head_subpages_mapcount(struct page *head) -{ - return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; -} - /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -920,9 +902,9 @@ static inline bool folio_large_is_mapped(struct folio *folio) { /* * Reading folio_mapcount_ptr() below could be omitted if hugetlb - * participated in incrementing subpages_mapcount when compound mapped. + * participated in incrementing nr_pages_mapped when compound mapped. */ - return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || + return atomic_read(&folio->_nr_pages_mapped) > 0 || atomic_read(folio_mapcount_ptr(folio)) >= 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6ff1d7db00a7..4751c67b98a6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -307,7 +307,7 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). - * @_subpages_mapcount: Do not use directly, call folio_mapcount(). + * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * @_flags_2: For alignment. Do not use. 
@@ -361,7 +361,7 @@ struct folio { unsigned char _folio_dtor; unsigned char _folio_order; atomic_t _compound_mapcount; - atomic_t _subpages_mapcount; + atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; @@ -404,7 +404,7 @@ FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); FOLIO_MATCH(compound_mapcount, _compound_mapcount); -FOLIO_MATCH(subpages_mapcount, _subpages_mapcount); +FOLIO_MATCH(subpages_mapcount, _nr_pages_mapped); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); @@ -427,12 +427,6 @@ static inline atomic_t *folio_mapcount_ptr(struct folio *folio) return &tail->compound_mapcount; } -static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio) -{ - struct page *tail = &folio->page + 1; - return &tail->subpages_mapcount; -} - static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; diff --git a/mm/debug.c b/mm/debug.c index 893c9dbf76ca..8e58e8dab0b2 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,10 +94,10 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d nr_pages_mapped:%d pincount:%d\n", head, compound_order(head), head_compound_mapcount(head), - head_subpages_mapcount(head), + folio_nr_pages_mapped(folio), atomic_read(&folio->_pincount)); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 15b2707c1600..c9702224931c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1475,7 +1475,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, struct page *p; atomic_set(folio_mapcount_ptr(folio), 0); - atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { @@ -1997,7 +1997,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, set_compound_head(p, &folio->page); } atomic_set(folio_mapcount_ptr(folio), -1); - atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); return true; diff --git a/mm/internal.h b/mm/internal.h index 1d6f4e168510..583e15357e09 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -52,6 +52,24 @@ struct folio_batch; void page_writeback_init(void); +/* + * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, + * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1) + +/* + * How many individual pages have an elevated _mapcount. Excludes + * the folio's entire_mapcount. 
+ */ +static inline int folio_nr_pages_mapped(struct folio *folio) +{ + return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; +} + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index 6ccd42bbae93..b573472b4ac3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1080,12 +1080,13 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, int total_compound_mapcount(struct page *head) { + struct folio *folio = (struct folio *)head; int mapcount = head_compound_mapcount(head); int nr_subpages; int i; /* In the common case, avoid the loop when no subpages mapped by PTE */ - if (head_subpages_mapcount(head) == 0) + if (folio_nr_pages_mapped(folio) == 0) return mapcount; /* * Add all the PTE mappings of those subpages mapped by PTE. @@ -1233,7 +1234,7 @@ void page_add_anon_rmap(struct page *page, nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; @@ -1337,7 +1338,7 @@ void page_add_file_rmap(struct page *page, nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; @@ -1399,7 +1400,7 @@ void page_remove_rmap(struct page *page, nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED)) { nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; From 6eee1a0062298601dfc36bd34517affc4458c43d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:49 +0000 Subject: [PATCH 159/505] doc: clarify refcount section by referring to folios & pages Include the rename of subpages_mapcount to _nr_pages_mapped. Link: https://lkml.kernel.org/r/20230111142915.1001531-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index ec3dc5b04226..03bbd0a19041 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -112,20 +112,20 @@ Refcounts and transparent huge pages Refcounting on THP is mostly consistent with refcounting on other compound pages: - - get_page()/put_page() and GUP operate on head page's ->_refcount. + - get_page()/put_page() and GUP operate on the folio->_refcount. - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeeds on tail pages. - - map/unmap of PMD entry for the whole compound page increment/decrement - ->compound_mapcount, stored in the first tail page of the compound page; - and also increment/decrement ->subpages_mapcount (also in the first tail) - by COMPOUND_MAPPED when compound_mapcount goes from -1 to 0 or 0 to -1. 
+ - map/unmap of a PMD entry for the whole THP increment/decrement + folio->_entire_mapcount and also increment/decrement + folio->_nr_pages_mapped by COMPOUND_MAPPED when _entire_mapcount + goes from -1 to 0 or 0 to -1. - - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount - on relevant sub-page of the compound page, and also increment/decrement - ->subpages_mapcount, stored in first tail page of the compound page, when - _mapcount goes from -1 to 0 or 0 to -1: counting sub-pages mapped by PTE. + - map/unmap of individual pages with PTE entry increment/decrement + page->_mapcount and also increment/decrement folio->_nr_pages_mapped + when page->_mapcount goes from -1 to 0 or 0 to -1 as this counts + the number of pages mapped by PTE. split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page From b14224fbea62e5bffd680613376fe1268f4103ba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:50 +0000 Subject: [PATCH 160/505] mm: convert total_compound_mapcount() to folio_total_mapcount() Instead of enforcing that the argument must be a head page by naming, enforce it with the compiler by making it a folio. Also rename the counter in struct folio from _compound_mapcount to _entire_mapcount. Link: https://lkml.kernel.org/r/20230111142915.1001531-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- include/linux/mm_types.h | 6 +++--- mm/rmap.c | 21 ++++++++++----------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2bdd08a5b8b4..bdf83e75bcd6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -871,7 +871,7 @@ static inline int page_mapcount(struct page *page) return head_compound_mapcount(page) + mapcount; } -int total_compound_mapcount(struct page *head); +int folio_total_mapcount(struct folio *folio); /** * folio_mapcount() - Calculate the number of mappings of this folio. @@ -888,14 +888,14 @@ static inline int folio_mapcount(struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; - return total_compound_mapcount(&folio->page); + return folio_total_mapcount(folio); } static inline int total_mapcount(struct page *page) { if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) + 1; - return total_compound_mapcount(compound_head(page)); + return folio_total_mapcount(page_folio(page)); } static inline bool folio_large_is_mapped(struct folio *folio) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4751c67b98a6..70cbda768308 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -306,7 +306,7 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). - * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). 
@@ -360,7 +360,7 @@ struct folio { unsigned long _head_1; unsigned char _folio_dtor; unsigned char _folio_order; - atomic_t _compound_mapcount; + atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT @@ -403,7 +403,7 @@ FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _compound_mapcount); +FOLIO_MATCH(compound_mapcount, _entire_mapcount); FOLIO_MATCH(subpages_mapcount, _nr_pages_mapped); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT diff --git a/mm/rmap.c b/mm/rmap.c index b573472b4ac3..1418efc80027 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1078,27 +1078,26 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -int total_compound_mapcount(struct page *head) +int folio_total_mapcount(struct folio *folio) { - struct folio *folio = (struct folio *)head; - int mapcount = head_compound_mapcount(head); - int nr_subpages; + int mapcount = folio_entire_mapcount(folio); + int nr_pages; int i; - /* In the common case, avoid the loop when no subpages mapped by PTE */ + /* In the common case, avoid the loop when no pages mapped by PTE */ if (folio_nr_pages_mapped(folio) == 0) return mapcount; /* - * Add all the PTE mappings of those subpages mapped by PTE. - * Limit the loop, knowing that only subpages_mapcount are mapped? + * Add all the PTE mappings of those pages mapped by PTE. + * Limit the loop to folio_nr_pages_mapped()? * Perhaps: given all the raciness, that may be a good or a bad idea. */ - nr_subpages = thp_nr_pages(head); - for (i = 0; i < nr_subpages; i++) - mapcount += atomic_read(&head[i]._mapcount); + nr_pages = folio_nr_pages(folio); + for (i = 0; i < nr_pages; i++) + mapcount += atomic_read(&folio_page(folio, i)->_mapcount); /* But each of those _mapcounts was based on -1 */ - mapcount += nr_subpages; + mapcount += nr_pages; return mapcount; } From 62beb906ef644b0f0555b2b9f9626c27e2038d84 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:51 +0000 Subject: [PATCH 161/505] mm: convert page_remove_rmap() to use a folio internally The API for page_remove_rmap() needs to be page-based, because we can remove mappings of pages individually. But inside the function, we want to only call compound_head() once and then use the folio APIs instead of the page APIs that each call compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 1418efc80027..7762ab8371c0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1365,19 +1365,21 @@ void page_add_file_rmap(struct page *page, * * The caller needs to hold the pte lock. 
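The conversion follows the same shape as page_remove_rmap(): resolve the folio once at the top of the function and then work on folio fields. A rough sketch of the entry of the converted function (illustrative only; see the diff below):

	struct folio *folio = page_folio(page);	/* compound_head() exactly once */
	atomic_t *mapped = &folio->_nr_pages_mapped;
	int nr = 0;
	bool first;

	if (likely(!compound)) {
		first = atomic_inc_and_test(&page->_mapcount);
		nr = first;
		if (first && folio_test_large(folio)) {
			nr = atomic_inc_return_relaxed(mapped);
			nr = (nr < COMPOUND_MAPPED);
		}
	}
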
*/ -void page_remove_rmap(struct page *page, - struct vm_area_struct *vma, bool compound) +void page_remove_rmap(struct page *page, struct vm_area_struct *vma, + bool compound) { - atomic_t *mapped; + struct folio *folio = page_folio(page); + atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool last; + enum node_stat_item idx; VM_BUG_ON_PAGE(compound && !PageHead(page), page); /* Hugetlb pages are not counted in NR_*MAPPED */ - if (unlikely(PageHuge(page))) { + if (unlikely(folio_test_hugetlb(folio))) { /* hugetlb pages are always mapped with pmds */ - atomic_dec(compound_mapcount_ptr(page)); + atomic_dec(&folio->_entire_mapcount); return; } @@ -1385,20 +1387,18 @@ void page_remove_rmap(struct page *page, if (likely(!compound)) { last = atomic_add_negative(-1, &page->_mapcount); nr = last; - if (last && PageCompound(page)) { - mapped = subpages_mapcount_ptr(compound_head(page)); + if (last && folio_test_large(folio)) { nr = atomic_dec_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } - } else if (PageTransHuge(page)) { + } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ - last = atomic_add_negative(-1, compound_mapcount_ptr(page)); + last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { - mapped = subpages_mapcount_ptr(page); nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED)) { - nr_pmdmapped = thp_nr_pages(page); + nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) @@ -1411,21 +1411,26 @@ void page_remove_rmap(struct page *page, } if (nr_pmdmapped) { - __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_THPS : - (PageSwapBacked(page) ? NR_SHMEM_PMDMAPPED : - NR_FILE_PMDMAPPED), -nr_pmdmapped); + if (folio_test_anon(folio)) + idx = NR_ANON_THPS; + else if (folio_test_swapbacked(folio)) + idx = NR_SHMEM_PMDMAPPED; + else + idx = NR_FILE_PMDMAPPED; + __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped); } if (nr) { - __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_MAPPED : - NR_FILE_MAPPED, -nr); + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; + __lruvec_stat_mod_folio(folio, idx, -nr); + /* - * Queue anon THP for deferred split if at least one small - * page of the compound page is unmapped, but at least one - * small page is still mapped. + * Queue anon THP for deferred split if at least one + * page of the folio is unmapped and at least one page + * is still mapped. */ - if (PageTransCompound(page) && PageAnon(page)) + if (folio_test_pmd_mappable(folio) && folio_test_anon(folio)) if (!compound || nr < nr_pmdmapped) - deferred_split_huge_page(compound_head(page)); + deferred_split_huge_page(&folio->page); } /* From ee0800c2f6a9e605947ce499d79fb7e2be16d6dd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:52 +0000 Subject: [PATCH 162/505] mm: convert page_add_anon_rmap() to use a folio internally The API for page_add_anon_rmap() needs to be page-based, because we can add mappings of individual pages. But inside the function, we want to only call compound_head() once and then use the folio APIs instead of the page APIs that each call compound_head(). 
Link: https://lkml.kernel.org/r/20230111142915.1001531-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7762ab8371c0..7d5e0bf98c1e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1207,10 +1207,11 @@ static void __page_check_anon_rmap(struct page *page, * and to ensure that PageAnon is not being upgraded racily to PageKsm * (but PageKsm is never downgraded to PageAnon). */ -void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, rmap_t flags) +void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address, rmap_t flags) { - atomic_t *mapped; + struct folio *folio = page_folio(page); + atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; bool first = true; @@ -1219,20 +1220,18 @@ void page_add_anon_rmap(struct page *page, if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; - if (first && PageCompound(page)) { - mapped = subpages_mapcount_ptr(compound_head(page)); + if (first && folio_test_large(folio)) { nr = atomic_inc_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } - } else if (PageTransHuge(page)) { + } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ - first = atomic_inc_and_test(compound_mapcount_ptr(page)); + first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { - mapped = subpages_mapcount_ptr(page); nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { - nr_pmdmapped = thp_nr_pages(page); + nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) @@ -1248,11 +1247,11 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (nr_pmdmapped) - __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pmdmapped); + __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); if (nr) - __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); + __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - if (likely(!PageKsm(page))) { + if (likely(!folio_test_ksm(folio))) { /* address might be in next vma when migration races vma_adjust */ if (first) __page_set_anon_rmap(page, vma, address, From eb01a2ad7e9cba1b9dd131edc5a26ffbda90a5ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:53 +0000 Subject: [PATCH 163/505] mm: convert page_add_file_rmap() to use a folio internally The API for page_add_file_rmap() needs to be page-based, because we can add mappings of individual pages. But inside the function, we want to only call compound_head() once and then use the folio APIs instead of the page APIs that each call compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7d5e0bf98c1e..430c066c2295 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1309,10 +1309,11 @@ void page_add_new_anon_rmap(struct page *page, * * The caller needs to hold the pte lock. 
*/ -void page_add_file_rmap(struct page *page, - struct vm_area_struct *vma, bool compound) +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, + bool compound) { - atomic_t *mapped; + struct folio *folio = page_folio(page); + atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool first; @@ -1322,20 +1323,18 @@ void page_add_file_rmap(struct page *page, if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; - if (first && PageCompound(page)) { - mapped = subpages_mapcount_ptr(compound_head(page)); + if (first && folio_test_large(folio)) { nr = atomic_inc_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } - } else if (PageTransHuge(page)) { + } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ - first = atomic_inc_and_test(compound_mapcount_ptr(page)); + first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { - mapped = subpages_mapcount_ptr(page); nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { - nr_pmdmapped = thp_nr_pages(page); + nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) @@ -1348,10 +1347,10 @@ void page_add_file_rmap(struct page *page, } if (nr_pmdmapped) - __mod_lruvec_page_state(page, PageSwapBacked(page) ? + __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) - __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); + __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); mlock_vma_page(page, vma, compound); } From 4d510f3da4c216d4c2695395f67aec38e2aa6cc7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:54 +0000 Subject: [PATCH 164/505] mm: add folio_add_new_anon_rmap() In contrast to other rmap functions, page_add_new_anon_rmap() is always called with a freshly allocated page. That means it can't be called with a tail page. Turn page_add_new_anon_rmap() into folio_add_new_anon_rmap() and add a page_add_new_anon_rmap() wrapper. Callers can be converted individually. [akpm@linux-foundation.org: fix NOMMU build. 
page_add_new_anon_rmap() requires CONFIG_MMU] [willy@infradead.org: folio-compat.c needs rmap.h] Link: https://lkml.kernel.org/r/20230111142915.1001531-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 ++ mm/folio-compat.c | 11 +++++++++++ mm/rmap.c | 37 ++++++++++++++++++------------------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bd3504d11b15..aa682a2a93ce 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -194,6 +194,8 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); +void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, + unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 69ed25790c68..18c48b557926 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -6,6 +6,7 @@ #include #include +#include #include #include "internal.h" @@ -123,3 +124,13 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } + +#ifdef CONFIG_MMU +void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address) +{ + VM_BUG_ON_PAGE(PageTail(page), page); + + return folio_add_new_anon_rmap((struct folio *)page, vma, address); +} +#endif diff --git a/mm/rmap.c b/mm/rmap.c index 430c066c2295..513e23e5a158 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1264,41 +1264,40 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, } /** - * page_add_new_anon_rmap - add mapping to a new anonymous page - * @page: the page to add the mapping to + * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. + * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * If it's a compound page, it is accounted as a compound page. As the page - * is new, it's assume to get mapped exclusively by a single process. - * - * Same as page_add_anon_rmap but must only be called on *new* pages. + * Like page_add_anon_rmap() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. - * Page does not have to be locked. + * The folio does not have to be locked. + * + * If the folio is large, it is accounted as a THP. As the folio + * is new, it's assumed to be mapped exclusively by a single process. 
*/ -void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) +void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, + unsigned long address) { int nr; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - __SetPageSwapBacked(page); + __folio_set_swapbacked(folio); - if (likely(!PageCompound(page))) { + if (likely(!folio_test_pmd_mappable(folio))) { /* increment count (starts at -1) */ - atomic_set(&page->_mapcount, 0); + atomic_set(&folio->_mapcount, 0); nr = 1; } else { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ - atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(subpages_mapcount_ptr(page), COMPOUND_MAPPED); - nr = thp_nr_pages(page); - __mod_lruvec_page_state(page, NR_ANON_THPS, nr); + atomic_set(&folio->_entire_mapcount, 0); + atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED); + nr = folio_nr_pages(folio); + __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); } - __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); - __page_set_anon_rmap(page, vma, address, 1); + __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); + __page_set_anon_rmap(&folio->page, vma, address, 1); } /** From 65a689f35ad7ebbfb79f429c1bc290b042ebb10b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:55 +0000 Subject: [PATCH 165/505] page_alloc: use folio fields directly Rmove the uses of compound_mapcount_ptr(), head_compound_mapcount() and subpages_mapcount_ptr() Link: https://lkml.kernel.org/r/20230111142915.1001531-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b224c2132ed1..f15e0e15243f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -779,8 +779,8 @@ static void prep_compound_head(struct page *page, unsigned int order) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - atomic_set(subpages_mapcount_ptr(page), 0); + atomic_set(&folio->_entire_mapcount, -1); + atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); } @@ -1309,12 +1309,12 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) switch (page - head_page) { case 1: /* the first tail page: these may be in place of ->mapping */ - if (unlikely(head_compound_mapcount(head_page))) { - bad_page(page, "nonzero compound_mapcount"); + if (unlikely(folio_entire_mapcount(folio))) { + bad_page(page, "nonzero entire_mapcount"); goto out; } - if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) { - bad_page(page, "nonzero subpages_mapcount"); + if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { + bad_page(page, "nonzero nr_pages_mapped"); goto out; } if (unlikely(atomic_read(&folio->_pincount))) { From db4e5dbdcdd55482ab23bf4a0ae6746f93efb0d9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:56 +0000 Subject: [PATCH 166/505] mm: use a folio in hugepage_add_anon_rmap() and hugepage_add_new_anon_rmap() Remove uses of compound_mapcount_ptr() Link: https://lkml.kernel.org/r/20230111142915.1001531-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 513e23e5a158..0020474f46c1 100644 --- a/mm/rmap.c 
+++ b/mm/rmap.c @@ -2528,13 +2528,14 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { + struct folio *folio = page_folio(page); struct anon_vma *anon_vma = vma->anon_vma; int first; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!anon_vma); /* address might be in next vma when migration races vma_adjust */ - first = atomic_inc_and_test(compound_mapcount_ptr(page)); + first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) @@ -2545,10 +2546,12 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + struct folio *folio = page_folio(page); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ - atomic_set(compound_mapcount_ptr(page), 0); - ClearHPageRestoreReserve(page); + atomic_set(&folio->_entire_mapcount, 0); + folio_clear_hugetlb_restore_reserve(folio); __page_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ From c7f84b5723f1a60becd79d895ab214a7d5ee93c1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:57 +0000 Subject: [PATCH 167/505] mm: use entire_mapcount in __page_dup_rmap() Remove the use of the compound_mapcount_ptr() wrapper, and add an assertion that we're not passing a tail page if we're duplicating a PMD. Link: https://lkml.kernel.org/r/20230111142915.1001531-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index aa682a2a93ce..a6bd1f0a183d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -208,7 +208,14 @@ void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, static inline void __page_dup_rmap(struct page *page, bool compound) { - atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); + if (compound) { + struct folio *folio = (struct folio *)page; + + VM_BUG_ON_PAGE(compound && !PageHead(page), page); + atomic_inc(&folio->_entire_mapcount); + } else { + atomic_inc(&page->_mapcount); + } } static inline void page_dup_file_rmap(struct page *page, bool compound) From 91ec7f284a0c445b251caba942c993ebf7b6db9f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:58 +0000 Subject: [PATCH 168/505] mm/debug: remove call to head_compound_mapcount() Call folio_entire_mapcount() instead. 
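For readers following the conversion, a minimal sketch of what the printed value amounts to; the wrapper name is illustrative, and the body mirrors folio_entire_mapcount() as it reads after this series (the stored count is biased by -1):

/* Illustrative only: what __dump_page() now reports for a compound page. */
static inline int example_entire_mapcount(struct folio *folio)
{
        /* _entire_mapcount starts at -1, so an unmapped folio reads as 0. */
        return atomic_read(&folio->_entire_mapcount) + 1;
}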
Link: https://lkml.kernel.org/r/20230111142915.1001531-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index 8e58e8dab0b2..9d3d893dc7f4 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,9 +94,9 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d nr_pages_mapped:%d pincount:%d\n", + pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", head, compound_order(head), - head_compound_mapcount(head), + folio_entire_mapcount(folio), folio_nr_pages_mapped(folio), atomic_read(&folio->_pincount)); } From 46f2722825983a51e849eb0ef2814e5c7f040fef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:59 +0000 Subject: [PATCH 169/505] hugetlb: remove uses of folio_mapcount_ptr Use the entire_mapcount field directly. Link: https://lkml.kernel.org/r/20230111142915.1001531-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c9702224931c..a68e0e597a8f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1474,7 +1474,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, int nr_pages = 1 << order; struct page *p; - atomic_set(folio_mapcount_ptr(folio), 0); + atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); @@ -1996,7 +1996,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, if (i != 0) set_compound_head(p, &folio->page); } - atomic_set(folio_mapcount_ptr(folio), -1); + atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); return true; From c97eeb8f260dba098ba775e37d216f81f28559a9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:00 +0000 Subject: [PATCH 170/505] mm: convert page_mapcount() to use folio_entire_mapcount() Remove a use of head_compound_mapcount(). Link: https://lkml.kernel.org/r/20230111142915.1001531-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index bdf83e75bcd6..a6afa6c51a4d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -853,22 +853,26 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -/* - * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount of compound_head of page. +/** + * page_mapcount() - Number of times this precise page is mapped. + * @page: The page. * - * Result is undefined for pages which cannot be mapped into userspace. + * The number of times this page is mapped. If this page is part of + * a large folio, it includes the number of times this page is mapped + * as part of that folio. + * + * The result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). - * They use this place in struct page differently. + * They use this field in struct page differently. 
*/ static inline int page_mapcount(struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; - if (likely(!PageCompound(page))) - return mapcount; - page = compound_head(page); - return head_compound_mapcount(page) + mapcount; + if (unlikely(PageCompound(page))) + mapcount += folio_entire_mapcount(page_folio(page)); + + return mapcount; } int folio_total_mapcount(struct folio *folio); From 1aa4d03b60c0f61a8d96d5d633bf7968dbf6841f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:01 +0000 Subject: [PATCH 171/505] mm: remove head_compound_mapcount() and _ptr functions folio_mapcount_ptr(), compound_mapcount_ptr() and subpages_mapcount_ptr() are all now unused. Link: https://lkml.kernel.org/r/20230111142915.1001531-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 +++------------ include/linux/mm_types.h | 16 ---------------- 2 files changed, 3 insertions(+), 28 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a6afa6c51a4d..7ff6e2410aa3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -831,16 +831,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) static inline int folio_entire_mapcount(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - return atomic_read(folio_mapcount_ptr(folio)) + 1; -} - -/* - * Mapcount of compound page as a whole, does not include mapped sub-pages. - * Must be called only on head of compound page. - */ -static inline int head_compound_mapcount(struct page *head) -{ - return atomic_read(compound_mapcount_ptr(head)) + 1; + return atomic_read(&folio->_entire_mapcount) + 1; } /* @@ -905,11 +896,11 @@ static inline int total_mapcount(struct page *page) static inline bool folio_large_is_mapped(struct folio *folio) { /* - * Reading folio_mapcount_ptr() below could be omitted if hugetlb + * Reading _entire_mapcount below could be omitted if hugetlb * participated in incrementing nr_pages_mapped when compound mapped. */ return atomic_read(&folio->_nr_pages_mapped) > 0 || - atomic_read(folio_mapcount_ptr(folio)) >= 0; + atomic_read(&folio->_entire_mapcount) >= 0; } /** diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 70cbda768308..ffcf21fbaaf0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -421,22 +421,6 @@ FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); #undef FOLIO_MATCH -static inline atomic_t *folio_mapcount_ptr(struct folio *folio) -{ - struct page *tail = &folio->page + 1; - return &tail->compound_mapcount; -} - -static inline atomic_t *compound_mapcount_ptr(struct page *page) -{ - return &page[1].compound_mapcount; -} - -static inline atomic_t *subpages_mapcount_ptr(struct page *page) -{ - return &page[1].subpages_mapcount; -} - /* * Used for sizing the vmemmap region on some architectures */ From 5eb5cea11dcbafaa37685bc4e89e1d4ae9c434ea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:02 +0000 Subject: [PATCH 172/505] mm: reimplement compound_order() Make compound_order() use struct folio. It can't be turned into a wrapper around folio_order() as a page can be turned into a tail page between a check in compound_order() and the assertion in folio_test_large(). 
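A hedged sketch of the distinction drawn here: a caller that holds a reference can convert to a folio and use folio_order(), while compound_order() remains the tolerant, reference-free variant (the helper name below is illustrative):

/* Illustrative only: preferred pattern when the caller holds a reference. */
static unsigned int example_referenced_order(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_order(folio);
}
/*
 * Without a reference, keep using compound_order(page) and be prepared
 * for wild return values, as the new comment below warns.
 */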
Link: https://lkml.kernel.org/r/20230111142915.1001531-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ff6e2410aa3..3adc37cebe6f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -719,11 +719,20 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +/* + * compound_order() can be called without holding a reference, which means + * that niceties like page_folio() don't work. These callers should be + * prepared to handle wild return values. For example, PG_head may be + * set before _folio_order is initialised, or this may be a tail page. + * See compaction.c for some good examples. + */ static inline unsigned int compound_order(struct page *page) { - if (!PageHead(page)) + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) return 0; - return page[1].compound_order; + return folio->_folio_order; } /** From 21a000fe97a018c6d25be63892afb4fd8210ab57 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:03 +0000 Subject: [PATCH 173/505] mm: reimplement compound_nr() Turn compound_nr() into a wrapper around folio_nr_pages(). Similarly to compound_order(), casting the struct page directly to struct folio preserves the existing behaviour, while calling page_folio() would change the behaviour. Move thp_nr_pages() down in the file so that compound_nr() can be after folio_nr_pages(). [willy@infradead.org: fix assertion triggering] Link: https://lkml.kernel.org/r/Y8AFgZEEjnUIaCbf@casper.infradead.org Link: https://lkml.kernel.org/r/20230111142915.1001531-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/mm.h | 49 +++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3adc37cebe6f..3acf09d5b0bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1005,18 +1005,6 @@ static inline void set_compound_order(struct page *page, unsigned int order) #endif } -/* Returns the number of pages in this potentially compound page. */ -static inline unsigned long compound_nr(struct page *page) -{ - if (!PageHead(page)) - return 1; -#ifdef CONFIG_64BIT - return page[1].compound_nr; -#else - return 1UL << compound_order(page); -#endif -} - /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { @@ -1039,16 +1027,6 @@ static inline unsigned int thp_order(struct page *page) return compound_order(page); } -/** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline int thp_nr_pages(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return compound_nr(page); -} - /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. @@ -1758,6 +1736,33 @@ static inline long folio_nr_pages(struct folio *folio) #endif } +/* + * compound_nr() returns the number of pages in this potentially compound + * page. compound_nr() can be called on a tail page, and is defined to + * return 1 in that case. 
+ */ +static inline unsigned long compound_nr(struct page *page) +{ + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif +} + +/** + * thp_nr_pages - The number of regular pages in this huge page. + * @page: The head page of a huge page. + */ +static inline int thp_nr_pages(struct page *page) +{ + return folio_nr_pages((struct folio *)page); +} + /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. From bad6da64565846ef5ba85b0b685cfde9db0085dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:04 +0000 Subject: [PATCH 174/505] mm: convert set_compound_page_dtor() and set_compound_order() to folios Replace uses of compound_dtor, compound_order and compound_nr by their folio equivalents. Link: https://lkml.kernel.org/r/20230111142915.1001531-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3acf09d5b0bd..836b96e08a14 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -984,8 +984,11 @@ extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) { + struct folio *folio = (struct folio *)page; + VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); - page[1].compound_dtor = compound_dtor; + VM_BUG_ON_PAGE(!PageHead(page), page); + folio->_folio_dtor = compound_dtor; } static inline void folio_set_compound_dtor(struct folio *folio, @@ -999,9 +1002,11 @@ void destroy_large_folio(struct folio *folio); static inline void set_compound_order(struct page *page, unsigned int order) { - page[1].compound_order = order; + struct folio *folio = (struct folio *)page; + + folio->_folio_order = order; #ifdef CONFIG_64BIT - page[1].compound_nr = 1U << order; + folio->_folio_nr_pages = 1U << order; #endif } From f04029f34e8c3750b8fb39b54e788b9355d1b912 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:05 +0000 Subject: [PATCH 175/505] mm: convert is_transparent_hugepage() to use a folio Replace a use of page->compound_dtor with its folio equivalent. Link: https://lkml.kernel.org/r/20230111142915.1001531-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9570f03cdee4..bfa960f012fa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -591,12 +591,14 @@ void prep_transhuge_page(struct page *page) static inline bool is_transparent_hugepage(struct page *page) { + struct folio *folio; + if (!PageCompound(page)) return false; - page = compound_head(page); - return is_huge_zero_page(page) || - page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; + folio = page_folio(page); + return is_huge_zero_page(&folio->page) || + folio->_folio_dtor == TRANSHUGE_PAGE_DTOR; } static unsigned long __thp_get_unmapped_area(struct file *filp, From a60d5942cc9bd65806b69360020d9b1664c747ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:06 +0000 Subject: [PATCH 176/505] mm: convert destroy_large_folio() to use folio_dtor Replace a use of compound_dtor. 
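A minimal sketch of the registration/dispatch pairing this change relies on, using only names visible in the surrounding hunks (the example_* wrappers are illustrative):

/*
 * Illustrative only: the dtor id stored in the folio at prep time is what
 * destroy_large_folio() reads back to pick the destructor.
 */
static void example_prep(struct folio *folio)
{
        folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
}

static void example_free(struct folio *folio)
{
        enum compound_dtor_id dtor = folio->_folio_dtor;

        compound_page_dtors[dtor](&folio->page);
}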
Link: https://lkml.kernel.org/r/20230111142915.1001531-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f15e0e15243f..88494e82843d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -807,7 +807,7 @@ void prep_compound_page(struct page *page, unsigned int order) void destroy_large_folio(struct folio *folio) { - enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + enum compound_dtor_id dtor = folio->_folio_dtor; VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); compound_page_dtors[dtor](&folio->page); From 2d678c641a4625d2b1cfeb50d7426fab6d3740b3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:07 +0000 Subject: [PATCH 177/505] hugetlb: remove uses of compound_dtor and compound_nr Convert the entire file to use the folio equivalents. Link: https://lkml.kernel.org/r/20230111142915.1001531-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a68e0e597a8f..ca9e177b9c54 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2038,11 +2038,12 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, */ int PageHuge(struct page *page) { + struct folio *folio; + if (!PageCompound(page)) return 0; - - page = compound_head(page); - return page[1].compound_dtor == HUGETLB_PAGE_DTOR; + folio = page_folio(page); + return folio->_folio_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHuge); @@ -2052,10 +2053,11 @@ EXPORT_SYMBOL_GPL(PageHuge); */ int PageHeadHuge(struct page *page_head) { - if (!PageHead(page_head)) + struct folio *folio = (struct folio *)page_head; + if (!folio_test_large(folio)) return 0; - return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; + return folio->_folio_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHeadHuge); From 1c5509be58f636afabbdaf66e7436da8ec0a1828 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:08 +0000 Subject: [PATCH 178/505] mm: remove 'First tail page' members from struct page All former users now use the folio equivalents, so remove them from the definition of struct page. 
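For reference, the removed first-tail-page fields map onto the folio fields that replace them; the mapping below is taken directly from the FOLIO_MATCH() assertions deleted in this patch:

/*
 * page[1].compound_dtor     -> folio->_folio_dtor
 * page[1].compound_order    -> folio->_folio_order
 * page[1].compound_mapcount -> folio->_entire_mapcount
 * page[1].subpages_mapcount -> folio->_nr_pages_mapped
 * page[1].compound_pincount -> folio->_pincount
 * page[1].compound_nr       -> folio->_folio_nr_pages (CONFIG_64BIT only)
 */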
Link: https://lkml.kernel.org/r/20230111142915.1001531-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 18 ------------------ kernel/crash_core.c | 4 ++-- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ffcf21fbaaf0..94b1707f5d33 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -140,16 +140,6 @@ struct page { }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - atomic_t compound_mapcount; - atomic_t subpages_mapcount; - atomic_t compound_pincount; -#ifdef CONFIG_64BIT - unsigned int compound_nr; /* 1 << compound_order */ -#endif }; struct { /* Second tail page of transparent huge page */ unsigned long _compound_pad_1; /* compound_head */ @@ -401,14 +391,6 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); -FOLIO_MATCH(compound_dtor, _folio_dtor); -FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _entire_mapcount); -FOLIO_MATCH(subpages_mapcount, _nr_pages_mapped); -FOLIO_MATCH(compound_pincount, _pincount); -#ifdef CONFIG_64BIT -FOLIO_MATCH(compound_nr, _folio_nr_pages); -#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 87ef6096823f..755f5f08ab38 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,8 +455,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(folio, _folio_dtor); + VMCOREINFO_OFFSET(folio, _folio_order); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); From a8d55327ccc1f999a5fba4eee67ed08bd36493ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:09 +0000 Subject: [PATCH 179/505] doc: correct struct folio kernel-doc Insert appropriate public: and private: markers to make the generated kernel-doc look right. Link: https://lkml.kernel.org/r/20230111142915.1001531-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 94b1707f5d33..d458e9b8496c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -292,16 +292,12 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_flags_1: For large folios, additional page flags. - * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). 
* @_folio_nr_pages: Do not use directly, call folio_nr_pages(). - * @_flags_2: For alignment. Do not use. - * @_head_2: Points to the folio. Do not use. * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. @@ -348,6 +344,7 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + /* public: */ unsigned char _folio_dtor; unsigned char _folio_order; atomic_t _entire_mapcount; @@ -356,6 +353,7 @@ struct folio { #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; #endif + /* private: the union with struct page is transitional */ }; struct page __page_1; }; @@ -363,10 +361,12 @@ struct folio { struct { unsigned long _flags_2; unsigned long _head_2; + /* public: */ void *_hugetlb_subpool; void *_hugetlb_cgroup; void *_hugetlb_cgroup_rsvd; void *_hugetlb_hwpoison; + /* private: the union with struct page is transitional */ }; struct page __page_2; }; From 4375a553f46c6cb66d1711d8f514dfdf34ce74b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:10 +0000 Subject: [PATCH 180/505] mm: move page->deferred_list to folio->_deferred_list Remove the entire block of definitions for the second tail page, and add the deferred list to the struct folio. This actually moves _deferred_list to a different offset in struct folio because I don't see a need to include the padding. This lets us use list_for_each_entry_safe() in deferred_split_scan() and avoid a number of calls to compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 ++++----- include/linux/mm_types.h | 14 ++++++++------ mm/huge_memory.c | 32 +++++++++++++++----------------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a1341fdcf666..aacfcb02606f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -295,11 +295,10 @@ static inline bool thp_migration_supported(void) static inline struct list_head *page_deferred_list(struct page *page) { - /* - * See organization of tail pages of compound page in - * "struct page" definition. - */ - return &page[2].deferred_list; + struct folio *folio = (struct folio *)page; + + VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); + return &folio->_deferred_list; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d458e9b8496c..7eb4d0815a78 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -141,12 +141,6 @@ struct page { struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ }; - struct { /* Second tail page of transparent huge page */ - unsigned long _compound_pad_1; /* compound_head */ - unsigned long _compound_pad_2; - /* For both global and memcg */ - struct list_head deferred_list; - }; struct { /* Second tail page of hugetlb page */ unsigned long _hugetlb_pad_1; /* compound_head */ void *hugetlb_subpool; @@ -302,6 +296,7 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). 
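A rough sketch of the pattern this enables; locking, refcounting and queue-length accounting are elided, and the example_* names are illustrative (compare the deferred_split hunks below):

/* Illustrative only: folios now sit directly on the split queue. */
static void example_queue(struct deferred_split *ds_queue, struct folio *folio)
{
        if (list_empty(&folio->_deferred_list))
                list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
}

static void example_drain(struct deferred_split *ds_queue)
{
        struct folio *folio, *next;

        list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
                                 _deferred_list)
                list_del_init(&folio->_deferred_list);
}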
+ * @_deferred_list: Folios to be split under memory pressure. * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -366,6 +361,13 @@ struct folio { void *_hugetlb_cgroup; void *_hugetlb_cgroup_rsvd; void *_hugetlb_hwpoison; + /* private: the union with struct page is transitional */ + }; + struct { + unsigned long _flags_2a; + unsigned long _head_2a; + /* public: */ + struct list_head _deferred_list; /* private: the union with struct page is transitional */ }; struct page __page_2; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bfa960f012fa..a4138daaa0b8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2756,9 +2756,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - if (!list_empty(page_deferred_list(&folio->page))) { + if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(page_deferred_list(&folio->page)); + list_del(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { @@ -2873,8 +2873,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; - LIST_HEAD(list), *pos, *next; - struct page *page; + LIST_HEAD(list); + struct folio *folio, *next; int split = 0; #ifdef CONFIG_MEMCG @@ -2884,14 +2884,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &ds_queue->split_queue) { - page = list_entry((void *)pos, struct page, deferred_list); - page = compound_head(page); - if (get_page_unless_zero(page)) { - list_move(page_deferred_list(page), &list); + list_for_each_entry_safe(folio, next, &ds_queue->split_queue, + _deferred_list) { + if (folio_try_get(folio)) { + list_move(&folio->_deferred_list, &list); } else { - /* We lost race with put_compound_page() */ - list_del_init(page_deferred_list(page)); + /* We lost race with folio_put() */ + list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; } if (!--sc->nr_to_scan) @@ -2899,16 +2898,15 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - list_for_each_safe(pos, next, &list) { - page = list_entry((void *)pos, struct page, deferred_list); - if (!trylock_page(page)) + list_for_each_entry_safe(folio, next, &list, _deferred_list) { + if (!folio_trylock(folio)) goto next; /* split_huge_page() removes page from list on success */ - if (!split_huge_page(page)) + if (!split_folio(folio)) split++; - unlock_page(page); + folio_unlock(folio); next: - put_page(page); + folio_put(folio); } spin_lock_irqsave(&ds_queue->split_queue_lock, flags); From 8991de90e99755b13026b1db32d1fa52e94c6a96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:11 +0000 Subject: [PATCH 181/505] mm/huge_memory: remove page_deferred_list() Use folio->_deferred_list directly. 
Link: https://lkml.kernel.org/r/20230111142915.1001531-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 -------- mm/huge_memory.c | 34 +++++++++++++++++----------------- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index aacfcb02606f..b9978978a160 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -293,14 +293,6 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } -static inline struct list_head *page_deferred_list(struct page *page) -{ - struct folio *folio = (struct folio *)page; - - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); - return &folio->_deferred_list; -} - #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a4138daaa0b8..7aedfe7cf5df 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -580,12 +580,10 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page) void prep_transhuge_page(struct page *page) { - /* - * we use page->mapping and page->index in second tail page - * as list_head: assuming THP order >= 2 - */ + struct folio *folio = (struct folio *)page; - INIT_LIST_HEAD(page_deferred_list(page)); + VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); + INIT_LIST_HEAD(&folio->_deferred_list); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } @@ -2802,13 +2800,14 @@ out: void free_transhuge_page(struct page *page) { + struct folio *folio = (struct folio *)page; struct deferred_split *ds_queue = get_deferred_split_queue(page); unsigned long flags; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (!list_empty(page_deferred_list(page))) { + if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(page_deferred_list(page)); + list_del(&folio->_deferred_list); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); free_compound_page(page); @@ -2816,38 +2815,39 @@ void free_transhuge_page(struct page *page) void deferred_split_huge_page(struct page *page) { + struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(page); #ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = page_memcg(compound_head(page)); + struct mem_cgroup *memcg = folio_memcg(folio); #endif unsigned long flags; - VM_BUG_ON_PAGE(!PageTransHuge(page), page); + VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); /* * The try_to_unmap() in page reclaim path might reach here too, * this may cause a race condition to corrupt deferred split queue. - * And, if page reclaim is already handling the same page, it is + * And, if page reclaim is already handling the same folio, it is * unnecessary to handle it again in shrinker. * - * Check PageSwapCache to determine if the page is being - * handled by page reclaim since THP swap would add the page into + * Check the swapcache flag to determine if the folio is being + * handled by page reclaim since THP swap would add the folio into * swap cache before calling try_to_unmap(). 
*/ - if (PageSwapCache(page)) + if (folio_test_swapcache(folio)) return; - if (!list_empty(page_deferred_list(page))) + if (!list_empty(&folio->_deferred_list)) return; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (list_empty(page_deferred_list(page))) { + if (list_empty(&folio->_deferred_list)) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); - list_add_tail(page_deferred_list(page), &ds_queue->split_queue); + list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) - set_shrinker_bit(memcg, page_to_nid(page), + set_shrinker_bit(memcg, folio_nid(folio), deferred_split_shrinker.id); #endif } From f8baa6be0368b5d21be34e8bf071b563b0f77584 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:12 +0000 Subject: [PATCH 182/505] mm/huge_memory: convert get_deferred_split_queue() to take a folio Removes a few calls to compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7aedfe7cf5df..c23b0e01734b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -559,10 +559,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) } #ifdef CONFIG_MEMCG -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static inline +struct deferred_split *get_deferred_split_queue(struct folio *folio) { - struct mem_cgroup *memcg = page_memcg(compound_head(page)); - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); if (memcg) return &memcg->deferred_split_queue; @@ -570,9 +571,10 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page) return &pgdat->deferred_split_queue; } #else -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static inline +struct deferred_split *get_deferred_split_queue(struct folio *folio) { - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); return &pgdat->deferred_split_queue; } @@ -2650,7 +2652,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); - struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); + struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -2801,7 +2803,7 @@ out: void free_transhuge_page(struct page *page) { struct folio *folio = (struct folio *)page; - struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct deferred_split *ds_queue = get_deferred_split_queue(folio); unsigned long flags; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); @@ -2816,7 +2818,7 @@ void free_transhuge_page(struct page *page) void deferred_split_huge_page(struct page *page) { struct folio *folio = page_folio(page); - struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG struct mem_cgroup *memcg = folio_memcg(folio); #endif From 
f158ed6195ef949060811fd85086928470651944 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:13 +0000 Subject: [PATCH 183/505] mm: convert deferred_split_huge_page() to deferred_split_folio() Now that both callers use a folio, pass the folio in and save a call to compound_head(). Link: https://lkml.kernel.org/r/20230111142915.1001531-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 6 +++--- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 3 +-- mm/rmap.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index 03bbd0a19041..a9608fe51649 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -153,8 +153,8 @@ clear where references should go after split: it will stay on the head page. Note that split_huge_pmd() doesn't have any limitations on refcounting: pmd can be split at any point and never fails. -Partial unmap and deferred_split_huge_page() -============================================ +Partial unmap and deferred_split_folio() +======================================== Unmapping part of THP (with munmap() or other way) is not going to free memory immediately. Instead, we detect that a subpage of THP is not in use @@ -166,6 +166,6 @@ the place where we can detect partial unmap. It also might be counterproductive since in many cases partial unmap happens during exit(2) if a THP crosses a VMA boundary. -The function deferred_split_huge_page() is used to queue a page for splitting. +The function deferred_split_folio() is used to queue a folio for splitting. The splitting itself will happen when we get memory pressure via shrinker interface. 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b9978978a160..70bd867eba94 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -187,7 +187,7 @@ static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } -void deferred_split_huge_page(struct page *page); +void deferred_split_folio(struct folio *folio); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); @@ -340,7 +340,7 @@ static inline int split_huge_page(struct page *page) { return 0; } -static inline void deferred_split_huge_page(struct page *page) {} +static inline void deferred_split_folio(struct folio *folio) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c23b0e01734b..868fcccdff72 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2815,9 +2815,8 @@ void free_transhuge_page(struct page *page) free_compound_page(page); } -void deferred_split_huge_page(struct page *page) +void deferred_split_folio(struct folio *folio) { - struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG struct mem_cgroup *memcg = folio_memcg(folio); diff --git a/mm/rmap.c b/mm/rmap.c index 0020474f46c1..a079d9964b9c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1427,7 +1427,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, */ if (folio_test_pmd_mappable(folio) && folio_test_anon(folio)) if (!compound || nr < nr_pmdmapped) - deferred_split_huge_page(&folio->page); + deferred_split_folio(folio); } /* From 6a171c16e62f854e6a7e0f837dbe8f3ace0f00ce Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 11 Jan 2023 14:29:14 +0000 Subject: [PATCH 184/505] mm: remove the hugetlb field from struct page Patch series "Get rid of tail page fields". Continue the shrinkage of the struct page definition by getting rid of the 'first tail page' and 'second tail page' fields. I originally did this patch set before Hugh's rewrite of the subpages_mapcount, so it needed substantial updates; hope I didn't miss anything. This patch (of 28): commit dad6a5eb5556(mm,hugetlb: use folio fields in second tail page) added a transitional hugetlb field to struct page and struct folio to make room for another int in the first tail of a compound page. Hugetlb folio conversions have changed all page users of this field to use the fields within the folio so struct page no longer needs this hugetlb specific field. 
Link: https://lkml.kernel.org/r/20230111142915.1001531-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230111142915.1001531-29-willy@infradead.org Signed-off-by: Sidhartha Kumar Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7eb4d0815a78..452920467223 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -141,14 +141,6 @@ struct page { struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ }; - struct { /* Second tail page of hugetlb page */ - unsigned long _hugetlb_pad_1; /* compound_head */ - void *hugetlb_subpool; - void *hugetlb_cgroup; - void *hugetlb_cgroup_rsvd; - void *hugetlb_hwpoison; - /* No more space on 32-bit: use third tail if more */ - }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ @@ -399,10 +391,6 @@ FOLIO_MATCH(compound_head, _head_1); offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); -FOLIO_MATCH(hugetlb_subpool, _hugetlb_subpool); -FOLIO_MATCH(hugetlb_cgroup, _hugetlb_cgroup); -FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); -FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); #undef FOLIO_MATCH /* From f942b0f0528d1198b94b8211c84d4f28a654c0ff Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 11 Jan 2023 21:53:48 +0800 Subject: [PATCH 185/505] maple_tree: fix comment of mte_destroy_walk The parameter name of maple tree is mt, make the comment be mt instead of mn, and the separator between the parameter name and the description to be : instead of -. Link: https://lkml.kernel.org/r/20230111135348.803181-1-vernon2gm@gmail.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Vernon Yang Cc: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5be99550e36d..1c5d3b640a24 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5579,8 +5579,8 @@ free_leaf: /* * mte_destroy_walk() - Free a tree or sub-tree. - * @enode - the encoded maple node (maple_enode) to start - * @mn - the tree to free - needed for node types. + * @enode: the encoded maple node (maple_enode) to start + * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ From 82b249361f2d1627acc7dbbdab1c80246a02fd4a Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 11 Jan 2023 21:20:36 +0800 Subject: [PATCH 186/505] mm/mmap: fix comment of unmapped_area{_topdown} The low_limit of unmapped area information is inclusive, and the hight_limit is not, so make symbol to be [ instead of (. And replace hight_limit to high_limit. Link: https://lkml.kernel.org/r/20230111132036.801404-1-vernon2gm@gmail.com Fixes: 3499a13168da ("mm/mmap: use maple tree for unmapped_area{_topdown}") Signed-off-by: Vernon Yang Cc: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/mmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 0641e6e0016c..335ba3df9898 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1558,8 +1558,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) * the correct alignment and offset, all from @info. 
Note: current->mm is used * for the search. * - * @info: The unmapped area information including the range (low_limit - - * hight_limit), the alignment offset and mask. + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ @@ -1585,11 +1585,11 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) /** * unmapped_area_topdown() - Find an area between the low_limit and the - * high_limit with * the correct alignment and offset at the highest available + * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * - * @info: The unmapped area information including the range (low_limit - - * hight_limit), the alignment offset and mask. + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ From 1e15d374bb1cb95f738334018a216e4f6360e821 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 11 Jan 2023 11:18:06 +0100 Subject: [PATCH 187/505] Revert "x86: kmsan: sync metadata pages on page fault" This reverts commit 3f1e2c7a9099c1ed32c67f12cdf432ba782cf51f. As noticed by Qun-Wei Lin, arch_sync_kernel_mappings() in arch/x86/mm/fault.c is only used with CONFIG_X86_32, whereas KMSAN is only supported on x86_64, where this code is not compiled. The patch in question dates back to downstream KMSAN branch based on v5.8-rc5, it sneaked into upstream unnoticed in v6.1. Link: https://lkml.kernel.org/r/20230111101806.3236991-1-glider@google.com Signed-off-by: Alexander Potapenko Reported-by: Qun-Wei Lin Link: https://github.com/google/kmsan/issues/91 Cc: Andy Lutomirski Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Marco Elver Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/mm/fault.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7b0d4ab894c8..a498ae1fbe66 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -260,7 +260,7 @@ static noinline int vmalloc_fault(unsigned long address) } NOKPROBE_SYMBOL(vmalloc_fault); -static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; @@ -284,27 +284,6 @@ static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -void arch_sync_kernel_mappings(unsigned long start, unsigned long end) -{ - __arch_sync_kernel_mappings(start, end); -#ifdef CONFIG_KMSAN - /* - * KMSAN maintains two additional metadata page mappings for the - * [VMALLOC_START, VMALLOC_END) range. These mappings start at - * KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and - * have to be synced together with the vmalloc memory mapping. 
- */ - if (start >= VMALLOC_START && end < VMALLOC_END) { - __arch_sync_kernel_mappings( - start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START, - end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START); - __arch_sync_kernel_mappings( - start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START, - end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START); - } -#endif -} - static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; From 4c110ec98e39944732ec31bf0415f22632bae2b7 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:01 -0600 Subject: [PATCH 188/505] mm/memory-failure: convert __get_huge_page_for_hwpoison() to folios Patch series "convert hugepage memory failure functions to folios". This series contains a 1:1 straightforward page to folio conversion for memory failure functions which deal with huge pages. I renamed a few functions to fit with how other folio operating functions are named. These include: hugetlb_clear_page_hwpoison -> folio_clear_hugetlb_hwpoison free_raw_hwp_pages -> folio_free_raw_hwp __free_raw_hwp_pages -> __folio_free_raw_hwp hugetlb_set_page_hwpoison -> folio_set_hugetlb_hwpoison The goal of this series was to reduce users of the hugetlb specific page flag macros which take in a page so users are protected by the compiler to make sure they are operating on a head page. This patch (of 8): Use a folio throughout the function rather than using a head page. This also reduces the users of the page version of hugetlb specific page flags. Link: https://lkml.kernel.org/r/20230112204608.80136-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6bf07345ea2c..cca28f19ad99 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1807,20 +1807,20 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 2; /* fallback to normal page handling */ bool count_increased = false; - if (!PageHeadHuge(head)) + if (!folio_test_hugetlb(folio)) goto out; if (flags & MF_COUNT_INCREASED) { ret = 1; count_increased = true; - } else if (HPageFreed(head)) { + } else if (folio_test_hugetlb_freed(folio)) { ret = 0; - } else if (HPageMigratable(head)) { - ret = get_page_unless_zero(head); + } else if (folio_test_hugetlb_migratable(folio)) { + ret = folio_try_get(folio); if (ret) count_increased = true; } else { @@ -1829,24 +1829,24 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, goto out; } - if (hugetlb_set_page_hwpoison(head, page)) { + if (hugetlb_set_page_hwpoison(&folio->page, page)) { ret = -EHWPOISON; goto out; } /* - * Clearing HPageMigratable for hwpoisoned hugepages to prevent them + * Clearing hugetlb_migratable for hwpoisoned hugepages to prevent them * from being migrated by memory hotremove. 
*/ - if (count_increased && HPageMigratable(head)) { - ClearHPageMigratable(head); + if (count_increased && folio_test_hugetlb_migratable(folio)) { + folio_clear_hugetlb_migratable(folio); *migratable_cleared = true; } return ret; out: if (count_increased) - put_page(head); + folio_put(folio); return ret; } From bc1cfde194675215857755b75e5fe90f6a654843 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:02 -0600 Subject: [PATCH 189/505] mm/memory-failure: convert try_memory_failure_hugetlb() to folios Use a struct folio rather than a head page in try_memory_failure_hugetlb. This converts one user of SetHPageMigratable to the folio equivalent. Link: https://lkml.kernel.org/r/20230112204608.80136-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index cca28f19ad99..19f6035608d5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1860,7 +1860,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb { int res; struct page *p = pfn_to_page(pfn); - struct page *head; + struct folio *folio; unsigned long page_flags; bool migratable_cleared = false; @@ -1873,8 +1873,8 @@ retry: } else if (res == -EHWPOISON) { pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { - head = compound_head(p); - res = kill_accessing_process(current, page_to_pfn(head), flags); + folio = page_folio(p); + res = kill_accessing_process(current, folio_pfn(folio), flags); } return res; } else if (res == -EBUSY) { @@ -1885,16 +1885,16 @@ retry: return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); } - head = compound_head(p); - lock_page(head); + folio = page_folio(p); + folio_lock(folio); if (hwpoison_filter(p)) { - hugetlb_clear_page_hwpoison(head); + hugetlb_clear_page_hwpoison(&folio->page); if (migratable_cleared) - SetHPageMigratable(head); - unlock_page(head); + folio_set_hugetlb_migratable(folio); + folio_unlock(folio); if (res == 1) - put_page(head); + folio_put(folio); return -EOPNOTSUPP; } @@ -1903,7 +1903,7 @@ retry: * or demotion can be prevented by PageHWPoison flag. */ if (res == 0) { - unlock_page(head); + folio_unlock(folio); if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; @@ -1913,10 +1913,10 @@ retry: return action_result(pfn, MF_MSG_FREE_HUGE, res); } - page_flags = head->flags; + page_flags = folio->flags; - if (!hwpoison_user_mappings(p, pfn, flags, head)) { - unlock_page(head); + if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) { + folio_unlock(folio); return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } From 2ff6cecee669bf0fc63eadebac8cfc81f74b9a4c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:03 -0600 Subject: [PATCH 190/505] mm/memory-failure: convert hugetlb_clear_page_hwpoison to folios Change hugetlb_clear_page_hwpoison() to folio_clear_hugetlb_hwpoison() by changing the function to take in a folio. This converts one use of ClearPageHWPoison and HPageRawHwpUnreliable to their folio equivalents. 
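As a general illustration of the calling-convention change running through these memory-failure conversions, a page-based caller now derives the folio once and stays in folio terms (sketch only; the helper name is illustrative):

/* Illustrative only: convert at the boundary, then use the folio API. */
static void example_clear(struct page *page)
{
        struct folio *folio = page_folio(page);

        if (folio_test_hwpoison(folio))
                folio_clear_hugetlb_hwpoison(folio);
}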
Link: https://lkml.kernel.org/r/20230112204608.80136-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 2 +- mm/memory-failure.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d6413c3b8f5..cf60fe741c1d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -878,9 +878,9 @@ extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE -extern void hugetlb_clear_page_hwpoison(struct page *hpage); +extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else -static inline void hugetlb_clear_page_hwpoison(struct page *hpage) +static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ca9e177b9c54..291ad4cb02f9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1731,7 +1731,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * which makes any healthy subpages reusable. */ if (unlikely(folio_test_hwpoison(folio))) - hugetlb_clear_page_hwpoison(&folio->page); + folio_clear_hugetlb_hwpoison(folio); for (i = 0; i < pages_per_huge_page(h); i++) { subpage = folio_page(folio, i); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 19f6035608d5..d4aaed2756af 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1785,12 +1785,12 @@ static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag) return __free_raw_hwp_pages(hpage, move_flag); } -void hugetlb_clear_page_hwpoison(struct page *hpage) +void folio_clear_hugetlb_hwpoison(struct folio *folio) { - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - ClearPageHWPoison(hpage); - free_raw_hwp_pages(hpage, true); + folio_clear_hwpoison(folio); + free_raw_hwp_pages(&folio->page, true); } /* @@ -1889,7 +1889,7 @@ retry: folio_lock(folio); if (hwpoison_filter(p)) { - hugetlb_clear_page_hwpoison(&folio->page); + folio_clear_hugetlb_hwpoison(folio); if (migratable_cleared) folio_set_hugetlb_migratable(folio); folio_unlock(folio); From 9637d7dfb19ce934f81cd56cde23573759c73afb Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:04 -0600 Subject: [PATCH 191/505] mm/memory-failure: convert free_raw_hwp_pages() to folios Change free_raw_hwp_pages() to folio_free_raw_hwp(), converts two users of hugetlb specific page macro users to their folio equivalents. Link: https://lkml.kernel.org/r/20230112204608.80136-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d4aaed2756af..a2835907caf8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1766,23 +1766,23 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) return ret; } -static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag) +static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag) { /* - * HPageVmemmapOptimized hugepages can't be freed because struct + * hugetlb_vmemmap_optimized hugepages can't be freed because struct * pages for tail pages are required but they don't exist. 
*/ - if (move_flag && HPageVmemmapOptimized(hpage)) + if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio)) return 0; /* - * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by + * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by * definition. */ - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return 0; - return __free_raw_hwp_pages(hpage, move_flag); + return __free_raw_hwp_pages(&folio->page, move_flag); } void folio_clear_hugetlb_hwpoison(struct folio *folio) @@ -1790,7 +1790,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; folio_clear_hwpoison(folio); - free_raw_hwp_pages(&folio->page, true); + folio_free_raw_hwp(folio, true); } /* @@ -1929,7 +1929,7 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int * return 0; } -static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag) +static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag) { return 0; } @@ -2336,6 +2336,7 @@ core_initcall(memory_failure_init); int unpoison_memory(unsigned long pfn) { struct page *page; + struct folio *folio; struct page *p; int ret = -EBUSY; unsigned long count = 1; @@ -2348,6 +2349,7 @@ int unpoison_memory(unsigned long pfn) p = pfn_to_page(pfn); page = compound_head(p); + folio = page_folio(p); mutex_lock(&mf_mutex); @@ -2389,7 +2391,7 @@ int unpoison_memory(unsigned long pfn) if (!ret) { if (PageHuge(p)) { huge = true; - count = free_raw_hwp_pages(page, false); + count = folio_free_raw_hwp(folio, false); if (count == 0) { ret = -EBUSY; goto unlock_mutex; @@ -2405,7 +2407,7 @@ int unpoison_memory(unsigned long pfn) } else { if (PageHuge(p)) { huge = true; - count = free_raw_hwp_pages(page, false); + count = folio_free_raw_hwp(folio, false); if (count == 0) { ret = -EBUSY; put_page(page); From b02e7582ef245e9694fff6aee8e95fd1764cc5ee Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:05 -0600 Subject: [PATCH 192/505] mm/memory-failure: convert raw_hwp_list_head() to folios Change raw_hwp_list_head() to take in a folio and modify its callers to pass in a folio. Also converts two users of hugetlb specific page macro users to their folio equivalents. 
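For orientation, the per-folio list this helper returns is walked as in the sketch below (based on the __folio_free_raw_hwp() pattern visible in the hunks; freeing of the entries is elided):

/* Illustrative only: the list head overlays folio->_hugetlb_hwpoison. */
struct llist_head *head = raw_hwp_list_head(folio);
struct llist_node *t, *tnode;
unsigned long count = 0;

llist_for_each_safe(tnode, t, head->first) {
        struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);

        /* p->page identifies one poisoned subpage of this hugetlb folio */
        count++;
}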
Link: https://lkml.kernel.org/r/20230112204608.80136-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a2835907caf8..6ae5f234bcc1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1695,9 +1695,9 @@ struct raw_hwp_page { struct page *page; }; -static inline struct llist_head *raw_hwp_list_head(struct page *hpage) +static inline struct llist_head *raw_hwp_list_head(struct folio *folio) { - return (struct llist_head *)&page_folio(hpage)->_hugetlb_hwpoison; + return (struct llist_head *)&folio->_hugetlb_hwpoison; } static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) @@ -1705,8 +1705,9 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) struct llist_head *head; struct llist_node *t, *tnode; unsigned long count = 0; + struct folio *folio = page_folio(hpage); - head = raw_hwp_list_head(hpage); + head = raw_hwp_list_head(folio); llist_for_each_safe(tnode, t, head->first) { struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); @@ -1727,15 +1728,16 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) struct raw_hwp_page *raw_hwp; struct llist_node *t, *tnode; int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0; + struct folio *folio = page_folio(hpage); /* * Once the hwpoison hugepage has lost reliable raw error info, * there is little meaning to keep additional error info precisely, * so skip to add additional raw error info. */ - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return -EHWPOISON; - head = raw_hwp_list_head(hpage); + head = raw_hwp_list_head(folio); llist_for_each_safe(tnode, t, head->first) { struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); @@ -1756,9 +1758,9 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) * hwpoisoned subpages, and we need refuse to free/dissolve * this hwpoisoned hugepage. */ - SetHPageRawHwpUnreliable(hpage); + folio_set_hugetlb_raw_hwp_unreliable(folio); /* - * Once HPageRawHwpUnreliable is set, raw_hwp_page is not + * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not * used any more, so free it. */ __free_raw_hwp_pages(hpage, false); From 0858b5eb3aab2de0662a40901699162519628f6e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:06 -0600 Subject: [PATCH 193/505] mm/memory-failure: convert __free_raw_hwp_pages() to folios Change __free_raw_hwp_pages() to __folio_free_raw_hwp() and modify its callers to pass in a folio. 
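
Where a page-based entry point has to survive for a while, this series uses the thin-wrapper idiom (compare release_pte_page() in a later patch). Purely as a generic sketch of that idiom around the new internal helper, with a hypothetical wrapper name that is not part of this patch:

    /* Hypothetical transitional wrapper; real callers pass folios directly. */
    static unsigned long page_free_raw_hwp(struct page *hpage, bool move_flag)
    {
        return __folio_free_raw_hwp(page_folio(hpage), move_flag);
    }
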
Link: https://lkml.kernel.org/r/20230112204608.80136-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6ae5f234bcc1..387fb9a9ee4e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1700,12 +1700,11 @@ static inline struct llist_head *raw_hwp_list_head(struct folio *folio) return (struct llist_head *)&folio->_hugetlb_hwpoison; } -static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) +static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { struct llist_head *head; struct llist_node *t, *tnode; unsigned long count = 0; - struct folio *folio = page_folio(hpage); head = raw_hwp_list_head(folio); llist_for_each_safe(tnode, t, head->first) { @@ -1763,7 +1762,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not * used any more, so free it. */ - __free_raw_hwp_pages(hpage, false); + __folio_free_raw_hwp(folio, false); } return ret; } @@ -1784,7 +1783,7 @@ static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return 0; - return __free_raw_hwp_pages(&folio->page, move_flag); + return __folio_free_raw_hwp(folio, move_flag); } void folio_clear_hugetlb_hwpoison(struct folio *folio) From 595dd8185cf1db248b2be4c65ec8936de6ac87c1 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:07 -0600 Subject: [PATCH 194/505] mm/memory-failure: convert hugetlb_set_page_hwpoison() to folios Change hugetlb_set_page_hwpoison() to folio_set_hugetlb_hwpoison() and use a folio internally. Link: https://lkml.kernel.org/r/20230112204608.80136-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 387fb9a9ee4e..3821d64e2b2a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1721,13 +1721,12 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) return count; } -static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) { struct llist_head *head; struct raw_hwp_page *raw_hwp; struct llist_node *t, *tnode; - int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0; - struct folio *folio = page_folio(hpage); + int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0; /* * Once the hwpoison hugepage has lost reliable raw error info, @@ -1830,7 +1829,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, goto out; } - if (hugetlb_set_page_hwpoison(&folio->page, page)) { + if (folio_set_hugetlb_hwpoison(folio, page)) { ret = -EHWPOISON; goto out; } From a6fddef49eef2cf68c23e91d73d6a6d5e2cd448f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 12 Jan 2023 14:46:08 -0600 Subject: [PATCH 195/505] mm/memory-failure: convert unpoison_memory() to folios Use a folio inside unpoison_memory which replaces a compound_head() call with a call to page_folio(). 
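
The shape of the change is small but repeated across the function: the folio obtained once from page_folio() replaces every compound_head()-based page test. A minimal sketch of that pattern follows; the helper name is made up, while pfn_to_page(), page_folio(), folio_ref_count() and folio_mapped() are the existing interfaces:

    /* Illustrative only: one folio lookup subsumes compound_head(). */
    static bool hwpoison_folio_busy(unsigned long pfn)
    {
        struct folio *folio = page_folio(pfn_to_page(pfn));

        return folio_ref_count(folio) > 1 || folio_mapped(folio);
    }
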
Link: https://lkml.kernel.org/r/20230112204608.80136-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3821d64e2b2a..ba0bbfc074ee 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2335,7 +2335,6 @@ core_initcall(memory_failure_init); */ int unpoison_memory(unsigned long pfn) { - struct page *page; struct folio *folio; struct page *p; int ret = -EBUSY; @@ -2348,7 +2347,6 @@ int unpoison_memory(unsigned long pfn) return -ENXIO; p = pfn_to_page(pfn); - page = compound_head(p); folio = page_folio(p); mutex_lock(&mf_mutex); @@ -2360,31 +2358,31 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (!PageHWPoison(p)) { + if (!folio_test_hwpoison(folio)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (page_count(page) > 1) { + if (folio_ref_count(folio) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (page_mapped(page)) { + if (folio_mapped(folio)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (page_mapping(page)) { + if (folio_mapping(folio)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (PageSlab(page) || PageTable(page) || PageReserved(page)) + if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio)) goto unlock_mutex; ret = get_hwpoison_page(p, MF_UNPOISON); @@ -2397,7 +2395,7 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } } - ret = TestClearPageHWPoison(page) ? 0 : -EBUSY; + ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY; } else if (ret < 0) { if (ret == -EHWPOISON) { ret = put_page_back_buddy(p) ? 0 : -EBUSY; @@ -2410,14 +2408,14 @@ int unpoison_memory(unsigned long pfn) count = folio_free_raw_hwp(folio, false); if (count == 0) { ret = -EBUSY; - put_page(page); + folio_put(folio); goto unlock_mutex; } } - put_page(page); + folio_put(folio); if (TestClearPageHWPoison(p)) { - put_page(page); + folio_put(folio); ret = 0; } } From 69bbb87b3f144e4778f028fd85992aa8dea6ff28 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 Jan 2023 13:10:31 +0000 Subject: [PATCH 196/505] shmem: convert shmem_write_end() to use a folio Use a folio internally to shmem_write_end() which saves a number of calls to compound_head() and lets us get rid of the custom code to zero out the rest of a THP and supports folios of arbitrary size. 
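
The arbitrary-size support comes from doing the arithmetic in folio units: for a short copy into a folio that is not yet uptodate, the uncopied bytes on both sides are zeroed in a single call. The helper below is only a distilled sketch of that logic, built from offset_in_folio(), folio_zero_segments() and folio_size():

    /* Sketch: zero everything in @folio except the @copied bytes at @pos. */
    static void shmem_zero_partial_write(struct folio *folio, loff_t pos,
                                         size_t copied)
    {
        size_t from = offset_in_folio(folio, pos);

        folio_zero_segments(folio, 0, from, from + copied, folio_size(folio));
    }
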
Link: https://lkml.kernel.org/r/20230112131031.1209553-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index bc5c156ef470..c5048c6c83dd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2578,33 +2578,23 @@ shmem_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - if (!PageUptodate(page)) { - struct page *head = compound_head(page); - if (PageTransCompound(page)) { - int i; - - for (i = 0; i < HPAGE_PMD_NR; i++) { - if (head + i == page) - continue; - clear_highpage(head + i); - flush_dcache_page(head + i); - } + if (!folio_test_uptodate(folio)) { + if (copied < folio_size(folio)) { + size_t from = offset_in_folio(folio, pos); + folio_zero_segments(folio, 0, from, + from + copied, folio_size(folio)); } - if (copied < PAGE_SIZE) { - unsigned from = pos & (PAGE_SIZE - 1); - zero_user_segments(page, 0, from, - from + copied, PAGE_SIZE); - } - SetPageUptodate(head); + folio_mark_uptodate(folio); } - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } From 4947ed93c23218c40fb32226b895d8733cd2e05f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 12 Jan 2023 20:40:28 +0800 Subject: [PATCH 197/505] mm: madvise: use vm_normal_folio() in madvise_free_pte_range() There is already a vm_normal_folio(), use it to make madvise_free_pte_range() only use a folio. Link: https://lkml.kernel.org/r/20230112124028.16964-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/madvise.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 5296e78dccda..92a3c6bd84c1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -617,7 +617,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; pte_t *orig_pte, *pte, ptent; struct folio *folio; - struct page *page; int nr_swap = 0; unsigned long next; @@ -658,10 +657,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = vm_normal_page(vma, addr, ptent); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, ptent); + if (!folio || folio_is_zone_device(folio)) continue; - folio = page_folio(page); /* * If pmd isn't transhuge but the folio is large and From 8115612883978069fee8793f873a627ff5868718 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:28 +0000 Subject: [PATCH 198/505] mm: pagevec: add folio_batch_reinit() Patch series "update mlock to use folios", v4. This series updates mlock to use folios, converting the internal interface to using folios exclusively and exposing the folio interface externally. As a product of this we move to using a folio batch rather than a pagevec for mlock folios, which brings it in line with the core folio batches contained in mm/swap.c. This patch (of 5): This performs the same task as pagevec_reinit(), only modifying a folio batch rather than a pagevec. 
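
Like pagevec_reinit(), it only resets the count and deliberately leaves percpu_pvec_drained untouched, so a full batch can be consumed and then refilled in place. A rough usage sketch, with a hypothetical drain function; folio_batch_count(), the folios[] array and folio_put() are the existing interfaces:

    /* Hypothetical consumer: drop every folio, then reuse the batch. */
    static void process_and_recycle(struct folio_batch *fbatch)
    {
        unsigned int i;

        /* Consume every folio currently held in the batch... */
        for (i = 0; i < folio_batch_count(fbatch); i++)
            folio_put(fbatch->folios[i]);

        /* ...then make the batch reusable without re-initialising it. */
        folio_batch_reinit(fbatch);
    }
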
Link: https://lkml.kernel.org/r/cover.1673526881.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/9018cecacb39e34c883540f997f9be8281153613.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 215eb6c3bdc9..2a6f61a0c10a 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -103,6 +103,11 @@ static inline void folio_batch_init(struct folio_batch *fbatch) fbatch->percpu_pvec_drained = false; } +static inline void folio_batch_reinit(struct folio_batch *fbatch) +{ + fbatch->nr = 0; +} + static inline unsigned int folio_batch_count(struct folio_batch *fbatch) { return fbatch->nr; From 90d07210ab55e458c87048e1ad55582ecff0a3d5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:29 +0000 Subject: [PATCH 199/505] mm: mlock: use folios and a folio batch internally This brings mlock in line with the folio batches declared in mm/swap.c and makes the code more consistent across the two. The existing mechanism for identifying which operation each folio in the batch is undergoing is maintained, i.e. using the lower 2 bits of the struct folio address (previously struct page address). This should continue to function correctly as folios remain at least system word-aligned. All invocations of mlock() pass either a non-compound page or the head of a THP-compound page and no tail pages need updating so this functionality works with struct folios being used internally rather than struct pages. In this patch the external interface is kept identical to before in order to maintain separation between patches in the series, using a rather awkward conversion from struct page to struct folio in relevant functions. However, this maintenance of the existing interface is intended to be temporary - the next patch in the series will update the interfaces to accept folios directly. Link: https://lkml.kernel.org/r/9f894d54d568773f4ed3cb0eef5f8932f62c95f4.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/mlock.c | 246 +++++++++++++++++++++++++++-------------------------- 1 file changed, 124 insertions(+), 122 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 7032f6dd0ce1..f8e8d30ab08a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -28,12 +28,12 @@ #include "internal.h" -struct mlock_pvec { +struct mlock_fbatch { local_lock_t lock; - struct pagevec vec; + struct folio_batch fbatch; }; -static DEFINE_PER_CPU(struct mlock_pvec, mlock_pvec) = { +static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = { .lock = INIT_LOCAL_LOCK(lock), }; @@ -48,192 +48,192 @@ bool can_do_mlock(void) EXPORT_SYMBOL(can_do_mlock); /* - * Mlocked pages are marked with PageMlocked() flag for efficient testing + * Mlocked folios are marked with the PG_mlocked flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate * statistics. * - * An mlocked page [PageMlocked(page)] is unevictable. 
As such, it will - * be placed on the LRU "unevictable" list, rather than the [in]active lists. - * The unevictable list is an LRU sibling list to the [in]active lists. - * PageUnevictable is set to indicate the unevictable state. + * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it + * will be ostensibly placed on the LRU "unevictable" list (actually no such + * list exists), rather than the [in]active lists. PG_unevictable is set to + * indicate the unevictable state. */ -static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) +static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec) { /* There is nothing more we can do while it's off LRU */ - if (!TestClearPageLRU(page)) + if (!folio_test_clear_lru(folio)) return lruvec; - lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); - if (unlikely(page_evictable(page))) { + if (unlikely(folio_evictable(folio))) { /* - * This is a little surprising, but quite possible: - * PageMlocked must have got cleared already by another CPU. - * Could this page be on the Unevictable LRU? I'm not sure, - * but move it now if so. + * This is a little surprising, but quite possible: PG_mlocked + * must have got cleared already by another CPU. Could this + * folio be unevictable? I'm not sure, but move it now if so. */ - if (PageUnevictable(page)) { - del_page_from_lru_list(page, lruvec); - ClearPageUnevictable(page); - add_page_to_lru_list(page, lruvec); + if (folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_unevictable(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGRESCUED, - thp_nr_pages(page)); + folio_nr_pages(folio)); } goto out; } - if (PageUnevictable(page)) { - if (PageMlocked(page)) - page->mlock_count++; + if (folio_test_unevictable(folio)) { + if (folio_test_mlocked(folio)) + folio->mlock_count++; goto out; } - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - SetPageUnevictable(page); - page->mlock_count = !!PageMlocked(page); - add_page_to_lru_list(page, lruvec); - __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_set_unevictable(folio); + folio->mlock_count = !!folio_test_mlocked(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: - SetPageLRU(page); + folio_set_lru(folio); return lruvec; } -static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) +static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec) { - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); /* As above, this is a little surprising, but possible */ - if (unlikely(page_evictable(page))) + if (unlikely(folio_evictable(folio))) goto out; - SetPageUnevictable(page); - page->mlock_count = !!PageMlocked(page); - __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); + folio_set_unevictable(folio); + folio->mlock_count = !!folio_test_mlocked(folio); + __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: - add_page_to_lru_list(page, lruvec); - SetPageLRU(page); + lruvec_add_folio(lruvec, folio); + folio_set_lru(folio); return lruvec; } -static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) +static struct 
lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec) { - int nr_pages = thp_nr_pages(page); + int nr_pages = folio_nr_pages(folio); bool isolated = false; - if (!TestClearPageLRU(page)) + if (!folio_test_clear_lru(folio)) goto munlock; isolated = true; - lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); - if (PageUnevictable(page)) { + if (folio_test_unevictable(folio)) { /* Then mlock_count is maintained, but might undercount */ - if (page->mlock_count) - page->mlock_count--; - if (page->mlock_count) + if (folio->mlock_count) + folio->mlock_count--; + if (folio->mlock_count) goto out; } /* else assume that was the last mlock: reclaim will fix it if not */ munlock: - if (TestClearPageMlocked(page)) { - __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (isolated || !PageUnevictable(page)) + if (folio_test_clear_mlocked(folio)) { + __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); + if (isolated || !folio_test_unevictable(folio)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } - /* page_evictable() has to be checked *after* clearing Mlocked */ - if (isolated && PageUnevictable(page) && page_evictable(page)) { - del_page_from_lru_list(page, lruvec); - ClearPageUnevictable(page); - add_page_to_lru_list(page, lruvec); + /* folio_evictable() has to be checked *after* clearing Mlocked */ + if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_unevictable(folio); + lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: if (isolated) - SetPageLRU(page); + folio_set_lru(folio); return lruvec; } /* - * Flags held in the low bits of a struct page pointer on the mlock_pvec. + * Flags held in the low bits of a struct folio pointer on the mlock_fbatch. */ -#define LRU_PAGE 0x1 -#define NEW_PAGE 0x2 -static inline struct page *mlock_lru(struct page *page) +#define LRU_FOLIO 0x1 +#define NEW_FOLIO 0x2 +static inline struct folio *mlock_lru(struct folio *folio) { - return (struct page *)((unsigned long)page + LRU_PAGE); + return (struct folio *)((unsigned long)folio + LRU_FOLIO); } -static inline struct page *mlock_new(struct page *page) +static inline struct folio *mlock_new(struct folio *folio) { - return (struct page *)((unsigned long)page + NEW_PAGE); + return (struct folio *)((unsigned long)folio + NEW_FOLIO); } /* - * mlock_pagevec() is derived from pagevec_lru_move_fn(): - * perhaps that can make use of such page pointer flags in future, - * but for now just keep it for mlock. We could use three separate - * pagevecs instead, but one feels better (munlocking a full pagevec - * does not need to drain mlocking pagevecs first). + * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can + * make use of such folio pointer flags in future, but for now just keep it for + * mlock. We could use three separate folio batches instead, but one feels + * better (munlocking a full folio batch does not need to drain mlocking folio + * batches first). 
*/ -static void mlock_pagevec(struct pagevec *pvec) +static void mlock_folio_batch(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; unsigned long mlock; - struct page *page; + struct folio *folio; int i; - for (i = 0; i < pagevec_count(pvec); i++) { - page = pvec->pages[i]; - mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); - page = (struct page *)((unsigned long)page - mlock); - pvec->pages[i] = page; + for (i = 0; i < folio_batch_count(fbatch); i++) { + folio = fbatch->folios[i]; + mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO); + folio = (struct folio *)((unsigned long)folio - mlock); + fbatch->folios[i] = folio; - if (mlock & LRU_PAGE) - lruvec = __mlock_page(page, lruvec); - else if (mlock & NEW_PAGE) - lruvec = __mlock_new_page(page, lruvec); + if (mlock & LRU_FOLIO) + lruvec = __mlock_folio(folio, lruvec); + else if (mlock & NEW_FOLIO) + lruvec = __mlock_new_folio(folio, lruvec); else - lruvec = __munlock_page(page, lruvec); + lruvec = __munlock_folio(folio, lruvec); } if (lruvec) unlock_page_lruvec_irq(lruvec); - release_pages(pvec->pages, pvec->nr); - pagevec_reinit(pvec); + release_pages(fbatch->folios, fbatch->nr); + folio_batch_reinit(fbatch); } void mlock_page_drain_local(void) { - struct pagevec *pvec; + struct folio_batch *fbatch; - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); - if (pagevec_count(pvec)) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); + if (folio_batch_count(fbatch)) + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } void mlock_page_drain_remote(int cpu) { - struct pagevec *pvec; + struct folio_batch *fbatch; WARN_ON_ONCE(cpu_online(cpu)); - pvec = &per_cpu(mlock_pvec.vec, cpu); - if (pagevec_count(pvec)) - mlock_pagevec(pvec); + fbatch = &per_cpu(mlock_fbatch.fbatch, cpu); + if (folio_batch_count(fbatch)) + mlock_folio_batch(fbatch); } bool need_mlock_page_drain(int cpu) { - return pagevec_count(&per_cpu(mlock_pvec.vec, cpu)); + return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } /** @@ -242,10 +242,10 @@ bool need_mlock_page_drain(int cpu) */ void mlock_folio(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (!folio_test_set_mlocked(folio)) { int nr_pages = folio_nr_pages(folio); @@ -255,10 +255,10 @@ void mlock_folio(struct folio *folio) } folio_get(folio); - if (!pagevec_add(pvec, mlock_lru(&folio->page)) || + if (!folio_batch_add(fbatch, mlock_lru(folio)) || folio_test_large(folio) || lru_cache_disabled()) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } /** @@ -267,20 +267,22 @@ void mlock_folio(struct folio *folio) */ void mlock_new_page(struct page *page) { - struct pagevec *pvec; - int nr_pages = thp_nr_pages(page); + struct folio_batch *fbatch; + struct folio *folio = page_folio(page); + int nr_pages = folio_nr_pages(folio); - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); - SetPageMlocked(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); + folio_set_mlocked(folio); + + zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - get_page(page); - if (!pagevec_add(pvec, 
mlock_new(page)) || - PageHead(page) || lru_cache_disabled()) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + folio_get(folio); + if (!folio_batch_add(fbatch, mlock_new(folio)) || + folio_test_large(folio) || lru_cache_disabled()) + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } /** @@ -289,20 +291,20 @@ void mlock_new_page(struct page *page) */ void munlock_page(struct page *page) { - struct pagevec *pvec; + struct folio_batch *fbatch; + struct folio *folio = page_folio(page); - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); /* - * TestClearPageMlocked(page) must be left to __munlock_page(), - * which will check whether the page is multiply mlocked. + * folio_test_clear_mlocked(folio) must be left to __munlock_folio(), + * which will check whether the folio is multiply mlocked. */ - - get_page(page); - if (!pagevec_add(pvec, page) || - PageHead(page) || lru_cache_disabled()) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + folio_get(folio); + if (!folio_batch_add(fbatch, folio) || + folio_test_large(folio) || lru_cache_disabled()) + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, From b213ef6b72b5f1f9d333f8aca05440db4e302b46 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:30 +0000 Subject: [PATCH 200/505] m68k/mm/motorola: specify pmd_page() type Failing to specify a specific type here breaks anything that relies on the type being explicitly known, such as page_folio(). Make explicit the type of null pointer returned here. Link: https://lkml.kernel.org/r/ad6be2821bbd6af10966b3704568ff458b270d9c.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Geert Uytterhoeven Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- arch/m68k/include/asm/motorola_pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 7ac3d64c6b33..562b54e09850 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -124,7 +124,7 @@ static inline void pud_set(pud_t *pudp, pmd_t *pmdp) * expects pmd_page() to exists, only to then DCE it all. Provide a dummy to * make the compiler happy. */ -#define pmd_page(pmd) NULL +#define pmd_page(pmd) ((struct page *)NULL) #define pud_none(pud) (!pud_val(pud)) From 96f97c438f61ddba94117dcd1a1eb0aaafa22309 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:31 +0000 Subject: [PATCH 201/505] mm: mlock: update the interface to use folios Update the mlock interface to accept folios rather than pages, bringing the interface in line with the internal implementation. munlock_vma_page() still requires a page_folio() conversion, however this is consistent with the existent mlock_vma_page() implementation and a product of rmap still dealing in pages rather than folios. Link: https://lkml.kernel.org/r/cba12777c5544305014bc0cbec56bb4cc71477d8.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. 
Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/internal.h | 38 ++++++++++++++++++++++---------------- mm/migrate.c | 2 +- mm/mlock.c | 38 ++++++++++++++++++-------------------- mm/page_alloc.c | 2 +- mm/rmap.c | 4 ++-- mm/swap.c | 10 +++++----- 6 files changed, 49 insertions(+), 45 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 583e15357e09..973b48e8b1af 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -533,10 +533,9 @@ extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * - * mlock is usually called at the end of page_add_*_rmap(), - * munlock at the end of page_remove_rmap(); but new anon - * pages are managed by lru_cache_add_inactive_or_unevictable() - * calling mlock_new_page(). + * mlock is usually called at the end of page_add_*_rmap(), munlock at + * the end of page_remove_rmap(); but new anon folios are managed by + * folio_add_lru_vma() calling mlock_new_folio(). * * @compound is used to include pmd mappings of THPs, but filter out * pte mappings of THPs, which cannot be consistently counted: a pte @@ -565,18 +564,25 @@ static inline void mlock_vma_page(struct page *page, mlock_vma_folio(page_folio(page), vma, compound); } -void munlock_page(struct page *page); -static inline void munlock_vma_page(struct page *page, +void munlock_folio(struct folio *folio); + +static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { if (unlikely(vma->vm_flags & VM_LOCKED) && - (compound || !PageTransCompound(page))) - munlock_page(page); + (compound || !folio_test_large(folio))) + munlock_folio(folio); } -void mlock_new_page(struct page *page); -bool need_mlock_page_drain(int cpu); -void mlock_page_drain_local(void); -void mlock_page_drain_remote(int cpu); + +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + munlock_vma_folio(page_folio(page), vma, compound); +} +void mlock_new_folio(struct folio *folio); +bool need_mlock_drain(int cpu); +void mlock_drain_local(void); +void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -665,10 +671,10 @@ static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } -static inline void mlock_new_page(struct page *page) { } -static inline bool need_mlock_page_drain(int cpu) { return false; } -static inline void mlock_page_drain_local(void) { } -static inline void mlock_page_drain_remote(int cpu) { } +static inline void mlock_new_folio(struct folio *folio) { } +static inline bool need_mlock_drain(int cpu) { return false; } +static inline void mlock_drain_local(void) { } +static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/migrate.c b/mm/migrate.c index 98de7ce2b576..206fcdbe67f3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -265,7 +265,7 @@ static bool remove_migration_pte(struct folio *folio, set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); trace_remove_migration_pte(pvmw.address, pte_val(pte), compound_order(new)); diff --git a/mm/mlock.c b/mm/mlock.c index 
f8e8d30ab08a..9e9c8be58277 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -210,7 +210,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) folio_batch_reinit(fbatch); } -void mlock_page_drain_local(void) +void mlock_drain_local(void) { struct folio_batch *fbatch; @@ -221,7 +221,7 @@ void mlock_page_drain_local(void) local_unlock(&mlock_fbatch.lock); } -void mlock_page_drain_remote(int cpu) +void mlock_drain_remote(int cpu) { struct folio_batch *fbatch; @@ -231,7 +231,7 @@ void mlock_page_drain_remote(int cpu) mlock_folio_batch(fbatch); } -bool need_mlock_page_drain(int cpu) +bool need_mlock_drain(int cpu) { return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } @@ -262,13 +262,12 @@ void mlock_folio(struct folio *folio) } /** - * mlock_new_page - mlock a newly allocated page not yet on LRU - * @page: page to be mlocked, either a normal page or a THP head. + * mlock_new_folio - mlock a newly allocated folio not yet on LRU + * @folio: folio to be mlocked, either normal or a THP head. */ -void mlock_new_page(struct page *page) +void mlock_new_folio(struct folio *folio) { struct folio_batch *fbatch; - struct folio *folio = page_folio(page); int nr_pages = folio_nr_pages(folio); local_lock(&mlock_fbatch.lock); @@ -286,13 +285,12 @@ void mlock_new_page(struct page *page) } /** - * munlock_page - munlock a page - * @page: page to be munlocked, either a normal page or a THP head. + * munlock_folio - munlock a folio + * @folio: folio to be munlocked, either normal or a THP head. */ -void munlock_page(struct page *page) +void munlock_folio(struct folio *folio) { struct folio_batch *fbatch; - struct folio *folio = page_folio(page); local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); @@ -314,7 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -322,11 +320,11 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, goto out; if (is_huge_zero_pmd(*pmd)) goto out; - page = pmd_page(*pmd); + folio = page_folio(pmd_page(*pmd)); if (vma->vm_flags & VM_LOCKED) - mlock_folio(page_folio(page)); + mlock_folio(folio); else - munlock_page(page); + munlock_folio(folio); goto out; } @@ -334,15 +332,15 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, *pte); + if (!folio || folio_is_zone_device(folio)) continue; - if (PageTransCompound(page)) + if (folio_test_large(folio)) continue; if (vma->vm_flags & VM_LOCKED) - mlock_folio(page_folio(page)); + mlock_folio(folio); else - munlock_page(page); + munlock_folio(folio); } pte_unmap(start_pte); out: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 88494e82843d..83be3b571fd0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8587,7 +8587,7 @@ static int page_alloc_cpu_dead(unsigned int cpu) struct zone *zone; lru_add_drain_cpu(cpu); - mlock_page_drain_remote(cpu); + mlock_drain_remote(cpu); drain_pages(cpu); /* diff --git a/mm/rmap.c b/mm/rmap.c index a079d9964b9c..073999f78adf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1764,7 +1764,7 @@ discard: */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); 
folio_put(folio); } @@ -2105,7 +2105,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); folio_put(folio); } diff --git a/mm/swap.c b/mm/swap.c index e54e2a252e27..42d67f9baa8c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -562,7 +562,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) - mlock_new_page(&folio->page); + mlock_new_folio(folio); else folio_add_lru(folio); } @@ -781,7 +781,7 @@ void lru_add_drain(void) local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); - mlock_page_drain_local(); + mlock_drain_local(); } /* @@ -796,7 +796,7 @@ static void lru_add_and_bh_lrus_drain(void) lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); - mlock_page_drain_local(); + mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) @@ -805,7 +805,7 @@ void lru_add_drain_cpu_zone(struct zone *zone) lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); - mlock_page_drain_local(); + mlock_drain_local(); } #ifdef CONFIG_SMP @@ -828,7 +828,7 @@ static bool cpu_needs_drain(unsigned int cpu) folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->activate) || - need_mlock_page_drain(cpu) || + need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } From a8265cd917a63c0a1e13caf07e9a0a024480372b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:32 +0000 Subject: [PATCH 202/505] Documentation/mm: update references to __m[un]lock_page() to *_folio() We now pass folios to these functions, so update the documentation accordingly. Additionally, correct the outdated reference to __pagevec_lru_add_fn(), the referenced action occurs in __munlock_folio() directly now, replace reference to lru_cache_add_inactive_or_unevictable() with the modern folio equivalent folio_add_lru_vma() and reference folio flags by the flag name rather than accessor. Link: https://lkml.kernel.org/r/898c487169d98a7f09c1c1e57a7dfdc2b3f6bf0f.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 30 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 4a0e158aa9ce..2a90d0721dd9 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -308,22 +308,22 @@ do end up getting faulted into this VM_LOCKED VMA, they will be handled in the fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. For each PTE (or PMD) being faulted into a VMA, the page add rmap function -calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED +calls mlock_vma_page(), which calls mlock_folio() when the VMA is VM_LOCKED (unless it is a PTE mapping of a part of a transparent huge page). 
Or when -it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable() -calls mlock_new_page() instead: similar to mlock_page(), but can make better +it is a newly allocated anonymous page, folio_add_lru_vma() calls +mlock_new_folio() instead: similar to mlock_folio(), but can make better judgments, since this page is held exclusively and known not to be on LRU yet. -mlock_page() sets PageMlocked immediately, then places the page on the CPU's -mlock pagevec, to batch up the rest of the work to be done under lru_lock by -__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count +mlock_folio() sets PG_mlocked immediately, then places the page on the CPU's +mlock folio batch, to batch up the rest of the work to be done under lru_lock by +__mlock_folio(). __mlock_folio() sets PG_unevictable, initializes mlock_count and moves the page to unevictable state ("the unevictable LRU", but with -mlock_count in place of LRU threading). Or if the page was already PageLRU -and PageUnevictable and PageMlocked, it simply increments the mlock_count. +mlock_count in place of LRU threading). Or if the page was already PG_lru +and PG_unevictable and PG_mlocked, it simply increments the mlock_count. But in practice that may not work ideally: the page may not yet be on an LRU, or it may have been temporarily isolated from LRU. In such cases the mlock_count -field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn() +field cannot be touched, but will be set to 0 later when __munlock_folio() returns the page to "LRU". Races prohibit mlock_count from being set to 1 then: rather than risk stranding a page indefinitely as unevictable, always err with mlock_count on the low side, so that when munlocked the page will be rescued to @@ -377,8 +377,8 @@ that it is munlock() being performed. munlock_page() uses the mlock pagevec to batch up work to be done under lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. +mlock_count, and when that reaches 0 it clears PG_mlocked and clears +PG_unevictable, moving the page from unevictable state to inactive LRU. But in practice that may not work ideally: the page may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. In @@ -488,8 +488,8 @@ munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED munlock_page() uses the mlock pagevec to batch up work to be done under lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. +mlock_count, and when that reaches 0 it clears PG_mlocked and clears +PG_unevictable, moving the page from unevictable state to inactive LRU. But in practice that may not work ideally: the page may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. In @@ -515,7 +515,7 @@ munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages present, if one of those pages were unmapped by truncation or hole punch before mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA, and would not be counted out of mlock_count. 
In this rare case, a page may -still appear as PageMlocked after it has been fully unmapped: and it is left to +still appear as PG_mlocked after it has been fully unmapped: and it is left to release_pages() (or __page_cache_release()) to clear it and update statistics before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared, which is usually 0). @@ -527,7 +527,7 @@ Page Reclaim in shrink_*_list() vmscan's shrink_active_list() culls any obviously unevictable pages - i.e. !page_evictable(page) pages - diverting those to the unevictable list. However, shrink_active_list() only sees unevictable pages that made it onto the -active/inactive LRU lists. Note that these pages do not have PageUnevictable +active/inactive LRU lists. Note that these pages do not have PG_unevictable set - otherwise they would be on the unevictable list and shrink_active_list() would never see them. From 62a9bbf2e999b9dca148bd342ab7a29fcf0cb120 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 12 Jan 2023 11:31:47 +0100 Subject: [PATCH 203/505] kmsan: silence -Wmissing-prototypes warnings When building the kernel with W=1, the compiler reports numerous warnings about the missing prototypes for KMSAN instrumentation hooks. Because these functions are not supposed to be called explicitly by the kernel code (calls to them are emitted by the compiler), they do not have to be declared in the headers. Instead, we add forward declarations right before the definitions to silence the warnings produced by -Wmissing-prototypes. Link: https://lkml.kernel.org/r/20230112103147.382416-1-glider@google.com Signed-off-by: Alexander Potapenko Reported-by: Vlastimil Babka Suggested-by: Marco Elver Reviewed-by: Marco Elver Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/202301020356.dFruA4I5-lkp@intel.com/T/ Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- mm/kmsan/instrumentation.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 770fe02904f3..cf12e9616b24 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -38,7 +38,15 @@ get_shadow_origin_ptr(void *addr, u64 size, bool store) return ret; } +/* + * KMSAN instrumentation functions follow. They are not declared elsewhere in + * the kernel code, so they are preceded by prototypes, to silence + * -Wmissing-prototypes warnings. + */ + /* Get shadow and origin pointers for a memory load with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, + uintptr_t size); struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size) { @@ -47,6 +55,8 @@ struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n); /* Get shadow and origin pointers for a memory store with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, + uintptr_t size); struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size) { @@ -59,12 +69,16 @@ EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n); * with fixed size. 
*/ #define DECLARE_METADATA_PTR_GETTER(size) \ + struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ + void *addr); \ struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ void *addr) \ { \ return get_shadow_origin_ptr(addr, size, /*store*/ false); \ } \ EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \ + struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ + void *addr); \ struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ void *addr) \ { \ @@ -86,6 +100,7 @@ DECLARE_METADATA_PTR_GETTER(8); * entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure * the memory written to in these cases is also marked as initialized. */ +void __msan_instrument_asm_store(void *addr, uintptr_t size); void __msan_instrument_asm_store(void *addr, uintptr_t size) { unsigned long ua_flags; @@ -138,6 +153,7 @@ static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin) } /* Handle llvm.memmove intrinsic. */ +void *__msan_memmove(void *dst, const void *src, uintptr_t n); void *__msan_memmove(void *dst, const void *src, uintptr_t n) { depot_stack_handle_t origin; @@ -162,6 +178,7 @@ void *__msan_memmove(void *dst, const void *src, uintptr_t n) EXPORT_SYMBOL(__msan_memmove); /* Handle llvm.memcpy intrinsic. */ +void *__msan_memcpy(void *dst, const void *src, uintptr_t n); void *__msan_memcpy(void *dst, const void *src, uintptr_t n) { depot_stack_handle_t origin; @@ -188,6 +205,7 @@ void *__msan_memcpy(void *dst, const void *src, uintptr_t n) EXPORT_SYMBOL(__msan_memcpy); /* Handle llvm.memset intrinsic. */ +void *__msan_memset(void *dst, int c, uintptr_t n); void *__msan_memset(void *dst, int c, uintptr_t n) { depot_stack_handle_t origin; @@ -217,6 +235,7 @@ EXPORT_SYMBOL(__msan_memset); * uninitialized value to memory. When reporting an error, KMSAN unrolls and * prints the whole chain of stores that preceded the use of this value. */ +depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin); depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) { depot_stack_handle_t ret = 0; @@ -237,6 +256,7 @@ depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) EXPORT_SYMBOL(__msan_chain_origin); /* Poison a local variable when entering a function. */ +void __msan_poison_alloca(void *address, uintptr_t size, char *descr); void __msan_poison_alloca(void *address, uintptr_t size, char *descr) { depot_stack_handle_t handle; @@ -272,6 +292,7 @@ void __msan_poison_alloca(void *address, uintptr_t size, char *descr) EXPORT_SYMBOL(__msan_poison_alloca); /* Unpoison a local variable. */ +void __msan_unpoison_alloca(void *address, uintptr_t size); void __msan_unpoison_alloca(void *address, uintptr_t size) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -287,6 +308,7 @@ EXPORT_SYMBOL(__msan_unpoison_alloca); * Report that an uninitialized value with the given origin was used in a way * that constituted undefined behavior. */ +void __msan_warning(u32 origin); void __msan_warning(u32 origin) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -303,6 +325,7 @@ EXPORT_SYMBOL(__msan_warning); * At the beginning of an instrumented function, obtain the pointer to * `struct kmsan_context_state` holding the metadata for function parameters. 
*/ +struct kmsan_context_state *__msan_get_context_state(void); struct kmsan_context_state *__msan_get_context_state(void) { return &kmsan_get_context()->cstate; From 92644f583d5124b60bc20a3dd21b0bc9142f020c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 13 Jan 2023 16:15:55 -0800 Subject: [PATCH 204/505] mm/khugepaged: introduce release_pte_folio() to replace release_pte_page() release_pte_page() is converted to be a wrapper for release_pte_folio() to help facilitate the khugepaged conversion to folios. This replaces 3 calls to compound_head() with 1, and saves 85 bytes of kernel text. Link: https://lkml.kernel.org/r/20230114001556.43795-1-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/khugepaged.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 57164c15e076..d7b993e53711 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -490,13 +490,18 @@ void __khugepaged_exit(struct mm_struct *mm) } } +static void release_pte_folio(struct folio *folio) +{ + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + -folio_nr_pages(folio)); + folio_unlock(folio); + folio_putback_lru(folio); +} + static void release_pte_page(struct page *page) { - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_lru(page), - -compound_nr(page)); - unlock_page(page); - putback_lru_page(page); + release_pte_folio(page_folio(page)); } static void release_pte_pages(pte_t *pte, pte_t *_pte, From 9bdfeea46f4926181b9476037c6af28d6d19cc28 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 13 Jan 2023 16:15:56 -0800 Subject: [PATCH 205/505] mm/khugepaged: convert release_pte_pages() to use folios Converts release_pte_pages() to use folios instead of pages. Link: https://lkml.kernel.org/r/20230114001556.43795-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/khugepaged.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d7b993e53711..b39ab219d5b7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -507,20 +507,20 @@ static void release_pte_page(struct page *page) static void release_pte_pages(pte_t *pte, pte_t *_pte, struct list_head *compound_pagelist) { - struct page *page, *tmp; + struct folio *folio, *tmp; while (--_pte >= pte) { pte_t pteval = *_pte; - page = pte_page(pteval); + folio = pfn_folio(pte_pfn(pteval)); if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && - !PageCompound(page)) - release_pte_page(page); + !folio_test_large(folio)) + release_pte_folio(folio); } - list_for_each_entry_safe(page, tmp, compound_pagelist, lru) { - list_del(&page->lru); - release_pte_page(page); + list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { + list_del(&folio->lru); + release_pte_folio(folio); } } From 2321ba3e3733f513e46e29b9c70512ecddbf1085 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:01 +0100 Subject: [PATCH 206/505] mm/debug_vm_pgtable: more pte_swp_exclusive() sanity checks Patch series "mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all architectures with swap PTEs". 
This is the follow-up on [1]: [PATCH v2 0/8] mm: COW fixes part 3: reliable GUP R/W FOLL_GET of anonymous pages After we implemented __HAVE_ARCH_PTE_SWP_EXCLUSIVE on most prominent enterprise architectures, implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all remaining architectures that support swap PTEs. This makes sure that exclusive anonymous pages will stay exclusive, even after they were swapped out -- for example, making GUP R/W FOLL_GET of anonymous pages reliable. Details can be found in [1]. This primarily fixes remaining known O_DIRECT memory corruptions that can happen on concurrent swapout, whereby we can lose DMA reads to a page (modifying the user page by writing to it). To verify, there are two test cases (requiring swap space, obviously): (1) The O_DIRECT+swapout test case [2] from Andrea. This test case tries triggering a race condition. (2) My vmsplice() test case [3] that tries to detect if the exclusive marker was lost during swapout, not relying on a race condition. For example, on 32bit x86 (with and without PAE), my test case fails without these patches: $ ./test_swp_exclusive FAIL: page was replaced during COW But succeeds with these patches: $ ./test_swp_exclusive PASS: page was not replaced during COW Why implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE for all architectures, even the ones where swap support might be in a questionable state? This is the first step towards removing "readable_exclusive" migration entries, and instead using pte_swp_exclusive() also with (readable) migration entries instead (as suggested by Peter). The only missing piece for that is supporting pmd_swp_exclusive() on relevant architectures with THP migration support. As all relevant architectures now implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE,, we can drop __HAVE_ARCH_PTE_SWP_EXCLUSIVE in the last patch. I tried cross-compiling all relevant setups and tested on x86 and sparc64 so far. CCing arch maintainers only on this cover letter and on the respective patch(es). [1] https://lkml.kernel.org/r/20220329164329.208407-1-david@redhat.com [2] https://gitlab.com/aarcange/kernel-testcases-for-v5.11/-/blob/main/page_count_do_wp_page-swap.c [3] https://gitlab.com/davidhildenbrand/scratchspace/-/blob/main/test_swp_exclusive.c This patch (of 26): We want to implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE on all architectures. Let's extend our sanity checks, especially testing that our PTE bit does not affect: * is_swap_pte() -> pte_present() and pte_none() * the swap entry + type * pte_swp_soft_dirty() Especially, the pfn_pte() is dodgy when the swap PTE layout differs heavily from ordinary PTEs. Let's properly construct a swap PTE from swap type+offset. [david@redhat.com: fix build] Link: https://lkml.kernel.org/r/6aaad548-cf48-77fa-9d6c-db83d724b2eb@redhat.com Link: https://lkml.kernel.org/r/20230113171026.582290-1-david@redhat.com Link: https://lkml.kernel.org/r/20230113171026.582290-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anton Ivanov Cc: Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: H. 
Peter Anvin (Intel) Cc: Huacai Chen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jason Gunthorpe Cc: Johannes Berg Cc: John Hubbard Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Xuerui Wang Cc: Yang Shi Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index bb3328f46126..ff8d6f6af896 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -811,13 +811,36 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { #ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + unsigned long max_swap_offset; + swp_entry_t entry, entry2; + pte_t pte; pr_debug("Validating PTE swap exclusive\n"); + + /* See generic_max_swapfile_size(): probe the maximum offset */ + max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); + + /* Create a swp entry with all possible bits set */ + entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + + pte = swp_entry_to_pte(entry); + WARN_ON(pte_swp_exclusive(pte)); + WARN_ON(!is_swap_pte(pte)); + entry2 = pte_to_swp_entry(pte); + WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + pte = pte_swp_mkexclusive(pte); WARN_ON(!pte_swp_exclusive(pte)); + WARN_ON(!is_swap_pte(pte)); + WARN_ON(pte_swp_soft_dirty(pte)); + entry2 = pte_to_swp_entry(pte); + WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + pte = pte_swp_clear_exclusive(pte); WARN_ON(pte_swp_exclusive(pte)); + WARN_ON(!is_swap_pte(pte)); + entry2 = pte_to_swp_entry(pte); + WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); #endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */ } From a172d5128706028ac07b8db709728379ecc72f6e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:02 +0100 Subject: [PATCH 207/505] alpha/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, mask the type in mk_swap_pte() as well. Link: https://lkml.kernel.org/r/20230113171026.582290-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton --- arch/alpha/include/asm/pgtable.h | 41 ++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 9e45f6735d5d..970abf511b13 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -74,6 +74,9 @@ struct vm_area_struct; #define _PAGE_DIRTY 0x20000 #define _PAGE_ACCESSED 0x40000 +/* We borrow bit 39 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x8000000000UL + /* * NOTE! The "accessed" bit isn't necessarily exact: it can be kept exactly * by software (use the KRE/URE/KWE/UWE bits appropriately), but I'll fake it. 
@@ -301,18 +304,48 @@ extern inline void update_mmu_cache(struct vm_area_struct * vma, } /* - * Non-present pages: high 24 bits are offset, next 8 bits type, - * low 32 bits zero. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <------------------- offset ------------------> E <--- type --> + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------------------- zeroes --------------------------> + * + * E is the exclusive marker that is not stored in swap entries. */ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) -{ pte_t pte; pte_val(pte) = (type << 32) | (offset << 40); return pte; } +{ pte_t pte; pte_val(pte) = ((type & 0x7f) << 32) | (offset << 40); return pte; } -#define __swp_type(x) (((x).val >> 32) & 0xff) +#define __swp_type(x) (((x).val >> 32) & 0x7f) #define __swp_offset(x) ((x).val >> 40) #define __swp_entry(type, off) ((swp_entry_t) { pte_val(mk_swap_pte((type), (off))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #define pte_ERROR(e) \ printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ From 4a446b3dd335d0bd14a5ca3e563688de3637be0c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:03 +0100 Subject: [PATCH 208/505] arc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using bit 5, which is yet unused. The only important parts seems to be to not use _PAGE_PRESENT (bit 9). Link: https://lkml.kernel.org/r/20230113171026.582290-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Vineet Gupta Signed-off-by: Andrew Morton --- arch/arc/include/asm/pgtable-bits-arcv2.h | 27 ++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 515e82db519f..611f412713b9 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -26,6 +26,9 @@ #define _PAGE_GLOBAL (1 << 8) /* ASID agnostic (H) */ #define _PAGE_PRESENT (1 << 9) /* PTE/TLB Valid (H) */ +/* We borrow bit 5 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_DIRTY + #ifdef CONFIG_ARC_MMU_V4 #define _PAGE_HW_SZ (1 << 10) /* Normal/super (H) */ #else @@ -106,9 +109,18 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); -/* Encode swap {type,off} tuple into PTE - * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that - * PAGE_PRESENT is zero in a PTE holding swap "identifier" +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). 
+ * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <-------------- offset -------------> <--- zero --> E < type -> + * + * E is the exclusive marker that is not stored in swap entries. + * The zero'ed bits include _PAGE_PRESENT. */ #define __swp_entry(type, off) ((swp_entry_t) \ { ((type) & 0x1f) | ((off) << 13) }) @@ -120,6 +132,15 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +PTE_BIT_FUNC(swp_mkexclusive, |= (_PAGE_SWP_EXCLUSIVE)); +PTE_BIT_FUNC(swp_clear_exclusive, &= ~(_PAGE_SWP_EXCLUSIVE)); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #include #endif From 20aae9eff5acd8f50f72adca1176f9269a46b827 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:04 +0100 Subject: [PATCH 209/505] arm/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the offset. This reduces the maximum swap space per file to 64 GiB (was 128 GiB). While at it drop the PTE_TYPE_FAULT from __swp_entry_to_pte() which is defined to be 0 and is rather confusing because we should be dealing with "Linux PTEs" not "hardware PTEs". Also, properly mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Russell King (Oracle) Signed-off-by: Andrew Morton --- arch/arm/include/asm/pgtable-2level.h | 3 +++ arch/arm/include/asm/pgtable-3level.h | 3 +++ arch/arm/include/asm/pgtable.h | 35 +++++++++++++++++++++------ 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index 92abd4cd8ca2..ce543cd9380c 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -126,6 +126,9 @@ #define L_PTE_SHARED (_AT(pteval_t, 1) << 10) /* shared(v6), coherent(xsc3) */ #define L_PTE_NONE (_AT(pteval_t, 1) << 11) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define L_PTE_SWP_EXCLUSIVE L_PTE_RDONLY + /* * These are the memory types, defined to be compatible with * pre-ARMv6 CPUs cacheable and bufferable bits: n/a,n/a,C,B diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index eabe72ff7381..106049791500 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -76,6 +76,9 @@ #define L_PTE_NONE (_AT(pteval_t, 1) << 57) /* PROT_NONE */ #define L_PTE_RDONLY (_AT(pteval_t, 1) << 58) /* READ ONLY */ +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define L_PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 7) + #define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) #define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55) #define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index f049072b2e85..886c275995a2 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -271,27 +271,48 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) } /* - * Encode and decode a swap entry. Swap entries are stored in the Linux - * page tables as follows: + * Encode/decode swap entries and swap PTEs. 
Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: * * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * <--------------- offset ------------------------> < type -> 0 0 + * <------------------- offset ------------------> E < type -> 0 0 * - * This gives us up to 31 swap files and 128GB per swap file. Note that + * E is the exclusive marker that is not stored in swap entries. + * + * This gives us up to 31 swap files and 64GB per swap file. Note that * the offset field is always non-zero. */ #define __SWP_TYPE_SHIFT 2 #define __SWP_TYPE_BITS 5 #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) -#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) +#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT + 1) #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & __SWP_TYPE_BITS) << __SWP_TYPE_SHIFT) | \ + ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(swp) __pte((swp).val | PTE_TYPE_FAULT) +#define __swp_entry_to_pte(swp) __pte((swp).val) + +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_isset(pte, L_PTE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(L_PTE_SWP_EXCLUSIVE)); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(L_PTE_SWP_EXCLUSIVE)); +} /* * It is an error for the kernel to have more swap files than we can From 41e0d49104dbff888ef6446ea46842fde66c0a76 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:05 +0100 Subject: [PATCH 210/505] csky/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the offset. This reduces the maximum swap space per file to 16 GiB (was 32 GiB). We might actually be able to reuse one of the other software bits (_PAGE_READ / PAGE_WRITE) instead, because we only have to keep pte_present(), pte_none() and HW happy. For now, let's keep it simple because there might be something non-obvious. Link: https://lkml.kernel.org/r/20230113171026.582290-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Guo Ren Signed-off-by: Andrew Morton --- arch/csky/abiv1/inc/abi/pgtable-bits.h | 13 +++++++++---- arch/csky/abiv2/inc/abi/pgtable-bits.h | 19 ++++++++++++------- arch/csky/include/asm/pgtable.h | 18 ++++++++++++++++++ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/arch/csky/abiv1/inc/abi/pgtable-bits.h b/arch/csky/abiv1/inc/abi/pgtable-bits.h index 752c8b3f9194..ae7a2f76dd42 100644 --- a/arch/csky/abiv1/inc/abi/pgtable-bits.h +++ b/arch/csky/abiv1/inc/abi/pgtable-bits.h @@ -10,6 +10,9 @@ #define _PAGE_ACCESSED (1<<3) #define _PAGE_MODIFIED (1<<4) +/* We borrow bit 9 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<9) + /* implemented in hardware */ #define _PAGE_GLOBAL (1<<6) #define _PAGE_VALID (1<<7) @@ -26,7 +29,8 @@ #define _PAGE_PROT_NONE _PAGE_READ /* - * Encode and decode a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). 
* * Format of swap PTE: * bit 0: _PAGE_PRESENT (zero) @@ -35,15 +39,16 @@ * bit 6: _PAGE_GLOBAL (zero) * bit 7: _PAGE_VALID (zero) * bit 8: swap type[4] - * bit 9 - 31: swap offset + * bit 9: exclusive marker + * bit 10 - 31: swap offset */ #define __swp_type(x) ((((x).val >> 2) & 0xf) | \ (((x).val >> 4) & 0x10)) -#define __swp_offset(x) ((x).val >> 9) +#define __swp_offset(x) ((x).val >> 10) #define __swp_entry(type, offset) ((swp_entry_t) { \ ((type & 0xf) << 2) | \ ((type & 0x10) << 4) | \ - ((offset) << 9)}) + ((offset) << 10)}) #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/csky/abiv2/inc/abi/pgtable-bits.h b/arch/csky/abiv2/inc/abi/pgtable-bits.h index 7e7f389f546f..526152bd2156 100644 --- a/arch/csky/abiv2/inc/abi/pgtable-bits.h +++ b/arch/csky/abiv2/inc/abi/pgtable-bits.h @@ -10,6 +10,9 @@ #define _PAGE_PRESENT (1<<10) #define _PAGE_MODIFIED (1<<11) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<7) + /* implemented in hardware */ #define _PAGE_GLOBAL (1<<0) #define _PAGE_VALID (1<<1) @@ -26,23 +29,25 @@ #define _PAGE_PROT_NONE _PAGE_WRITE /* - * Encode and decode a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Format of swap PTE: * bit 0: _PAGE_GLOBAL (zero) * bit 1: _PAGE_VALID (zero) * bit 2 - 6: swap type - * bit 7 - 8: swap offset[0 - 1] + * bit 7: exclusive marker + * bit 8: swap offset[0] * bit 9: _PAGE_WRITE (zero) * bit 10: _PAGE_PRESENT (zero) - * bit 11 - 31: swap offset[2 - 22] + * bit 11 - 31: swap offset[1 - 21] */ #define __swp_type(x) (((x).val >> 2) & 0x1f) -#define __swp_offset(x) ((((x).val >> 7) & 0x3) | \ - (((x).val >> 9) & 0x7ffffc)) +#define __swp_offset(x) ((((x).val >> 8) & 0x1) | \ + (((x).val >> 10) & 0x3ffffe)) #define __swp_entry(type, offset) ((swp_entry_t) { \ ((type & 0x1f) << 2) | \ - ((offset & 0x3) << 7) | \ - ((offset & 0x7ffffc) << 9)}) + ((offset & 0x1) << 8) | \ + ((offset & 0x3ffffe) << 10)}) #endif /* __ASM_CSKY_PGTABLE_BITS_H */ diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 77bc6caff2d2..574c97b9ecca 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -200,6 +200,24 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte; } +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, From 61f4a896e62dee8581fea843479058507fda57fb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:06 +0100 Subject: [PATCH 211/505] hexagon/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the offset. This reduces the maximum swap space per file to 16 GiB (was 32 GiB). While at it, mask the type in __swp_entry(). 
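For illustration only -- a standalone userspace sketch, not kernel code: assuming 4 KiB pages and the 23 offset bits the old layout provided (22 after handing one over to the exclusive marker, per the layout in the diff below), the halving of the maximum swap size works out as follows:

#include <stdio.h>

int main(void)
{
	const unsigned long long page_size = 4096;	/* assumed 4 KiB pages */
	const unsigned int offset_bits_old = 23;	/* offset bits before this patch */
	const unsigned int offset_bits_new = 22;	/* one bit handed to the E marker */

	/* 2^23 pages * 4 KiB = 32 GiB; 2^22 pages * 4 KiB = 16 GiB */
	printf("max swap per file before: %llu GiB\n",
	       (page_size << offset_bits_old) >> 30);
	printf("max swap per file after:  %llu GiB\n",
	       (page_size << offset_bits_new) >> 30);
	return 0;
}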
Link: https://lkml.kernel.org/r/20230113171026.582290-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Brian Cain Signed-off-by: Andrew Morton --- arch/hexagon/include/asm/pgtable.h | 37 +++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index f7048c18b6f9..7eb008e477c8 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -61,6 +61,9 @@ extern unsigned long empty_zero_page; * So we'll put up with a bit of inefficiency for now... */ +/* We borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<6) + /* * Top "FOURTH" level (pgd), which for the Hexagon VM is really * only the second from the bottom, pgd and pud both being collapsed. @@ -359,9 +362,12 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define ZERO_PAGE(vaddr) (virt_to_page(&empty_zero_page)) /* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is * interpreted as swap information. The remaining free bits are interpreted as - * swap type/offset tuple. Rather than have the TLB fill handler test + * listed below. Rather than have the TLB fill handler test * _PAGE_PRESENT, we're going to reserve the permissions bits and set them to * all zeros for swap entries, which speeds up the miss handler at the cost of * 3 bits of offset. That trade-off can be revisited if necessary, but Hexagon @@ -371,9 +377,10 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Format of swap PTE: * bit 0: Present (zero) * bits 1-5: swap type (arch independent layer uses 5 bits max) - * bits 6-9: bits 3:0 of offset + * bit 6: exclusive marker + * bits 7-9: bits 2:0 of offset * bits 10-12: effectively _PAGE_PROTNONE (all zero) - * bits 13-31: bits 22:4 of swap offset + * bits 13-31: bits 21:3 of swap offset * * The split offset makes some of the following macros a little gnarly, * but there's plenty of precedent for this sort of thing. @@ -383,11 +390,29 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_type(swp_pte) (((swp_pte).val >> 1) & 0x1f) #define __swp_offset(swp_pte) \ - ((((swp_pte).val >> 6) & 0xf) | (((swp_pte).val >> 9) & 0x7ffff0)) + ((((swp_pte).val >> 7) & 0x7) | (((swp_pte).val >> 10) & 0x3ffff8)) #define __swp_entry(type, offset) \ ((swp_entry_t) { \ - ((type << 1) | \ - ((offset & 0x7ffff0) << 9) | ((offset & 0xf) << 6)) }) + (((type & 0x1f) << 1) | \ + ((offset & 0x3ffff8) << 10) | ((offset & 0x7) << 7)) }) + +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} #endif From 3151cc26565ea864c51a78900101ad68b864f405 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:07 +0100 Subject: [PATCH 212/505] ia64/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, also mask the type in __swp_entry(). 
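As a side note -- and only as a userspace sketch with made-up names (MOCK_MAX_SWAPFILES_SHIFT, the mask macros), not kernel code -- shrinking the type field from 7 to 6 bits is harmless because generic MM never hands out a type that needs more than 5 bits:

#include <assert.h>

#define MOCK_MAX_SWAPFILES_SHIFT	5	/* what generic MM uses today */
#define OLD_TYPE_MASK			0x7f	/* type lived in bits 1-7 before */
#define NEW_TYPE_MASK			0x3f	/* bits 1-6 now, bit 7 is the E marker */

int main(void)
{
	unsigned int max_type = (1u << MOCK_MAX_SWAPFILES_SHIFT) - 1;	/* 31 */

	/* the largest possible type survives both masks unchanged */
	assert((max_type & OLD_TYPE_MASK) == max_type);
	assert((max_type & NEW_TYPE_MASK) == max_type);
	return 0;
}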
Link: https://lkml.kernel.org/r/20230113171026.582290-8-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- arch/ia64/include/asm/pgtable.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 01517a5e6778..e4b8ab931399 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -58,6 +58,9 @@ #define _PAGE_ED (__IA64_UL(1) << 52) /* exception deferral */ #define _PAGE_PROTNONE (__IA64_UL(1) << 63) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 7) + #define _PFN_MASK _PAGE_PPN_MASK /* Mask of bits which may be changed by pte_modify(); the odd bits are there for _PAGE_PROTNONE */ #define _PAGE_CHG_MASK (_PAGE_P | _PAGE_PROTNONE | _PAGE_PL_MASK | _PAGE_AR_MASK | _PAGE_ED) @@ -399,6 +402,9 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void paging_init (void); /* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * * Note: The macros below rely on the fact that MAX_SWAPFILES_SHIFT <= number of * bits in the swap-type field of the swap pte. It would be nice to * enforce that, but we can't easily include here. @@ -406,16 +412,36 @@ extern void paging_init (void); * * Format of swap pte: * bit 0 : present bit (must be zero) - * bits 1- 7: swap-type + * bits 1- 6: swap type + * bit 7 : exclusive marker * bits 8-62: swap offset * bit 63 : _PAGE_PROTNONE bit */ -#define __swp_type(entry) (((entry).val >> 1) & 0x7f) +#define __swp_type(entry) (((entry).val >> 1) & 0x3f) #define __swp_offset(entry) (((entry).val << 1) >> 9) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 1) | ((long) (offset) << 8) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type & 0x3f) << 1) | \ + ((long) (offset) << 8) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. From ad3150f11b099321331f2f74d008e86ab2a04c7a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:08 +0100 Subject: [PATCH 213/505] loongarch/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, also mask the type in mk_swap_pte(). Note that this bit does not conflict with swap PMDs and could also be used in swap PMD context later. 
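To make the bit juggling concrete, here is a userspace sketch (mock macros mirroring the layout in the diff below: type in bits 16-22, the exclusive marker in bit 23, offset from bit 24 -- not the kernel macros) showing that toggling the marker leaves type and offset untouched, which is the same property the debug_vm_pgtable check added earlier in this series verifies:

#include <assert.h>
#include <stdint.h>

#define E_BIT			(1ULL << 23)
#define mk_entry(type, off)	((((uint64_t)(type) & 0x7f) << 16) | ((uint64_t)(off) << 24))
#define entry_type(v)		(((v) >> 16) & 0x7f)
#define entry_offset(v)		((v) >> 24)

int main(void)
{
	uint64_t v = mk_entry(31, 0x123456);

	v |= E_BIT;				/* pte_swp_mkexclusive() */
	assert(entry_type(v) == 31);		/* type unchanged */
	assert(entry_offset(v) == 0x123456);	/* offset unchanged */
	v &= ~E_BIT;				/* pte_swp_clear_exclusive() */
	assert(entry_type(v) == 31 && entry_offset(v) == 0x123456);
	return 0;
}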
Link: https://lkml.kernel.org/r/20230113171026.582290-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Huacai Chen Cc: WANG Xuerui Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgtable-bits.h | 4 +++ arch/loongarch/include/asm/pgtable.h | 39 ++++++++++++++++++++--- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h index 3d1e0a69975a..8b98d22a145b 100644 --- a/arch/loongarch/include/asm/pgtable-bits.h +++ b/arch/loongarch/include/asm/pgtable-bits.h @@ -20,6 +20,7 @@ #define _PAGE_SPECIAL_SHIFT 11 #define _PAGE_HGLOBAL_SHIFT 12 /* HGlobal is a PMD bit */ #define _PAGE_PFN_SHIFT 12 +#define _PAGE_SWP_EXCLUSIVE_SHIFT 23 #define _PAGE_PFN_END_SHIFT 48 #define _PAGE_NO_READ_SHIFT 61 #define _PAGE_NO_EXEC_SHIFT 62 @@ -33,6 +34,9 @@ #define _PAGE_PROTNONE (_ULCAST_(1) << _PAGE_PROTNONE_SHIFT) #define _PAGE_SPECIAL (_ULCAST_(1) << _PAGE_SPECIAL_SHIFT) +/* We borrow bit 23 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (_ULCAST_(1) << _PAGE_SWP_EXCLUSIVE_SHIFT) + /* Used by TLB hardware (placed in EntryLo*) */ #define _PAGE_VALID (_ULCAST_(1) << _PAGE_VALID_SHIFT) #define _PAGE_DIRTY (_ULCAST_(1) << _PAGE_DIRTY_SHIFT) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 7a34e900d8c1..c6b8fe7ac43c 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -249,13 +249,26 @@ extern void pud_init(void *addr); extern void pmd_init(void *addr); /* - * Non-present pages: high 40 bits are offset, next 8 bits type, - * low 16 bits zero. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------------- offset --------------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------> E <--- type ---> <---------- zeroes ----------> + * + * E is the exclusive marker that is not stored in swap entries. + * The zero'ed bits include _PAGE_PRESENT and _PAGE_PROTNONE. 
*/ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) -{ pte_t pte; pte_val(pte) = (type << 16) | (offset << 24); return pte; } +{ pte_t pte; pte_val(pte) = ((type & 0x7f) << 16) | (offset << 24); return pte; } -#define __swp_type(x) (((x).val >> 16) & 0xff) +#define __swp_type(x) (((x).val >> 16) & 0x7f) #define __swp_offset(x) ((x).val >> 24) #define __swp_entry(type, offset) ((swp_entry_t) { pte_val(mk_swap_pte((type), (offset))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) @@ -263,6 +276,24 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(x) ((pmd_t) { (x).val | _PAGE_HUGE }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + extern void paging_init(void); #define pte_none(pte) (!(pte_val(pte) & ~_PAGE_GLOBAL)) From ad464ff2c0f91fcacc24167fc435aa45fe0b7d1b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:09 +0100 Subject: [PATCH 214/505] m68k/mm: remove dummy __swp definitions for nommu The definitions are not required, let's remove them. Link: https://lkml.kernel.org/r/20230113171026.582290-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Greg Ungerer Signed-off-by: Andrew Morton --- arch/m68k/include/asm/pgtable_no.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index fed58da3a6b6..fc044df52b96 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -31,12 +31,6 @@ extern void paging_init(void); #define swapper_pg_dir ((pgd_t *) 0) -#define __swp_type(x) (0) -#define __swp_offset(x) (0) -#define __swp_entry(typ,off) ((swp_entry_t) { ((typ) | ((off) << 7)) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. From ed4154067a086c44608b08f55cceb2688b661750 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:10 +0100 Subject: [PATCH 215/505] m68k/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, make sure for sun3 that the valid bit never gets set by properly masking it off and mask the type in __swp_entry(). 
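Purely as an illustration (a userspace sketch with mock macros modelled on the ColdFire layout in the diff below: type in bits 0-6, the exclusive marker in bit 7, offset from bit 11; the out-of-range type value is hypothetical), this is why masking the type in __swp_entry() matters -- an unmasked type could spill into the exclusive bit:

#include <assert.h>
#include <stdint.h>

#define E_BIT				(1u << 7)	/* mirrors _PAGE_SWP_EXCLUSIVE 0x080 */
#define swp_entry_masked(type, off)	(((type) & 0x7f) | ((uint32_t)(off) << 11))
#define swp_entry_unmasked(type, off)	((type) | ((uint32_t)(off) << 11))

int main(void)
{
	/* 0x80 does not fit into the 7-bit type field */
	assert(swp_entry_unmasked(0x80, 1) & E_BIT);	/* would corrupt the marker */
	assert(!(swp_entry_masked(0x80, 1) & E_BIT));	/* the mask prevents that */
	return 0;
}

In practice generic MM never produces such a type (MAX_SWAPFILES_SHIFT is 5), so the mask is defensive rather than a bug fix.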
Link: https://lkml.kernel.org/r/20230113171026.582290-11-david@redhat.com Signed-off-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Greg Ungerer Signed-off-by: Andrew Morton --- arch/m68k/include/asm/mcf_pgtable.h | 36 ++++++++++++++++++++-- arch/m68k/include/asm/motorola_pgtable.h | 38 +++++++++++++++++++++-- arch/m68k/include/asm/sun3_pgtable.h | 39 ++++++++++++++++++++++-- 3 files changed, 104 insertions(+), 9 deletions(-) diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index b619b22823f8..3f8f4d0e66dd 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -46,6 +46,9 @@ #define _CACHEMASK040 (~0x060) #define _PAGE_GLOBAL040 0x400 /* 68040 global bit, used for kva descs */ +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x080 + /* * Externally used page protection values. */ @@ -254,15 +257,42 @@ static inline pte_t pte_mkcache(pte_t pte) extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; /* - * Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------------ offset -------------> 0 0 0 E <-- type ---> + * + * E is the exclusive marker that is not stored in swap entries. */ -#define __swp_type(x) ((x).val & 0xFF) +#define __swp_type(x) ((x).val & 0x7f) #define __swp_offset(x) ((x).val >> 11) -#define __swp_entry(typ, off) ((swp_entry_t) { (typ) | \ +#define __swp_entry(typ, off) ((swp_entry_t) { ((typ) & 0x7f) | \ (off << 11) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) (__pte((x).val)) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 562b54e09850..c1782563e793 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -41,6 +41,9 @@ #define _PAGE_PROTNONE 0x004 +/* We borrow bit 11 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x800 + #ifndef __ASSEMBLY__ /* This is the cache mode to be used for pages containing page descriptors for @@ -169,12 +172,41 @@ static inline pte_t pte_mkcache(pte_t pte) #define swapper_pg_dir kernel_pg_dir extern pgd_t kernel_pg_dir[128]; -/* Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) */ -#define __swp_type(x) (((x).val >> 4) & 0xff) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------> E <-- type ---> 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. 
+ */ +#define __swp_type(x) (((x).val >> 4) & 0x7f) #define __swp_offset(x) ((x).val >> 12) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 4) | ((offset) << 12) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x7f) << 4) | ((offset) << 12) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #endif /* !__ASSEMBLY__ */ #endif /* _MOTOROLA_PGTABLE_H */ diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index 90d57e537eb1..dbfc9703b15d 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -71,6 +71,9 @@ #define SUN3_PMD_MASK (0x0000003F) #define SUN3_PMD_MAGIC (0x0000002B) +/* We borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x040 + #ifndef __ASSEMBLY__ /* @@ -152,12 +155,42 @@ static inline pte_t pte_mkcache(pte_t pte) { return pte; } extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; -/* Macros to (de)construct the fake PTEs representing swap pages. */ -#define __swp_type(x) ((x).val & 0x7F) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * 0 <--------------------- offset ----------------> E <- type --> + * + * E is the exclusive marker that is not stored in swap entries. + */ +#define __swp_type(x) ((x).val & 0x3f) #define __swp_offset(x) (((x).val) >> 7) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) | ((offset) << 7)) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x3f) | \ + (((offset) << 7) & ~SUN3_PAGE_VALID)) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #endif /* !__ASSEMBLY__ */ #endif /* !_SUN3_PGTABLE_H */ From b5c88f21531c3457e3e202817556b68171484e00 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:11 +0100 Subject: [PATCH 216/505] microblaze/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. The shift by 2 when converting between PTE and arch-specific swap entry makes the swap PTE layout a little bit harder to decipher. While at it, drop the comment from paulus---copy-and-paste leftover from powerpc where we actually have _PAGE_HASHPTE---and mask the type in __swp_entry_to_pte() as well. 
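The shift by 2 can be illustrated with a userspace sketch (mock macros mirroring the ones in the diff below, not kernel code): the swap entry is simply the PTE value shifted right by two, so the two low PTE bits holding _PAGE_PRESENT stay clear and the conversion round-trips losslessly:

#include <assert.h>
#include <stdint.h>

#define mk_swp_entry(type, off)	(((type) & 0x1f) | ((uint32_t)(off) << 6))
#define entry_to_pte(e)		((e) << 2)	/* like __swp_entry_to_pte() */
#define pte_to_entry(p)		((p) >> 2)	/* like __pte_to_swp_entry() */

int main(void)
{
	uint32_t entry = mk_swp_entry(5, 0x1234);
	uint32_t pte = entry_to_pte(entry);

	assert((pte & 0x3) == 0);			/* _PAGE_PRESENT area stays zero */
	assert(pte_to_entry(pte) == entry);		/* lossless round trip */
	assert((pte_to_entry(pte) & 0x1f) == 5);	/* type recovered */
	assert((pte_to_entry(pte) >> 6) == 0x1234);	/* offset recovered */
	return 0;
}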
Link: https://lkml.kernel.org/r/20230113171026.582290-12-david@redhat.com Signed-off-by: David Hildenbrand Cc: Michal Simek Signed-off-by: Andrew Morton --- arch/m68k/include/asm/mcf_pgtable.h | 4 +-- arch/microblaze/include/asm/pgtable.h | 45 +++++++++++++++++++++------ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index 3f8f4d0e66dd..e573d7b649f7 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -46,8 +46,8 @@ #define _CACHEMASK040 (~0x060) #define _PAGE_GLOBAL040 0x400 /* 68040 global bit, used for kva descs */ -/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ -#define _PAGE_SWP_EXCLUSIVE 0x080 +/* We borrow bit 24 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE CF_PAGE_NOCACHE /* * Externally used page protection values. diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 42f5988e998b..7e3de54bf426 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -131,10 +131,10 @@ extern pte_t *va_to_pte(unsigned long address); * of the 16 available. Bit 24-26 of the TLB are cleared in the TLB * miss handler. Bit 27 is PAGE_USER, thus selecting the correct * zone. - * - PRESENT *must* be in the bottom two bits because swap cache - * entries use the top 30 bits. Because 4xx doesn't support SMP - * anyway, M is irrelevant so we borrow it for PAGE_PRESENT. Bit 30 - * is cleared in the TLB miss handler before the TLB entry is loaded. + * - PRESENT *must* be in the bottom two bits because swap PTEs use the top + * 30 bits. Because 4xx doesn't support SMP anyway, M is irrelevant so we + * borrow it for PAGE_PRESENT. Bit 30 is cleared in the TLB miss handler + * before the TLB entry is loaded. * - All other bits of the PTE are loaded into TLBLO without * * modification, leaving us only the bits 20, 21, 24, 25, 26, 30 for * software PTE bits. We actually use bits 21, 24, 25, and @@ -155,6 +155,9 @@ extern pte_t *va_to_pte(unsigned long address); #define _PAGE_ACCESSED 0x400 /* software: R: page referenced */ #define _PMD_PRESENT PAGE_MASK +/* We borrow bit 24 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_DIRTY + /* * Some bits are unused... */ @@ -393,18 +396,40 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* - * Encode and decode a swap entry. - * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit, or the _PAGE_HASHPTE bit - * (if used). -- paulus + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <------------------ offset -------------------> E < type -> 0 0 + * + * E is the exclusive marker that is not stored in swap entries. 
*/ -#define __swp_type(entry) ((entry).val & 0x3f) +#define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 6) #define __swp_entry(type, offset) \ - ((swp_entry_t) { (type) | ((offset) << 6) }) + ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 6) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 2 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 2 }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + extern unsigned long iopa(unsigned long addr); /* Values for nocacheflag and cmode */ From 83d3b2b46ea3f8fa542fb7528b3bca6f476d0fab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:12 +0100 Subject: [PATCH 217/505] mips/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. On 64bit, steal one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. On 32bit we're able to locate unused bits. As the PTE layout for 32 bit is very confusing, document it a bit better. While at it, mask the type in __swp_entry()/mk_swap_pte(). Link: https://lkml.kernel.org/r/20230113171026.582290-13-david@redhat.com Signed-off-by: David Hildenbrand Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgtable-32.h | 88 ++++++++++++++++++++++++++---- arch/mips/include/asm/pgtable-64.h | 23 ++++++-- arch/mips/include/asm/pgtable.h | 36 ++++++++++++ 3 files changed, 131 insertions(+), 16 deletions(-) diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index b40a0e69fccc..ba0016709a1a 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -191,49 +191,113 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #define pte_page(x) pfn_to_page(pte_pfn(x)) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + */ #if defined(CONFIG_CPU_R3K_TLB) -/* Swap entries must have VALID bit cleared. */ +/* + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------- offset ------------> < type -> V G E 0 0 0 0 0 0 P + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. + */ #define __swp_type(x) (((x).val >> 10) & 0x1f) #define __swp_offset(x) ((x).val >> 15) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 10) | ((offset) << 15) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x1f) << 10) | ((offset) << 15) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 7) + #else #if defined(CONFIG_XPA) -/* Swap entries must have VALID and GLOBAL bits cleared. 
*/ +/* + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * 0 0 0 0 0 0 E P <------------------ zeroes -------------------> + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------------> < type -> V G 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. + */ #define __swp_type(x) (((x).val >> 4) & 0x1f) #define __swp_offset(x) ((x).val >> 9) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 4) | ((offset) << 9) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x1f) << 4) | ((offset) << 9) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t) { 0, (x).val }) +/* + * We borrow bit 57 (bit 25 in the low PTE) to store the exclusive marker in + * swap PTEs. + */ +#define _PAGE_SWP_EXCLUSIVE (1 << 25) + #elif defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) -/* Swap entries must have VALID and GLOBAL bits cleared. */ +/* + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <------------------ zeroes -------------------> E P 0 0 0 0 0 0 + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------------- offset --------------------> < type -> V G + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. + */ #define __swp_type(x) (((x).val >> 2) & 0x1f) #define __swp_offset(x) ((x).val >> 7) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 2) | ((offset) << 7) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x1f) << 2) | ((offset) << 7) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t) { 0, (x).val }) +/* + * We borrow bit 39 (bit 7 in the low PTE) to store the exclusive marker in swap + * PTEs. + */ +#define _PAGE_SWP_EXCLUSIVE (1 << 7) + #else /* - * Constraints: - * _PAGE_PRESENT at bit 0 - * _PAGE_MODIFIED at bit 4 - * _PAGE_GLOBAL at bit 6 - * _PAGE_VALID at bit 7 + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------- offset --------------> < type -> 0 0 0 0 0 0 E P + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. The location of V and G varies. */ #define __swp_type(x) (((x).val >> 8) & 0x1f) #define __swp_offset(x) ((x).val >> 13) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 8) | ((offset) << 13) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 8) | ((offset) << 13) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* We borrow bit 1 to store the exclusive marker in swap PTEs. 
*/ +#define _PAGE_SWP_EXCLUSIVE (1 << 1) + #endif /* defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) */ #endif /* defined(CONFIG_CPU_R3K_TLB) */ diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index c6310192b654..98e24e3e7f2b 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -320,16 +320,31 @@ extern void pud_init(void *addr); extern void pmd_init(void *addr); /* - * Non-present pages: high 40 bits are offset, next 8 bits type, - * low 16 bits zero. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------------- offset --------------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------> E <-- type ---> <---------- zeroes -----------> + * + * E is the exclusive marker that is not stored in swap entries. */ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) -{ pte_t pte; pte_val(pte) = (type << 16) | (offset << 24); return pte; } +{ pte_t pte; pte_val(pte) = ((type & 0x7f) << 16) | (offset << 24); return pte; } -#define __swp_type(x) (((x).val >> 16) & 0xff) +#define __swp_type(x) (((x).val >> 16) & 0x7f) #define __swp_offset(x) ((x).val >> 24) #define __swp_entry(type, offset) ((swp_entry_t) { pte_val(mk_swap_pte((type), (offset))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* We borrow bit 23 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 23) + #endif /* _ASM_PGTABLE_64_H */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index a68c0b01d8cd..711874cee8e4 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -528,6 +528,42 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) } #endif +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +#if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte.pte_low & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte.pte_low |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte.pte_low &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} +#else +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} +#endif extern void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte); From 0a9ad8273ff4643dd50ff6b3ec991ab270e87a89 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:13 +0100 Subject: [PATCH 218/505] nios2/mm: refactor swap PTE layout nios2 disables swap for a good reason: it doesn't even provide sufficient type bits as required by core MM. However, swap entries are nowadays also used for other purposes (migration entries, PTE markers, HWPoison, ...), and accidential use could be problematic. Let's properly use 5 bits for the swap type and document the layout. 
Bits 26--31 should get ignored by hardware completely, so they can be used. Link: https://lkml.kernel.org/r/20230113171026.582290-14-david@redhat.com Signed-off-by: David Hildenbrand Cc: Dinh Nguyen Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgtable.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index ab793bc517f5..d1e5c9eb4643 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -232,19 +232,21 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) __FILE__, __LINE__, pgd_val(e)) /* - * Encode and decode a swap entry (must be !pte_none(pte) && !pte_present(pte): + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * - * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 ... 1 0 - * 0 0 0 0 type. 0 0 0 0 0 0 offset......... + * Format of swap PTEs: * - * This gives us up to 2**2 = 4 swap files and 2**20 * 4K = 4G per swap file. + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * 0 < type -> 0 0 0 0 0 0 <-------------- offset ---------------> * - * Note that the offset field is always non-zero, thus !pte_none(pte) is always - * true. + * Note that the offset field is always non-zero if the swap type is 0, thus + * !pte_none() is always true. */ -#define __swp_type(swp) (((swp).val >> 26) & 0x3) +#define __swp_type(swp) (((swp).val >> 26) & 0x1f) #define __swp_offset(swp) ((swp).val & 0xfffff) -#define __swp_entry(type, off) ((swp_entry_t) { (((type) & 0x3) << 26) \ +#define __swp_entry(type, off) ((swp_entry_t) { (((type) & 0x1f) << 26) \ | ((off) & 0xfffff) }) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) From 4d1d955f7c0cdbb7562d19049f859c74276fdfeb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:14 +0100 Subject: [PATCH 219/505] nios2/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using the yet-unused bit 31. Link: https://lkml.kernel.org/r/20230113171026.582290-15-david@redhat.com Signed-off-by: David Hildenbrand Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgtable-bits.h | 3 +++ arch/nios2/include/asm/pgtable.h | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/nios2/include/asm/pgtable-bits.h b/arch/nios2/include/asm/pgtable-bits.h index bfddff383e89..724f9b08b1d1 100644 --- a/arch/nios2/include/asm/pgtable-bits.h +++ b/arch/nios2/include/asm/pgtable-bits.h @@ -31,4 +31,7 @@ #define _PAGE_ACCESSED (1<<26) /* page referenced */ #define _PAGE_DIRTY (1<<27) /* dirty page */ +/* We borrow bit 31 to store the exclusive marker in swap PTEs. 
*/ +#define _PAGE_SWP_EXCLUSIVE (1<<31) + #endif /* _ASM_NIOS2_PGTABLE_BITS_H */ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index d1e5c9eb4643..05999da01731 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -239,7 +239,9 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * 0 < type -> 0 0 0 0 0 0 <-------------- offset ---------------> + * E < type -> 0 0 0 0 0 0 <-------------- offset ---------------> + * + * E is the exclusive marker that is not stored in swap entries. * * Note that the offset field is always non-zero if the swap type is 0, thus * !pte_none() is always true. @@ -251,6 +253,24 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + extern void __init paging_init(void); extern void __init mmu_init(void); From 5ae3e74474f82613482ed67b0c234f61ac6ca2dd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:15 +0100 Subject: [PATCH 220/505] openrisc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-16-david@redhat.com Signed-off-by: David Hildenbrand Cc: Stefan Kristiansson Cc: Stafford Horne Signed-off-by: Andrew Morton --- arch/openrisc/include/asm/pgtable.h | 41 +++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 6477c17b3062..903b32d662ab 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -154,6 +154,9 @@ extern void paging_init(void); #define _KERNPG_TABLE \ (_PAGE_BASE | _PAGE_SRE | _PAGE_SWE | _PAGE_ACCESSED | _PAGE_DIRTY) +/* We borrow bit 11 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_U_SHARED + #define PAGE_NONE __pgprot(_PAGE_ALL) #define PAGE_READONLY __pgprot(_PAGE_ALL | _PAGE_URE | _PAGE_SRE) #define PAGE_READONLY_X __pgprot(_PAGE_ALL | _PAGE_URE | _PAGE_SRE | _PAGE_EXEC) @@ -385,16 +388,44 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, /* __PHX__ FIXME, SWAP, this probably doesn't work */ -/* Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) */ -/* Since the PAGE_PRESENT bit is bit 4, we can use the bits above */ - -#define __swp_type(x) (((x).val >> 5) & 0x7f) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). 
+ * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <-------------- offset ---------------> E <- type --> 0 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + * The zero'ed bits include _PAGE_PRESENT. + */ +#define __swp_type(x) (((x).val >> 5) & 0x3f) #define __swp_offset(x) ((x).val >> 12) #define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 5) | ((offset) << 12) }) + ((swp_entry_t) { (((type) & 0x3f) << 5) | ((offset) << 12) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + typedef pte_t *pte_addr_t; #endif /* __ASSEMBLY__ */ From 6d239fc78c0b0c687e5408573350714e6e789d71 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:16 +0100 Subject: [PATCH 221/505] parisc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using the yet-unused _PAGE_ACCESSED location in the swap PTE. Looking at pte_present() and pte_none() checks, there seems to be no actual reason why we cannot use it: we only have to make sure we're not using _PAGE_PRESENT. Reusing this bit avoids having to steal one bit from the swap offset. Link: https://lkml.kernel.org/r/20230113171026.582290-17-david@redhat.com Signed-off-by: David Hildenbrand Cc: "James E.J. Bottomley" Cc: Helge Deller Signed-off-by: Andrew Morton --- arch/parisc/include/asm/pgtable.h | 41 ++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index ea357430aafe..3033bb88df34 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -218,6 +218,9 @@ extern void __update_cache(pte_t pte); #define _PAGE_KERNEL_RWX (_PAGE_KERNEL_EXEC | _PAGE_WRITE) #define _PAGE_KERNEL (_PAGE_KERNEL_RO | _PAGE_WRITE) +/* We borrow bit 23 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_ACCESSED + /* The pgd/pmd contains a ptr (in phys addr space); since all pgds/pmds * are page-aligned, we don't care about the PAGE_OFFSET bits, except * for a few meta-information bits, so we shift the address to be @@ -394,17 +397,49 @@ extern void paging_init (void); #define update_mmu_cache(vms,addr,ptep) __update_cache(*ptep) -/* Encode and de-code a swap entry */ - +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs (32bit): + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <---------------- offset -----------------> P E < type -> + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P) must be 0. + * + * For the 64bit version, the offset is extended by 32bit. 
+ */ #define __swp_type(x) ((x).val & 0x1f) #define __swp_offset(x) ( (((x).val >> 6) & 0x7) | \ (((x).val >> 8) & ~0x7) ) -#define __swp_entry(type, offset) ((swp_entry_t) { (type) | \ +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) & 0x1f) | \ ((offset & 0x7) << 6) | \ ((offset & ~0x7) << 8) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; From 8897ebff37fd34920d380cbfafbfb47804eb4009 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:17 +0100 Subject: [PATCH 222/505] powerpc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on 32bit book3s We already implemented support for 64bit book3s in commit bff9beaa2e80 ("powerpc/pgtable: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE for book3s") Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE also in 32bit by reusing yet unused LSB 2 / MSB 29. There seems to be no real reason why that bit cannot be used, and reusing it avoids having to steal one bit from the swap offset. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-18-david@redhat.com Signed-off-by: David Hildenbrand Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/32/pgtable.h | 38 +++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 75823f39e042..0ecb3a58f23f 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -42,6 +42,9 @@ #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) +/* We borrow the _PAGE_USER bit to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_USER + /* And here we include common definitions */ #define _PAGE_KERNEL_RO 0 @@ -363,17 +366,42 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* - * Encode and decode a swap entry. - * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit or the _PAGE_HASHPTE bit (if used). - * -- paulus + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs (32bit PTEs): + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <----------------- offset --------------------> < type -> E H P + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P) and __PAGE_HASHPTE (H) must be 0. + * + * For 64bit PTEs, the offset is extended by 32bit. 
*/ #define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t) { (type) | ((offset) << 5) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 5) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} static inline int pte_read(pte_t pte) { return 1; } From 2bba2ffbe0303552142af944b891967ccf69a63b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:18 +0100 Subject: [PATCH 223/505] powerpc/nohash/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on 32bit and 64bit. On 64bit, let's use MSB 56 (LSB 7), located right next to the page type. On 32bit, let's use LSB 2 to avoid stealing one bit from the swap offset. There seems to be no real reason why these bits cannot be used for swap PTEs. The important part is that _PAGE_PRESENT and _PAGE_HASHPTE remain 0. While at it, mask the type in __swp_entry() and remove _PAGE_BIT_SWAP_TYPE from pte-e500.h: while it was used in 64bit code it was ignored in 32bit code. Link: https://lkml.kernel.org/r/20230113171026.582290-19-david@redhat.com Signed-off-by: David Hildenbrand Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/nohash/32/pgtable.h | 22 +++++++++++++---- arch/powerpc/include/asm/nohash/32/pte-40x.h | 6 ++--- arch/powerpc/include/asm/nohash/32/pte-44x.h | 18 ++++---------- arch/powerpc/include/asm/nohash/32/pte-85xx.h | 4 ++-- arch/powerpc/include/asm/nohash/64/pgtable.h | 24 ++++++++++++++++--- arch/powerpc/include/asm/nohash/pgtable.h | 16 +++++++++++++ arch/powerpc/include/asm/nohash/pte-e500.h | 1 - 7 files changed, 63 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 70edad44dff6..fec56d965f00 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -360,18 +360,30 @@ static inline int pte_young(pte_t pte) #endif #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) + /* - * Encode and decode a swap entry. - * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit. - * -- paulus + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs (32bit PTEs): + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <------------------ offset -------------------> < type -> E 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + * + * For 64bit PTEs, the offset is extended by 32bit. 
*/ #define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t) { (type) | ((offset) << 5) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 5) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) +/* We borrow LSB 2 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x000004 + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index 2d3153cfc0d7..6fe46e754556 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -27,9 +27,9 @@ * of the 16 available. Bit 24-26 of the TLB are cleared in the TLB * miss handler. Bit 27 is PAGE_USER, thus selecting the correct * zone. - * - PRESENT *must* be in the bottom two bits because swap cache - * entries use the top 30 bits. Because 40x doesn't support SMP - * anyway, M is irrelevant so we borrow it for PAGE_PRESENT. Bit 30 + * - PRESENT *must* be in the bottom two bits because swap PTEs + * use the top 30 bits. Because 40x doesn't support SMP anyway, M is + * irrelevant so we borrow it for PAGE_PRESENT. Bit 30 * is cleared in the TLB miss handler before the TLB entry is loaded. * - All other bits of the PTE are loaded into TLBLO without * modification, leaving us only the bits 20, 21, 24, 25, 26, 30 for diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h index 78bc304f750e..b7ed13cee137 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-44x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h @@ -56,20 +56,10 @@ * above bits. Note that the bit values are CPU specific, not architecture * specific. * - * The kernel PTE entry holds an arch-dependent swp_entry structure under - * certain situations. In other words, in such situations some portion of - * the PTE bits are used as a swp_entry. In the PPC implementation, the - * 3-24th LSB are shared with swp_entry, however the 0-2nd three LSB still - * hold protection values. That means the three protection bits are - * reserved for both PTE and SWAP entry at the most significant three - * LSBs. - * - * There are three protection bits available for SWAP entry: - * _PAGE_PRESENT - * _PAGE_HASHPTE (if HW has) - * - * So those three bits have to be inside of 0-2nd LSB of PTE. - * + * The kernel PTE entry can be an ordinary PTE mapping a page or a special swap + * PTE. In case of a swap PTE, LSB 2-24 are used to store information regarding + * the swap entry. However LSB 0-1 still hold protection values, for example, + * to distinguish swap PTEs from ordinary PTEs, and must be used with care. */ #define _PAGE_PRESENT 0x00000001 /* S: PTE valid */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h b/arch/powerpc/include/asm/nohash/32/pte-85xx.h index 93fb8e11a3f1..16451df5ddb0 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h @@ -11,8 +11,8 @@ 32 33 34 35 36 ... 50 51 52 53 54 55 56 57 58 59 60 61 62 63 RPN...................... 0 0 U0 U1 U2 U3 UX SX UW SW UR SR - - PRESENT *must* be in the bottom three bits because swap cache - entries use the top 29 bits. + - PRESENT *must* be in the bottom two bits because swap PTEs use + the top 30 bits. 
*/ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 879e9a6e5a87..287e25864ffa 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -276,22 +276,40 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <-------------------------- offset ---------------------------- + * + * 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 6 6 6 6 + * 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 + * --------------> <----------- zero ------------> E < type -> 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ #define MAX_SWAPFILES_CHECK() do { \ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ } while (0) #define SWP_TYPE_BITS 5 -#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ +#define __swp_type(x) (((x).val >> 2) \ & ((1UL << SWP_TYPE_BITS) - 1)) #define __swp_offset(x) ((x).val >> PTE_RPN_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << _PAGE_BIT_SWAP_TYPE) \ + (((type) & 0x1f) << 2) \ | ((offset) << PTE_RPN_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) __pte((x).val) +/* We borrow MSB 56 (LSB 7) to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x80 + int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot); void unmap_kernel_page(unsigned long va); extern int __meminit vmemmap_create_mapping(unsigned long start, diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 69c3a050a3d8..5f4620940c2c 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -151,6 +151,22 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); } +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + /* Insert a PTE, top-level function is out of line. 
It uses an inline * low level function in the respective pgtable-* files */ diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index 0934e8965e4e..d8924cbd61e4 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -12,7 +12,6 @@ /* Architected bits */ #define _PAGE_PRESENT 0x000001 /* software: pte contains a translation */ #define _PAGE_SW1 0x000002 -#define _PAGE_BIT_SWAP_TYPE 2 #define _PAGE_BAP_SR 0x000004 #define _PAGE_BAP_UR 0x000008 #define _PAGE_BAP_SW 0x000010 From 51a1007d4113c632ec5229c685e2162b72d9746d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:19 +0100 Subject: [PATCH 224/505] riscv/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the offset. This reduces the maximum swap space per file: on 32bit to 16 GiB (was 32 GiB). Note that this bit does not conflict with swap PMDs and could also be used in swap PMD context later. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-20-david@redhat.com Signed-off-by: David Hildenbrand Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgtable-bits.h | 3 +++ arch/riscv/include/asm/pgtable.h | 29 ++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index b9e13a8fe2b7..f896708e8331 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -27,6 +27,9 @@ */ #define _PAGE_PROT_NONE _PAGE_GLOBAL +/* Used for swap PTEs only. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_ACCESSED + #define _PAGE_PFN_SHIFT 10 /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 4eba9a98d0e3..03a4728db039 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -724,16 +724,18 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Encode and decode a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). 
* * Format of swap PTE: * bit 0: _PAGE_PRESENT (zero) * bit 1 to 3: _PAGE_LEAF (zero) * bit 5: _PAGE_PROT_NONE (zero) - * bits 6 to 10: swap type - * bits 10 to XLEN-1: swap offset + * bit 6: exclusive marker + * bits 7 to 11: swap type + * bits 11 to XLEN-1: swap offset */ -#define __SWP_TYPE_SHIFT 6 +#define __SWP_TYPE_SHIFT 7 #define __SWP_TYPE_BITS 5 #define __SWP_TYPE_MASK ((1UL << __SWP_TYPE_BITS) - 1) #define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) @@ -744,11 +746,28 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) \ - { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) + { (((type) & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT) | \ + ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(swp) __pmd((swp).val) From cca10df1029373cda5904887544ca6fcbbd2bac7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:20 +0100 Subject: [PATCH 225/505] sh/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using bit 6 in the PTE, reducing the swap type in the !CONFIG_X2TLB case to 5 bits. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit is effectively unused. Interrestingly, the swap type in the !CONFIG_X2TLB case could currently overlap with the _PAGE_PRESENT bit, because there is a sneaky shift by 1 in __pte_to_swp_entry() and __swp_entry_to_pte(). Bit 0-7 in the architecture specific swap PTE would get shifted to bit 1-8 in the PTE. As generic MM uses 5 bits only, this didn't matter so far. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-21-david@redhat.com Signed-off-by: David Hildenbrand Cc: Yoshinori Sato Cc: Rich Felker Signed-off-by: Andrew Morton --- arch/sh/include/asm/pgtable_32.h | 54 +++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index d0240decacca..c34aa795a9d2 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -423,40 +423,70 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #endif /* - * Encode and de-code a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Constraints: * _PAGE_PRESENT at bit 8 * _PAGE_PROTNONE at bit 9 * - * For the normal case, we encode the swap type into bits 0:7 and the - * swap offset into bits 10:30. For the 64-bit PTE case, we keep the - * preserved bits in the low 32-bits and use the upper 32 as the swap - * offset (along with a 5-bit type), following the same approach as x86 - * PAE. This keeps the logic quite simple. 
+ * For the normal case, we encode the swap type and offset into the swap PTE + * such that bits 8 and 9 stay zero. For the 64-bit PTE case, we use the + * upper 32 for the swap offset and swap type, following the same approach as + * x86 PAE. This keeps the logic quite simple. * * As is evident by the Alpha code, if we ever get a 64-bit unsigned * long (swp_entry_t) to match up with the 64-bit PTEs, this all becomes * much cleaner.. - * - * NOTE: We should set ZEROs at the position of _PAGE_PRESENT - * and _PAGE_PROTNONE bits */ + #ifdef CONFIG_X2TLB +/* + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------- offset ----------------------> < type -> + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------------- zeroes --------------------> E 0 0 0 0 0 0 + */ #define __swp_type(x) ((x).val & 0x1f) #define __swp_offset(x) ((x).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t){ (type) | (offset) << 5}) +#define __swp_entry(type, offset) ((swp_entry_t){ ((type) & 0x1f) | (offset) << 5}) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val }) #else -#define __swp_type(x) ((x).val & 0xff) +/* + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------- offset ----------------> 0 0 0 0 E < type -> 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ +#define __swp_type(x) ((x).val & 0x1f) #define __swp_offset(x) ((x).val >> 10) -#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) <<10}) +#define __swp_entry(type, offset) ((swp_entry_t){((type) & 0x1f) | (offset) << 10}) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 1 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 1 }) #endif +/* In both cases, we borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_USER + +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte.pte_low & _PAGE_SWP_EXCLUSIVE; +} + +PTE_BIT_FUNC(low, swp_mkexclusive, |= _PAGE_SWP_EXCLUSIVE); +PTE_BIT_FUNC(low, swp_clear_exclusive, &= ~_PAGE_SWP_EXCLUSIVE); + #endif /* __ASSEMBLY__ */ #endif /* __ASM_SH_PGTABLE_32_H */ From e6b37d7f6f17db638275b6f8fd78714c047e3a57 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:21 +0100 Subject: [PATCH 226/505] sparc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on 32bit Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by reusing the SRMMU_DIRTY bit as that seems to be safe to reuse inside a swap PTE. This avoids having to steal one bit from the swap offset. While at it, relocate the swap PTE layout documentation and use the same style now used for most other archs. Note that the old documentation was wrong: we use 20 bit for the offset and the reserved bits were 8 instead of 7 bits in the ascii art. Link: https://lkml.kernel.org/r/20230113171026.582290-22-david@redhat.com Signed-off-by: David Hildenbrand Cc: "David S. 
Miller" Signed-off-by: Andrew Morton --- arch/sparc/include/asm/pgtable_32.h | 27 ++++++++++++++++++++++++++- arch/sparc/include/asm/pgtsrmmu.h | 14 +++----------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 5acc05b572e6..abf7a2601209 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -323,7 +323,16 @@ void srmmu_mapiorange(unsigned int bus, unsigned long xpa, unsigned long xva, unsigned int len); void srmmu_unmapiorange(unsigned long virt_addr, unsigned int len); -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <-------------- offset ---------------> < type -> E 0 0 0 0 0 0 + */ static inline unsigned long __swp_type(swp_entry_t entry) { return (entry.val >> SRMMU_SWP_TYPE_SHIFT) & SRMMU_SWP_TYPE_MASK; @@ -344,6 +353,22 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & SRMMU_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | SRMMU_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~SRMMU_SWP_EXCLUSIVE); +} + static inline unsigned long __get_phys (unsigned long addr) { diff --git a/arch/sparc/include/asm/pgtsrmmu.h b/arch/sparc/include/asm/pgtsrmmu.h index 6067925972d9..18e68d43f036 100644 --- a/arch/sparc/include/asm/pgtsrmmu.h +++ b/arch/sparc/include/asm/pgtsrmmu.h @@ -53,21 +53,13 @@ #define SRMMU_CHG_MASK (0xffffff00 | SRMMU_REF | SRMMU_DIRTY) -/* SRMMU swap entry encoding - * - * We use 5 bits for the type and 19 for the offset. This gives us - * 32 swapfiles of 4GB each. Encoding looks like: - * - * oooooooooooooooooootttttRRRRRRRR - * fedcba9876543210fedcba9876543210 - * - * The bottom 7 bits are reserved for protection and status bits, especially - * PRESENT. - */ +/* SRMMU swap entry encoding */ #define SRMMU_SWP_TYPE_MASK 0x1f #define SRMMU_SWP_TYPE_SHIFT 7 #define SRMMU_SWP_OFF_MASK 0xfffff #define SRMMU_SWP_OFF_SHIFT (SRMMU_SWP_TYPE_SHIFT + 5) +/* We borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define SRMMU_SWP_EXCLUSIVE SRMMU_DIRTY /* Some day I will implement true fine grained access bits for * user pages because the SRMMU gives us the capabilities to From adf8e329ff56a873c789f797a69a330fb8c4ff9e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:22 +0100 Subject: [PATCH 227/505] sparc/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE on 64bit Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the type. Generic MM currently only uses 5 bits for the type (MAX_SWAPFILES_SHIFT), so the stolen bit was effectively unused. While at it, mask the type in __swp_entry(). Link: https://lkml.kernel.org/r/20230113171026.582290-23-david@redhat.com Signed-off-by: David Hildenbrand Cc: "David S. 
Miller" Signed-off-by: Andrew Morton --- arch/sparc/include/asm/pgtable_64.h | 38 ++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 3bc9736bddb1..a1658eebd036 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -187,6 +187,9 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U #define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V +/* We borrow bit 20 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _AC(0x0000000000100000, UL) + #ifndef __ASSEMBLY__ pte_t mk_pte_io(unsigned long, pgprot_t, int, unsigned long); @@ -961,18 +964,47 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif -/* Encode and de-code a swap entry */ -#define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0xffUL) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------------- offset --------------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------------> E <-- type ---> <------- zeroes --------> + */ +#define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0x7fUL) #define __swp_offset(entry) ((entry).val >> (PAGE_SHIFT + 8UL)) #define __swp_entry(type, offset) \ ( (swp_entry_t) \ { \ - (((long)(type) << PAGE_SHIFT) | \ + ((((long)(type) & 0x7fUL) << PAGE_SHIFT) | \ ((long)(offset) << (PAGE_SHIFT + 8UL))) \ } ) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + int page_in_phys_avail(unsigned long paddr); /* From e2858d778e6832d8f58dc3b361f78bd9f03cd2dd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:23 +0100 Subject: [PATCH 228/505] um/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using bit 10, which is yet unused for swap PTEs. The pte_mkuptodate() is a bit weird in __pte_to_swp_entry() for a swap PTE ... but it only messes with bit 1 and 2 and there is a comment in set_pte(), so leave these bits alone. While at it, mask the type in __swp_entry(). 
Link: https://lkml.kernel.org/r/20230113171026.582290-24-david@redhat.com Signed-off-by: David Hildenbrand Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Signed-off-by: Andrew Morton --- arch/um/include/asm/pgtable.h | 37 +++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 4e3052f2671a..cedc5fd451ce 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -21,6 +21,9 @@ #define _PAGE_PROTNONE 0x010 /* if the user mapped it with PROT_NONE; pte_present gives true */ +/* We borrow bit 10 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x400 + #ifdef CONFIG_3_LEVEL_PGTABLES #include #else @@ -288,16 +291,46 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); #define update_mmu_cache(vma,address,ptep) do {} while (0) -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------- offset ----------------> E < type -> 0 0 0 1 0 + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_NEWPAGE (bit 1) is always set to 1 in set_pte(). + */ #define __swp_type(x) (((x).val >> 5) & 0x1f) #define __swp_offset(x) ((x).val >> 11) #define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 5) | ((offset) << 11) }) + ((swp_entry_t) { (((type) & 0x1f) << 5) | ((offset) << 11) }) #define __pte_to_swp_entry(pte) \ ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_get_bits(pte, _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_set_bits(pte, _PAGE_SWP_EXCLUSIVE); + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_clear_bits(pte, _PAGE_SWP_EXCLUSIVE); + return pte; +} + /* Clear a kernel PTE and flush it from the TLB */ #define kpte_clear_flush(ptep, vaddr) \ do { \ From 93c0eac40d4e3ce4d5a3c6e0dc74eceaf8f63e0d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:24 +0100 Subject: [PATCH 229/505] x86/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE also on 32bit Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE just like we already do on x86-64. After deciphering the PTE layout it becomes clear that there are still unused bits for 2-level and 3-level page tables that we should be able to use. Reusing a bit avoids stealing one bit from the swap offset. While at it, mask the type in __swp_entry(); use some helper definitions to make the macros easier to grasp. Link: https://lkml.kernel.org/r/20230113171026.582290-25-david@redhat.com Signed-off-by: David Hildenbrand Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable-2level.h | 26 +++++++++++++++++++++----- arch/x86/include/asm/pgtable-3level.h | 26 +++++++++++++++++++++++--- arch/x86/include/asm/pgtable.h | 2 -- 3 files changed, 44 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 60d0f9015317..e9482a11ac52 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -80,21 +80,37 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------------> 0 E <- type --> 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ #define SWP_TYPE_BITS 5 +#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1) +#define _SWP_TYPE_SHIFT (_PAGE_BIT_PRESENT + 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) -#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ - & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_type(x) (((x).val >> _SWP_TYPE_SHIFT) \ + & _SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (_PAGE_BIT_PRESENT + 1)) \ + (((type) & _SWP_TYPE_MASK) << _SWP_TYPE_SHIFT) \ | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE + /* No inverted PFNs on 2 level page tables */ static inline u64 protnone_mask(u64 val) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 967b135fa2c0..9e7c0b719c3c 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -145,8 +145,24 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } #endif -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * < type -> <---------------------- offset ---------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------------------------------------> 0 E 0 0 0 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. 
+ */ #define SWP_TYPE_BITS 5 +#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1) #define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) @@ -154,9 +170,10 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val) & ((1UL << SWP_TYPE_BITS) - 1)) +#define __swp_type(x) (((x).val) & _SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> SWP_TYPE_BITS) -#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << SWP_TYPE_BITS}) +#define __swp_entry(type, offset) ((swp_entry_t){((type) & _SWP_TYPE_MASK) \ + | (offset) << SWP_TYPE_BITS}) /* * Normally, __swp_entry() converts from arch-independent swp_entry_t to @@ -184,6 +201,9 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ __pteval_swp_offset(pte))) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE + #include #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1c843395a8b3..d25195726b78 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1299,7 +1299,6 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } -#ifdef _PAGE_SWP_EXCLUSIVE #define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { @@ -1315,7 +1314,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); } -#endif /* _PAGE_SWP_EXCLUSIVE */ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) From f5c3fe300c5b40ff9af5ce2c9dd9897e91ce5735 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:25 +0100 Subject: [PATCH 230/505] xtensa/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by using bit 1. This bit should be safe to use for our usecase. Most importantly, we can still distinguish swap PTEs from PAGE_NONE PTEs (see pte_present()) and don't use one of the two reserved attribute masks (1101 and 1111). Attribute mask 1100 and 1110 now identify swap PTEs. While at it, remove SWP_TYPE_BITS (not really helpful as it's not used in the actual swap macros) and mask the type in __swp_entry(). 
Link: https://lkml.kernel.org/r/20230113171026.582290-26-david@redhat.com Signed-off-by: David Hildenbrand Cc: Chris Zankel Cc: Max Filippov Signed-off-by: Andrew Morton --- arch/xtensa/include/asm/pgtable.h | 32 ++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 5b5484d707b2..1025e2dc292b 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -96,7 +96,7 @@ * +- - - - - - - - - - - - - - - - - - - - -+ * (PAGE_NONE)| PPN | 0 | 00 | ADW | 01 | 11 | 11 | * +-----------------------------------------+ - * swap | index | type | 01 | 11 | 00 | + * swap | index | type | 01 | 11 | e0 | * +-----------------------------------------+ * * For T1050 hardware and earlier the layout differs for present and (PAGE_NONE) @@ -112,6 +112,7 @@ * RI ring (0=privileged, 1=user, 2 and 3 are unused) * CA cache attribute: 00 bypass, 01 writeback, 10 writethrough * (11 is invalid and used to mark pages that are not present) + * e exclusive marker in swap PTEs * w page is writable (hw) * x page is executable (hw) * index swap offset / PAGE_SIZE (bit 11-31: 21 bits -> 8 GB) @@ -158,6 +159,9 @@ #define _PAGE_DIRTY (1<<7) /* software: page dirty */ #define _PAGE_ACCESSED (1<<8) /* software: page accessed (read) */ +/* We borrow bit 1 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<1) + #ifdef CONFIG_MMU #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -343,19 +347,37 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) } /* - * Encode and decode a swap and file entry. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). */ -#define SWP_TYPE_BITS 5 -#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(entry) (((entry).val >> 6) & 0x1f) #define __swp_offset(entry) ((entry).val >> 11) #define __swp_entry(type,offs) \ - ((swp_entry_t){((type) << 6) | ((offs) << 11) | \ + ((swp_entry_t){(((type) & 0x1f) << 6) | ((offs) << 11) | \ _PAGE_CA_INVALID | _PAGE_USER}) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #endif /* !defined (__ASSEMBLY__) */ From 950fe885a89770619e315f9b46301eebf0aab7b3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Jan 2023 18:10:26 +0100 Subject: [PATCH 231/505] mm: remove __HAVE_ARCH_PTE_SWP_EXCLUSIVE __HAVE_ARCH_PTE_SWP_EXCLUSIVE is now supported by all architectures that support swp PTEs, so let's drop it. 
Link: https://lkml.kernel.org/r/20230113171026.582290-27-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- arch/alpha/include/asm/pgtable.h | 1 - arch/arc/include/asm/pgtable-bits-arcv2.h | 1 - arch/arm/include/asm/pgtable.h | 1 - arch/arm64/include/asm/pgtable.h | 1 - arch/csky/include/asm/pgtable.h | 1 - arch/hexagon/include/asm/pgtable.h | 1 - arch/ia64/include/asm/pgtable.h | 1 - arch/loongarch/include/asm/pgtable.h | 1 - arch/m68k/include/asm/mcf_pgtable.h | 1 - arch/m68k/include/asm/motorola_pgtable.h | 1 - arch/m68k/include/asm/sun3_pgtable.h | 1 - arch/microblaze/include/asm/pgtable.h | 1 - arch/mips/include/asm/pgtable.h | 1 - arch/nios2/include/asm/pgtable.h | 1 - arch/openrisc/include/asm/pgtable.h | 1 - arch/parisc/include/asm/pgtable.h | 1 - arch/powerpc/include/asm/book3s/32/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - arch/powerpc/include/asm/nohash/pgtable.h | 1 - arch/riscv/include/asm/pgtable.h | 1 - arch/s390/include/asm/pgtable.h | 1 - arch/sh/include/asm/pgtable_32.h | 1 - arch/sparc/include/asm/pgtable_32.h | 1 - arch/sparc/include/asm/pgtable_64.h | 1 - arch/um/include/asm/pgtable.h | 1 - arch/x86/include/asm/pgtable.h | 1 - arch/xtensa/include/asm/pgtable.h | 1 - include/linux/pgtable.h | 29 -------------------- mm/debug_vm_pgtable.c | 2 -- mm/memory.c | 4 --- mm/rmap.c | 11 -------- 31 files changed, 73 deletions(-) diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 970abf511b13..ba43cb841d19 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -328,7 +328,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 611f412713b9..6e9f8ca6d6a1 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -132,7 +132,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 886c275995a2..2e626e6da9a3 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -298,7 +298,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(swp) __pte((swp).val) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_isset(pte, L_PTE_SWP_EXCLUSIVE); diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 65e78999c75d..575c63de894f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -417,7 +417,6 @@ static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT); } -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); diff --git 
a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 574c97b9ecca..d4042495febc 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -200,7 +200,6 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte; } -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 7eb008e477c8..59393613d086 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -397,7 +397,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) (((type & 0x1f) << 1) | \ ((offset & 0x3ffff8) << 10) | ((offset & 0x7) << 7)) }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index e4b8ab931399..21c97e31a28a 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -424,7 +424,6 @@ extern void paging_init (void); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index c6b8fe7ac43c..d28fb9dbec59 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -276,7 +276,6 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(x) ((pmd_t) { (x).val | _PAGE_HUGE }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index e573d7b649f7..13741c1245e1 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -275,7 +275,6 @@ extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) (__pte((x).val)) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index c1782563e793..ec0dc19ab834 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -190,7 +190,6 @@ extern pgd_t kernel_pg_dir[128]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index dbfc9703b15d..e582b0484a55 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -174,7 +174,6 @@ extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git 
a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 7e3de54bf426..d1b8272abcd9 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -412,7 +412,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 2 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 2 }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 711874cee8e4..791389bf3c12 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -528,7 +528,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) } #endif -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) static inline int pte_swp_exclusive(pte_t pte) { diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 05999da01731..0f5c2564e9f5 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -253,7 +253,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 903b32d662ab..3eb9b9555d0d 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -408,7 +408,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 3033bb88df34..e2950f5db7c9 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -422,7 +422,6 @@ extern void paging_init (void); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 0ecb3a58f23f..7bf1fe7297c6 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -386,7 +386,6 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index cb4c67bf45d7..4acc9690f599 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -717,7 +717,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline 
pte_t pte_swp_mkexclusive(pte_t pte) { return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_EXCLUSIVE)); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 5f4620940c2c..a6caaaab6f92 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -151,7 +151,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); } -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 03a4728db039..5b9f409a940d 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -752,7 +752,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b26cbf1c533c..2b5db99e31dd 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -812,7 +812,6 @@ static inline int pmd_protnone(pmd_t pmd) } #endif -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index c34aa795a9d2..21952b094650 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -479,7 +479,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) /* In both cases, we borrow bit 6 to store the exclusive marker in swap PTEs. 
*/ #define _PAGE_SWP_EXCLUSIVE _PAGE_USER -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte.pte_low & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index abf7a2601209..d4330e3c57a6 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -353,7 +353,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & SRMMU_SWP_EXCLUSIVE; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index a1658eebd036..2dc8d4641734 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -989,7 +989,6 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index cedc5fd451ce..a70d1618eb35 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -313,7 +313,6 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_get_bits(pte, _PAGE_SWP_EXCLUSIVE); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d25195726b78..7425f32e5293 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1299,7 +1299,6 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 1025e2dc292b..fc7a14884c6c 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -360,7 +360,6 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 1159b25b0542..5fd45454c073 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1064,35 +1064,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif -/* - * When replacing an anonymous page by a real (!non) swap entry, we clear - * PG_anon_exclusive from the page and instead remember whether the flag was - * set in the swp pte. During fork(), we have to mark the entry as !exclusive - * (possibly shared). 
On swapin, we use that information to restore - * PG_anon_exclusive, which is very helpful in cases where we might have - * additional (e.g., FOLL_GET) references on a page and wouldn't be able to - * detect exclusivity. - * - * These functions don't apply to non-swap entries (e.g., migration, hwpoison, - * ...). - */ -#ifndef __HAVE_ARCH_PTE_SWP_EXCLUSIVE -static inline pte_t pte_swp_mkexclusive(pte_t pte) -{ - return pte; -} - -static inline int pte_swp_exclusive(pte_t pte) -{ - return false; -} - -static inline pte_t pte_swp_clear_exclusive(pte_t pte) -{ - return pte; -} -#endif - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index ff8d6f6af896..af59cc7bd307 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -810,7 +810,6 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { -#ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE unsigned long max_swap_offset; swp_entry_t entry, entry2; pte_t pte; @@ -841,7 +840,6 @@ static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) WARN_ON(!is_swap_pte(pte)); entry2 = pte_to_swp_entry(pte); WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); -#endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */ } static void __init pte_swap_tests(struct pgtable_debug_args *args) diff --git a/mm/memory.c b/mm/memory.c index c6bacd58d032..87b33b4967c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3864,10 +3864,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { - /* - * Note that pte_swp_exclusive() == false for architectures - * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. - */ exclusive = pte_swp_exclusive(vmf->orig_pte); if (folio != swapcache) { /* diff --git a/mm/rmap.c b/mm/rmap.c index 073999f78adf..0d07c500fc86 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1710,17 +1710,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } - /* - * Note: We *don't* remember if the page was mapped - * exclusively in the swap pte if the architecture - * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In - * that case, swapin code has to re-determine that - * manually and might detect the page as possibly - * shared, for example, if there are other references on - * the page or if the page is under writeback. We made - * sure that there are no GUP pins on the page that - * would rely on it, so for GUP pins this is fine. - */ if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) From 6189eb82f0aec8a877190bf52e629c687ed02773 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 13 Jan 2023 15:42:53 +0000 Subject: [PATCH 232/505] mm/page_ext: do not allocate space for page_ext->flags if not needed There is 8 byte page_ext->flags field allocated per page whenever CONFIG_PAGE_EXTENSION is enabled. However, not every user of page_ext uses flags. Therefore, check whether flags is needed at least by one user and if so allocate space for it. 
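Condensed sketch of the resulting sizing logic in invoke_need_callbacks() (the full hunk is in the mm/page_ext.c diff below; page_ext_size now starts at 0 rather than sizeof(struct page_ext)):

	/* First pass: reserve the shared flags word only if some enabled
	 * client actually needs it. */
	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need() &&
		    page_ext_ops[i]->need_shared_flags) {
			page_ext_size = sizeof(struct page_ext);
			break;
		}
	}

	/* Second pass: lay out each enabled client's data after whatever
	 * the first pass reserved. */
	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need()) {
			page_ext_ops[i]->offset = page_ext_size;
			page_ext_size += page_ext_ops[i]->size;
			need = true;
		}
	}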
For example when page_table_check is enabled, on a machine with 128G of memory before the fix: [ 2.244288] allocated 536870912 bytes of page_ext after the fix: [ 2.160154] allocated 268435456 bytes of page_ext Also, add a kernel-doc comment before page_ext_operations that describes the fields, and remove check if need() is set, as that is now a required field. [pasha.tatashin@soleen.com: address comments from Mike Rapoport] Link: https://lkml.kernel.org/r/20230117202103.1412449-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20230113154253.92480-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: David Rientjes Reviewed-by: Mike Rapoport (IBM) Cc: Charan Teja Kalla Cc: Li Zhe Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 18 ++++++++++++++++++ mm/page_ext.c | 14 ++++++++++++-- mm/page_owner.c | 1 + mm/page_table_check.c | 1 + 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 22be4582faae..67314f648aeb 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -7,15 +7,33 @@ #include struct pglist_data; + +/** + * struct page_ext_operations - per page_ext client operations + * @offset: Offset to the client's data within page_ext. Offset is returned to + * the client by page_ext_init. + * @size: The size of the client data within page_ext. + * @need: Function that returns true if client requires page_ext. + * @init: (optional) Called to initialize client once page_exts are allocated. + * @need_shared_flags: True when client is using shared page_ext->flags + * field. + * + * Each Page Extension client must define page_ext_operations in + * page_ext_ops array. + */ struct page_ext_operations { size_t offset; size_t size; bool (*need)(void); void (*init)(void); + bool need_shared_flags; }; #ifdef CONFIG_PAGE_EXTENSION +/* + * The page_ext_flags users must set need_shared_flags to true. 
+ */ enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, diff --git a/mm/page_ext.c b/mm/page_ext.c index 4ee522fd381c..e2c22ffdbb81 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -71,6 +71,7 @@ static bool need_page_idle(void) } static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, + .need_shared_flags = true, }; #endif @@ -86,7 +87,7 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { #endif }; -unsigned long page_ext_size = sizeof(struct page_ext); +unsigned long page_ext_size; static unsigned long total_usage; static struct page_ext *lookup_page_ext(const struct page *page); @@ -106,7 +107,16 @@ static bool __init invoke_need_callbacks(void) bool need = false; for (i = 0; i < entries; i++) { - if (page_ext_ops[i]->need && page_ext_ops[i]->need()) { + if (page_ext_ops[i]->need()) { + if (page_ext_ops[i]->need_shared_flags) { + page_ext_size = sizeof(struct page_ext); + break; + } + } + } + + for (i = 0; i < entries; i++) { + if (page_ext_ops[i]->need()) { page_ext_ops[i]->offset = page_ext_size; page_ext_size += page_ext_ops[i]->size; need = true; diff --git a/mm/page_owner.c b/mm/page_owner.c index 2d27f532df4c..f0553bedb39d 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -99,6 +99,7 @@ struct page_ext_operations page_owner_ops = { .size = sizeof(struct page_owner), .need = need_page_owner, .init = init_page_owner, + .need_shared_flags = true, }; static inline struct page_owner *get_page_owner(struct page_ext *page_ext) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 93e633c1d587..25d8610c0042 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -45,6 +45,7 @@ struct page_ext_operations page_table_check_ops = { .size = sizeof(struct page_table_check), .need = need_page_table_check, .init = init_page_table_check, + .need_shared_flags = false, }; static struct page_table_check *get_page_table_check(struct page_ext *page_ext) From 524c48072e5673f4511f1ad81493e2485863fd65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:12 +0000 Subject: [PATCH 233/505] mm/page_alloc: rename ALLOC_HIGH to ALLOC_MIN_RESERVE Patch series "Discard __GFP_ATOMIC", v3. Neil's patch has been residing in mm-unstable as commit 2fafb4fe8f7a ("mm: discard __GFP_ATOMIC") for a long time and recently brought up again. Most recently, I was worried that __GFP_HIGH allocations could use high-order atomic reserves which is unintentional but there was no response so lets revisit -- this series reworks how min reserves are used, protects highorder reserves and then finishes with Neil's patch with very minor modifications so it fits on top. There was a review discussion on renaming __GFP_DIRECT_RECLAIM to __GFP_ALLOW_BLOCKING but I didn't think it was that big an issue and is orthogonal to the removal of __GFP_ATOMIC. There were some concerns about how the gfp flags affect the min reserves but it never reached a solid conclusion so I made my own attempt. The series tries to iron out some of the details on how reserves are used. ALLOC_HIGH becomes ALLOC_MIN_RESERVE and ALLOC_HARDER becomes ALLOC_NON_BLOCK and documents how the reserves are affected. For example, ALLOC_NON_BLOCK (no direct reclaim) on its own allows 25% of the min reserve. ALLOC_MIN_RESERVE (__GFP_HIGH) allows 50% and both combined allows deeper access again. ALLOC_OOM allows access to 75%. 
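For illustration, a minimal userspace model of how those percentages come about, mirroring the form __zone_watermark_ok() takes by the end of the series; the flag values and the sample watermark here are arbitrary placeholders, not the kernel's definitions:

  #include <stdio.h>

  #define ALLOC_MIN_RESERVE 0x20  /* __GFP_HIGH */
  #define ALLOC_NON_BLOCK   0x10  /* caller cannot enter direct reclaim */
  #define ALLOC_OOM         0x04  /* illustrative value only */

  /* Scale the min watermark the way the reworked watermark check does. */
  static long scaled_min(long min, unsigned int alloc_flags)
  {
          if (alloc_flags & ALLOC_MIN_RESERVE) {
                  min -= min / 2;                 /* 50% of the reserve */
                  if (alloc_flags & ALLOC_NON_BLOCK)
                          min -= min / 4;         /* 62.5% in total */
          }
          if (alloc_flags & ALLOC_OOM)
                  min -= min / 2;                 /* up to 75% for OOM victims */
          return min;
  }

  int main(void)
  {
          long min = 1000;        /* arbitrary sample watermark, in pages */

          printf("__GFP_HIGH:        %ld\n", scaled_min(min, ALLOC_MIN_RESERVE));
          printf("GFP_ATOMIC-style:  %ld\n", scaled_min(min, ALLOC_MIN_RESERVE | ALLOC_NON_BLOCK));
          printf("OOM victim + high: %ld\n", scaled_min(min, ALLOC_MIN_RESERVE | ALLOC_OOM));
          return 0;
  }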
High-order atomic allocations are explicitly handled with the caveat that no __GFP_ATOMIC flag means that any high-order allocation that specifies GFP_HIGH and cannot enter direct reclaim will be treated as if it was GFP_ATOMIC. This patch (of 6): __GFP_HIGH aliases to ALLOC_HIGH but the name does not really hint what it means. As ALLOC_HIGH is internal to the allocator, rename it to ALLOC_MIN_RESERVE to document that the min reserves can be depleted. Link: https://lkml.kernel.org/r/20230113111217.14134-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20230113111217.14134-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 4 +++- mm/page_alloc.c | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 973b48e8b1af..99eb544fbded 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -779,7 +779,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #endif #define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% + * of the min watermark. + */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83be3b571fd0..4c1f1d487c3e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3994,7 +3994,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); - if (alloc_flags & ALLOC_HIGH) + if (alloc_flags & ALLOC_MIN_RESERVE) min -= min / 2; if (unlikely(alloc_harder)) { @@ -4836,18 +4836,18 @@ gfp_to_alloc_flags(gfp_t gfp_mask) unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* - * __GFP_HIGH is assumed to be the same as ALLOC_HIGH + * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD * to save two branches. */ - BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE); BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); /* * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); From c988dcbecf3fd5430921eaa3fe9054754f76d185 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:13 +0000 Subject: [PATCH 234/505] mm/page_alloc: treat RT tasks similar to __GFP_HIGH RT tasks are allowed to dip below the min reserve but ALLOC_HARDER is typically combined with ALLOC_MIN_RESERVE so RT tasks are a little unusual. While there is some justification for allowing RT tasks access to memory reserves, there is a strong chance that a RT task that is also under memory pressure is at risk of missing deadlines anyway. Relax how much reserves an RT task can access by treating it the same as __GFP_HIGH allocations. 
Note that in a future kernel release that the RT special casing will be removed. Hard realtime tasks should be locking down resources in advance and ensuring enough memory is available. Even a soft-realtime task like audio or video live decoding which cannot jitter should be allocating both memory and any disk space required up-front before the recording starts instead of relying on reserves. At best, reserve access will only delay the problem by a very short interval. Link: https://lkml.kernel.org/r/20230113111217.14134-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4c1f1d487c3e..4f5c2e83cb7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4865,7 +4865,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) */ alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && in_task()) - alloc_flags |= ALLOC_HARDER; + alloc_flags |= ALLOC_MIN_RESERVE; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); From eb2e2b425c6984ca8034448a3f2c680622bd3d4d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:14 +0000 Subject: [PATCH 235/505] mm/page_alloc: explicitly record high-order atomic allocations in alloc_flags A high-order ALLOC_HARDER allocation is assumed to be atomic. While that is accurate, it changes later in the series. In preparation, explicitly record high-order atomic allocations in gfp_to_alloc_flags(). Link: https://lkml.kernel.org/r/20230113111217.14134-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 1 + mm/page_alloc.c | 29 +++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 99eb544fbded..62428467d7cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -789,6 +789,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #else #define ALLOC_NOFRAGMENT 0x0 #endif +#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ enum ttu_flags; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f5c2e83cb7b..8a7e1cfa8353 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3724,10 +3724,20 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, * reserved for high-order atomic allocation, so order-0 * request should skip it. */ - if (order > 0 && alloc_flags & ALLOC_HARDER) + if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { page = __rmqueue(zone, order, migratetype, alloc_flags); + + /* + * If the allocation fails, allow OOM handling access + * to HIGHATOMIC reserves as failing now is worse than + * failing a high-order atomic allocation in the + * future. 
+ */ + if (!page && (alloc_flags & ALLOC_OOM)) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { spin_unlock_irqrestore(&zone->lock, flags); return NULL; @@ -4041,8 +4051,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif - if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) + if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && + !free_area_empty(area, MIGRATE_HIGHATOMIC)) { return true; + } } return false; } @@ -4304,7 +4316,7 @@ try_this_zone: * If this is a high-order atomic allocation then check * if the pageblock should be reserved for the future */ - if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) reserve_highatomic_pageblock(page, zone, order); return page; @@ -4831,7 +4843,7 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, } static inline unsigned int -gfp_to_alloc_flags(gfp_t gfp_mask) +gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) { unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; @@ -4857,8 +4869,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (!(gfp_mask & __GFP_NOMEMALLOC)) { alloc_flags |= ALLOC_HARDER; + + if (order > 0) + alloc_flags |= ALLOC_HIGHATOMIC; + } + /* * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the * comment for __cpuset_node_allowed(). @@ -5066,7 +5083,7 @@ restart: * kswapd needs to be woken up, and to avoid the cost of setting up * alloc_flags precisely. So we do that now. */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); + alloc_flags = gfp_to_alloc_flags(gfp_mask, order); /* * We need to recalculate the starting point for the zonelist iterator From ab3508854353793cd35e348fde89a5c09b2fd8b5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:15 +0000 Subject: [PATCH 236/505] mm/page_alloc: explicitly define what alloc flags deplete min reserves As there are more ALLOC_ flags that affect reserves, define what flags affect reserves and clarify the effect of each flag. Link: https://lkml.kernel.org/r/20230113111217.14134-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 3 +++ mm/page_alloc.c | 36 +++++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 62428467d7cf..f7c3bc80c475 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -792,6 +792,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ +/* Flags that allow allocations below the min watermark. 
*/ +#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) + enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8a7e1cfa8353..ffe2151d11ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3967,15 +3967,14 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); long unusable_free = (1 << order) - 1; /* - * If the caller does not have rights to ALLOC_HARDER then subtract - * the high-atomic reserves. This will over-estimate the size of the - * atomic reserve but it avoids a search. + * If the caller does not have rights to reserves below the min + * watermark then subtract the high-atomic reserves. This will + * over-estimate the size of the atomic reserve but it avoids a search. */ - if (likely(!alloc_harder)) + if (likely(!(alloc_flags & ALLOC_RESERVES))) unusable_free += z->nr_reserved_highatomic; #ifdef CONFIG_CMA @@ -3999,25 +3998,36 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, { long min = mark; int o; - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); - if (alloc_flags & ALLOC_MIN_RESERVE) - min -= min / 2; - - if (unlikely(alloc_harder)) { + if (unlikely(alloc_flags & ALLOC_RESERVES)) { /* - * OOM victims can try even harder than normal ALLOC_HARDER + * __GFP_HIGH allows access to 50% of the min reserve as well + * as OOM. + */ + if (alloc_flags & ALLOC_MIN_RESERVE) + min -= min / 2; + + /* + * Non-blocking allocations can access some of the reserve + * with more access if also __GFP_HIGH. The reasoning is that + * a non-blocking caller may incur a more severe penalty + * if it cannot get memory quickly, particularly if it's + * also __GFP_HIGH. + */ + if (alloc_flags & ALLOC_HARDER) + min -= min / 4; + + /* + * OOM victims can try even harder than the normal reserve * users on the grounds that it's definitely going to be in * the exit path shortly and free memory. Any allocation it * makes during the free path will be small and short-lived. */ if (alloc_flags & ALLOC_OOM) min -= min / 2; - else - min -= min / 4; } /* From 1ebbb21811b76c3b932959787f37985af36f62fa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:16 +0000 Subject: [PATCH 237/505] mm/page_alloc: explicitly define how __GFP_HIGH non-blocking allocations accesses reserves GFP_ATOMIC allocations get flagged ALLOC_HARDER which is a vague description. In preparation for the removal of GFP_ATOMIC redefine __GFP_ATOMIC to simply mean non-blocking and renaming ALLOC_HARDER to ALLOC_NON_BLOCK accordingly. __GFP_HIGH is required for access to reserves but non-blocking is granted more access. For example, GFP_NOWAIT is non-blocking but has no special access to reserves. A __GFP_NOFAIL blocking allocation is granted access similar to __GFP_HIGH if the only alternative is an OOM kill. 
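Put differently (a sketch of the resulting policy, assuming the flag definitions introduced in this series), the reserve access a caller ends up with depends only on whether it can block and whether it passed __GFP_HIGH:

  #include <stdio.h>
  #include <stdbool.h>

  /*
   * Simplified model of gfp_to_alloc_flags() after this patch; the strings
   * describe how far below the min watermark the caller may allocate.
   */
  static const char *reserve_access(bool gfp_high, bool can_block)
  {
          if (gfp_high && !can_block)
                  return "ALLOC_MIN_RESERVE|ALLOC_NON_BLOCK: ~62.5% of the min reserve";
          if (gfp_high)
                  return "ALLOC_MIN_RESERVE: 50% of the min reserve";
          if (!can_block)
                  return "ALLOC_NON_BLOCK only: no extra reserve access";
          return "no reserve access";
  }

  int main(void)
  {
          printf("GFP_ATOMIC:              %s\n", reserve_access(true, false));
          printf("GFP_KERNEL | __GFP_HIGH: %s\n", reserve_access(true, true));
          printf("GFP_NOWAIT:              %s\n", reserve_access(false, false));
          printf("GFP_KERNEL:              %s\n", reserve_access(false, true));
          return 0;
  }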
Link: https://lkml.kernel.org/r/20230113111217.14134-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 7 +++++-- mm/page_alloc.c | 44 ++++++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f7c3bc80c475..b0b88a95347f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -778,7 +778,10 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif -#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access + * to 25% of the min watermark or + * 62.5% if __GFP_HIGH is set. + */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. */ @@ -793,7 +796,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ -#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) +#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ffe2151d11ee..18ca33a1945d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4007,18 +4007,19 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * __GFP_HIGH allows access to 50% of the min reserve as well * as OOM. */ - if (alloc_flags & ALLOC_MIN_RESERVE) + if (alloc_flags & ALLOC_MIN_RESERVE) { min -= min / 2; - /* - * Non-blocking allocations can access some of the reserve - * with more access if also __GFP_HIGH. The reasoning is that - * a non-blocking caller may incur a more severe penalty - * if it cannot get memory quickly, particularly if it's - * also __GFP_HIGH. - */ - if (alloc_flags & ALLOC_HARDER) - min -= min / 4; + /* + * Non-blocking allocations (e.g. GFP_ATOMIC) can + * access more reserves than just __GFP_HIGH. Other + * non-blocking allocations requests such as GFP_NOWAIT + * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get + * access to the min reserve. + */ + if (alloc_flags & ALLOC_NON_BLOCK) + min -= min / 4; + } /* * OOM victims can try even harder than the normal reserve @@ -4869,28 +4870,30 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH). + * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { - alloc_flags |= ALLOC_HARDER; + alloc_flags |= ALLOC_NON_BLOCK; if (order > 0) alloc_flags |= ALLOC_HIGHATOMIC; } /* - * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the - * comment for __cpuset_node_allowed(). + * Ignore cpuset mems for non-blocking __GFP_HIGH (probably + * GFP_ATOMIC) rather than fail, see the comment for + * __cpuset_node_allowed(). 
*/ - alloc_flags &= ~ALLOC_CPUSET; + if (alloc_flags & ALLOC_MIN_RESERVE) + alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && in_task()) alloc_flags |= ALLOC_MIN_RESERVE; @@ -5321,12 +5324,13 @@ nopage: WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* - * Help non-failing allocations by giving them access to memory - * reserves but do not use ALLOC_NO_WATERMARKS because this + * Help non-failing allocations by giving some access to memory + * reserves normally used for high priority non-blocking + * allocations but do not use ALLOC_NO_WATERMARKS because this * could deplete whole memory reserves which would just make - * the situation worse + * the situation worse. */ - page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); if (page) goto got_pg; From 2973d8229b78d3f148e0c45916a1e8b237dc6167 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 13 Jan 2023 11:12:17 +0000 Subject: [PATCH 238/505] mm: discard __GFP_ATOMIC __GFP_ATOMIC serves little purpose. Its main effect is to set ALLOC_HARDER which adds a few little boosts to increase the chance of an allocation succeeding, one of which is to lower the water-mark at which it will succeed. It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH which also adjusts this watermark. It is probable that other users of __GFP_HIGH should benefit from the other little bonuses that __GFP_ATOMIC gets. __GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM. There is little point to this. We already get a might_sleep() warning if __GFP_DIRECT_RECLAIM is set. __GFP_ATOMIC allows the "watermark_boost" to be side-stepped. It is probable that testing ALLOC_HARDER is a better fit here. __GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might sleep. This should test __GFP_DIRECT_RECLAIM instead. This patch: - removes __GFP_ATOMIC - allows __GFP_HIGH allocations to ignore watermark boosting as well as GFP_ATOMIC requests. - makes other adjustments as suggested by the above. The net result is not change to GFP_ATOMIC allocations. Other allocations that use __GFP_HIGH will benefit from a few different extra privileges. This affects: xen, dm, md, ntfs3 the vermillion frame buffer hibernation ksm swap all of which likely produce more benefit than cost if these selected allocation are more likely to succeed quickly. 
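For driver code that used to test __GFP_ATOMIC to decide whether an allocation may sleep, the replacement is to test for __GFP_DIRECT_RECLAIM via gfpflags_allow_blocking(), as the tegra-smmu hunk below does. A hedged sketch; the helper name and locking scheme are invented for illustration:

  #include <linux/gfp.h>
  #include <linux/spinlock.h>

  /* Sketch only: drop the lock around the allocation iff the mask allows blocking. */
  static struct page *my_alloc_table_page(gfp_t gfp, spinlock_t *lock,
                                          unsigned long *flags)
  {
          struct page *page;

          if (gfpflags_allow_blocking(gfp))       /* __GFP_DIRECT_RECLAIM set */
                  spin_unlock_irqrestore(lock, *flags);

          page = alloc_page(gfp | __GFP_ZERO);

          if (gfpflags_allow_blocking(gfp))
                  spin_lock_irqsave(lock, *flags);

          return page;
  }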
[mgorman: Minor adjustments to rework on top of a series] Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name Link: https://lkml.kernel.org/r/20230113111217.14134-7-mgorman@techsingularity.net Signed-off-by: NeilBrown Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: Thierry Reding Signed-off-by: Andrew Morton --- Documentation/mm/balance.rst | 2 +- drivers/iommu/tegra-smmu.c | 4 ++-- include/linux/gfp_types.h | 12 ++++-------- include/trace/events/mmflags.h | 1 - lib/test_printf.c | 8 ++++---- mm/internal.h | 2 +- mm/page_alloc.c | 13 +++---------- tools/perf/builtin-kmem.c | 1 - 8 files changed, 15 insertions(+), 28 deletions(-) diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst index 6a1fadf3e173..e38e9d83c1c7 100644 --- a/Documentation/mm/balance.rst +++ b/Documentation/mm/balance.rst @@ -6,7 +6,7 @@ Memory Balancing Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as well as for non __GFP_IO allocations. The first reason why a caller may avoid reclaim is that the caller can not diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 5b1af40221ec..af8d0e685260 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, * allocate page in a sleeping context if GFP flags permit. Hence * spinlock needs to be unlocked and re-locked after allocation. */ - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index d88c46ca82e1..5088637fe5c2 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. 
*/ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 412b5a46374c..9db52bc4ce19 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/lib/test_printf.c b/lib/test_printf.c index d34dc636b81c..46b4e6c414a3 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -674,17 +674,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/mm/internal.h b/mm/internal.h index b0b88a95347f..2d09a7a0600a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 18ca33a1945d..0cfad30fb44c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4105,13 +4105,14 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages)) return true; + /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for __GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. 
*/ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -5076,14 +5077,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - restart: compaction_retries = 0; no_progress_loops = 0; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 8ae0a1535293..f3029742b800 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -653,7 +653,6 @@ static const struct { { "__GFP_HIGHMEM", "HM" }, { "GFP_DMA32", "D32" }, { "__GFP_HIGH", "H" }, - { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, { "__GFP_NOWARN", "NWR" }, From 2cf1338454a8a9a0b3c1271ccb521afa2d6ae241 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Fri, 13 Jan 2023 11:30:11 +0900 Subject: [PATCH 239/505] mm: fix khugepaged with shmem_enabled=advise Pass vm_flags as a parameter to shmem_is_huge, rather than reading the flags from the vm_area_struct in question. This allows the updated flags from hugepage_madvise to be passed to the check, which is necessary because madvise does not update the vm_area_struct's flags until after hugepage_madvise returns. This fixes an issue when shmem_enabled=madvise, where MADV_HUGEPAGE on shmem was not able to register the mm_struct with khugepaged. Prior to cd89fb065099, the mm_struct was registered by MADV_HUGEPAGE regardless of the value of shmem_enabled (which was only checked when scanning vmas). 
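A sketch of the resulting calling convention (the wrapper name here is hypothetical; the shmem_is_huge() signature is the one introduced by this patch): a caller that already holds updated flags, such as hugepage_madvise() before the VMA is written back, passes them explicitly rather than letting shmem_is_huge() re-read the not-yet-updated vma->vm_flags.

  #include <linux/mm.h>
  #include <linux/shmem_fs.h>

  /* Hypothetical helper showing the new parameter order. */
  static bool shmem_vma_allows_huge(struct vm_area_struct *vma,
                                    unsigned long updated_vm_flags)
  {
          return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
                               false /* shmem_huge_force */,
                               vma->vm_mm, updated_vm_flags);
  }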
Link: https://lkml.kernel.org/r/20230113023011.1784015-1-stevensd@google.com Fixes: cd89fb065099 ("mm,thp,shmem: make khugepaged obey tmpfs mount flags") Signed-off-by: David Stevens Cc: David Stevens Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 10 ++-------- mm/huge_memory.c | 3 ++- mm/shmem.c | 18 +++++++++--------- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d500ea967dc7..d09d54be4ffd 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -92,14 +92,8 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); -extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force); -static inline bool shmem_huge_enabled(struct vm_area_struct *vma, - bool shmem_huge_force) -{ - return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, - shmem_huge_force); -} +extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 868fcccdff72..1d6977dc6b31 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -119,7 +119,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma, !enforce_sysfs); + return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, + !enforce_sysfs, vma->vm_mm, vm_flags); /* Enforce sysfs THP requirements as necessary */ if (enforce_sysfs && diff --git a/mm/shmem.c b/mm/shmem.c index c5048c6c83dd..9e1015cbad29 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -468,15 +468,14 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force) +bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) { loff_t i_size; if (!S_ISREG(inode->i_mode)) return false; - if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) + if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags))) return false; if (shmem_huge == SHMEM_HUGE_DENY) return false; @@ -493,7 +492,7 @@ bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, return true; fallthrough; case SHMEM_HUGE_ADVISE: - if (vma && (vma->vm_flags & VM_HUGEPAGE)) + if (mm && (vm_flags & VM_HUGEPAGE)) return true; fallthrough; default: @@ -676,8 +675,8 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force) +bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) { return false; } @@ -1068,7 +1067,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); - if (shmem_is_huge(NULL, inode, 0, false)) + if 
(shmem_is_huge(inode, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1926,7 +1925,8 @@ repeat: return 0; } - if (!shmem_is_huge(vma, inode, index, false)) + if (!shmem_is_huge(inode, index, false, + vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) goto alloc_nohuge; huge_gfp = vma_thp_gfp_mask(vma); From ee7a5906ff08e435ed95ec9fe7c7eed2c11015d2 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:26 -0800 Subject: [PATCH 240/505] pagemap: add filemap_grab_folio() Patch series "Convert to filemap_get_folios_tag()", v5. This patch series replaces find_get_pages_range_tag() with filemap_get_folios_tag(). This also allows the removal of multiple calls to compound_head() throughout. It also makes a good chunk of the straightforward conversions to folios, and takes the opportunity to introduce a function that grabs a folio from the pagecache. This patch (of 23): Add function filemap_grab_folio() to grab a folio from the page cache. This function is meant to serve as a folio replacement for grab_cache_page, and is used to facilitate the removal of find_get_pages_range_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230104211448.4804-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 29e1f9e76eb6..468183be67be 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -546,6 +546,26 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping, return __filemap_get_folio(mapping, index, FGP_LOCK, 0); } +/** + * filemap_grab_folio - grab a folio from the page cache + * @mapping: The address space to search + * @index: The page index + * + * Looks up the page cache entry at @mapping & @index. If no folio is found, + * a new folio is created. The folio is locked, marked as accessed, and + * returned. + * + * Return: A found or created folio. NULL if no folio is found and failed to + * create a folio. + */ +static inline struct folio *filemap_grab_folio(struct address_space *mapping, + pgoff_t index) +{ + return __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(mapping)); +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search From 247f9e1feef4e57911510c8f82348efb4491ea0e Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:27 -0800 Subject: [PATCH 241/505] filemap: add filemap_get_folios_tag() This is the equivalent of find_get_pages_range_tag(), except for folios instead of pages. One noteable difference is filemap_get_folios_tag() does not take in a maximum pages argument. It instead tries to fill a folio batch and stops either once full (15 folios) or reaching the end of the search range. The new function supports large folios, the initial function did not since all callers don't use large folios. 
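A minimal caller sketch, modelled on the conversions later in this series (the function name and the per-folio work are placeholders), showing the intended pattern: the batch holds a reference on each folio, so it must be released before refilling.

  #include <linux/pagemap.h>
  #include <linux/pagevec.h>
  #include <linux/sched.h>

  /* Sketch: visit every dirty folio in [start, end] of a mapping. */
  static void walk_dirty_folios(struct address_space *mapping,
                                pgoff_t start, pgoff_t end)
  {
          struct folio_batch fbatch;
          unsigned int i, nr;

          folio_batch_init(&fbatch);
          while ((nr = filemap_get_folios_tag(mapping, &start, end,
                                              PAGECACHE_TAG_DIRTY, &fbatch))) {
                  for (i = 0; i < nr; i++) {
                          struct folio *folio = fbatch.folios[i];

                          folio_lock(folio);
                          /* ... inspect or write back the folio ... */
                          folio_unlock(folio);
                  }
                  /* Drop the references taken by filemap_get_folios_tag(). */
                  folio_batch_release(&fbatch);
                  cond_resched();
          }
  }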
Link: https://lkml.kernel.org/r/20230104211448.4804-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcow (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 54 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 468183be67be..bb3c1d51b1cb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -739,6 +739,8 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); +unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, + pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); diff --git a/mm/filemap.c b/mm/filemap.c index c4d4ace9cc70..291bb3e0957a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2281,6 +2281,60 @@ out: } EXPORT_SYMBOL(filemap_get_folios_contig); +/** + * filemap_get_folios_tag - Get a batch of folios matching @tag + * @mapping: The address_space to search + * @start: The starting page index + * @end: The final page index (inclusive) + * @tag: The tag index + * @fbatch: The batch to fill + * + * Same as filemap_get_folios(), but only returning folios tagged with @tag. + * + * Return: The number of folios found. + * Also update @start to index the next folio for traversal. + */ +unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, + pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + struct folio *folio; + + rcu_read_lock(); + while ((folio = find_get_entry(&xas, end, tag)) != NULL) { + /* + * Shadow entries should never be tagged, but this iteration + * is lockless so there is a window for page reclaim to evict + * a page we saw tagged. Skip over it. + */ + if (xa_is_value(folio)) + continue; + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); + + if (folio_test_hugetlb(folio)) + nr = 1; + *start = folio->index + nr; + goto out; + } + } + /* + * We come here when there is no page beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is a page at index -1 but that is + * already broke anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return folio_batch_count(fbatch); +} +EXPORT_SYMBOL(filemap_get_folios_tag); + /** * find_get_pages_range_tag - Find and return head pages matching @tag. * @mapping: the address_space to search From 6817ef514e1aacee228f6a9fbcdc3a2c49cb6c29 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:28 -0800 Subject: [PATCH 242/505] filemap: convert __filemap_fdatawait_range() to use filemap_get_folios_tag() Convert function to use folios. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 2 calls to compound_head(). 
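The saved compound_head() calls come from the legacy page wrappers: roughly (an approximation of the compat shims and page-flag macros, not an exact copy of the tree), each page-based call resolves the head page first, a lookup the direct folio calls in this conversion no longer pay for.

  #include <linux/pagemap.h>

  /* Approximate shape of the existing page -> folio compatibility wrapper. */
  void wait_on_page_writeback(struct page *page)
  {
          /* page_folio() resolves the head page (compound_head) before delegating. */
          folio_wait_writeback(page_folio(page));
  }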
Link: https://lkml.kernel.org/r/20230104211448.4804-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcow (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 291bb3e0957a..85adbcf2d9a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -503,25 +503,27 @@ static void __filemap_fdatawait_range(struct address_space *mapping, { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned nr_folios; + + folio_batch_init(&fbatch); - pagevec_init(&pvec); while (index <= end) { unsigned i; - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_WRITEBACK); - if (!nr_pages) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + PAGECACHE_TAG_WRITEBACK, &fbatch); + + if (!nr_folios) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - wait_on_page_writeback(page); - ClearPageError(page); + folio_wait_writeback(folio); + folio_clear_error(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } From 0fff435f060c8b29cb068d4068cb2df513046865 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:29 -0800 Subject: [PATCH 243/505] page-writeback: convert write_cache_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 8 calls to compound_head(), and the function now supports large folios. Link: https://lkml.kernel.org/r/20230104211448.4804-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcow (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e91f94b3438b..5e892f20bed7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2398,15 +2398,15 @@ int write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int error; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int range_whole = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* prev offset */ end = -1; @@ -2426,17 +2426,18 @@ int write_cache_pages(struct address_space *mapping, while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + + if (nr_folios == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - done_index = page->index; + done_index = folio->index; - lock_page(page); + folio_lock(folio); /* * Page truncated or invalidated. We can freely skip it @@ -2446,30 +2447,30 @@ int write_cache_pages(struct address_space *mapping, * even if there is now a new, dirty page at the same * pagecache address. 
*/ - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + folio_wait_writeback(folio); else goto continue_unlock; } - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) + BUG_ON(folio_test_writeback(folio)); + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - error = (*writepage)(page, wbc, data); + error = writepage(&folio->page, wbc, data); if (unlikely(error)) { /* * Handle errors according to the type of @@ -2484,11 +2485,12 @@ continue_unlock: * the first error. */ if (error == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); + folio_unlock(folio); error = 0; } else if (wbc->sync_mode != WB_SYNC_ALL) { ret = error; - done_index = page->index + 1; + done_index = folio->index + + folio_nr_pages(folio); done = 1; break; } @@ -2508,7 +2510,7 @@ continue_unlock: break; } } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } From acc8d8588cb7e3e64b0d2fa611dad06574cd67b1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:30 -0800 Subject: [PATCH 244/505] afs: convert afs_writepages_region() to use filemap_get_folios_tag() Convert to use folios throughout. This function is in preparation to remove find_get_pages_range_tag(). Also modify this function to write the whole batch one at a time, rather than calling for a new set every single write. Link: https://lkml.kernel.org/r/20230104211448.4804-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Tested-by: David Howells Signed-off-by: Andrew Morton --- fs/afs/write.c | 124 +++++++++++++++++++++++++------------------------ 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 19df10d63323..2d3b08b7406c 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -704,85 +704,87 @@ static int afs_writepages_region(struct address_space *mapping, bool max_one_loop) { struct folio *folio; - struct page *head_page; + struct folio_batch fbatch; ssize_t ret; + unsigned int i; int n, skips = 0; _enter("%llx,%llx,", start, end); + folio_batch_init(&fbatch); do { pgoff_t index = start / PAGE_SIZE; - n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, 1, &head_page); + n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, &fbatch); + if (!n) break; + for (i = 0; i < n; i++) { + folio = fbatch.folios[i]; + start = folio_pos(folio); /* May regress with THPs */ - folio = page_folio(head_page); - start = folio_pos(folio); /* May regress with THPs */ + _debug("wback %lx", folio_index(folio)); - _debug("wback %lx", folio_index(folio)); + /* At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) { + folio_batch_release(&fbatch); + return ret; + } + } else { + if (!folio_trylock(folio)) + continue; + } - /* At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated 
- * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping - */ - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + continue; + } + + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); +#ifdef CONFIG_AFS_FSCACHE + folio_wait_fscache(folio); +#endif + } else { + start += folio_size(folio); + } + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) { + *_next = start; + _leave(" = 0 [%llx]", *_next); + return 0; + } + skips++; + } + continue; + } + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + ret = afs_write_back_from_locked_folio(mapping, wbc, + folio, start, end); if (ret < 0) { - folio_put(folio); + _leave(" = %zd", ret); + folio_batch_release(&fbatch); return ret; } - } else { - if (!folio_trylock(folio)) { - folio_put(folio); - return 0; - } + + start += ret; } - if (folio_mapping(folio) != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - folio_put(folio); - continue; - } - - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); -#ifdef CONFIG_AFS_FSCACHE - folio_wait_fscache(folio); -#endif - } else { - start += folio_size(folio); - } - folio_put(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) - break; - skips++; - } - continue; - } - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - ret = afs_write_back_from_locked_folio(mapping, wbc, folio, start, end); - folio_put(folio); - if (ret < 0) { - _leave(" = %zd", ret); - return ret; - } - - start += ret; - - if (max_one_loop) - break; - + folio_batch_release(&fbatch); cond_resched(); } while (wbc->nr_to_write > 0); From 51c5cd3bafe5e1e8a678d661c43b09d7c6584274 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:31 -0800 Subject: [PATCH 245/505] btrfs: convert btree_write_cache_pages() to use filemap_get_folio_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). 
Link: https://lkml.kernel.org/r/20230104211448.4804-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: David Sterba Signed-off-by: Andrew Morton --- fs/btrfs/extent_io.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9bd32daa9b9a..d5ef288d3a43 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2865,14 +2865,14 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -2895,14 +2895,15 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag))) { + (nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch))) { unsigned i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context); + ret = submit_eb_page(&folio->page, wbc, &bio_ctrl, + &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -2917,7 +2918,7 @@ retry: */ nr_to_write_done = wbc->nr_to_write <= 0; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } if (!scanned && !done) { From 9f50fd2e92e37441da3a1daa8e27fd0c400b6cdd Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:32 -0800 Subject: [PATCH 246/505] btrfs: convert extent_write_cache_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). Now also supports large folios. 
Link: https://lkml.kernel.org/r/20230104211448.4804-8-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: David Sterba Signed-off-by: Andrew Morton --- fs/btrfs/extent_io.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d5ef288d3a43..0a2d6fb611c6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2993,8 +2993,8 @@ static int extent_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -3014,7 +3014,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (!igrab(inode)) return 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3052,14 +3052,14 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, - &index, end, tag))) { + (nr_folios = filemap_get_folios_tag(mapping, &index, + end, tag, &fbatch))) { unsigned i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - done_index = page->index + 1; + done_index = folio->index + folio_nr_pages(folio); /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -3067,29 +3067,29 @@ retry: * or even swizzled back from swapper_space to * tmpfs file mapping */ - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); - lock_page(page); + folio_lock(folio); } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); continue; } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) + if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); - wait_on_page_writeback(page); + folio_wait_writeback(folio); } - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); + if (folio_test_writeback(folio) || + !folio_clear_dirty_for_io(folio)) { + folio_unlock(folio); continue; } - ret = __extent_writepage(page, wbc, bio_ctrl); + ret = __extent_writepage(&folio->page, wbc, bio_ctrl); if (ret < 0) { done = 1; break; @@ -3102,7 +3102,7 @@ retry: */ nr_to_write_done = wbc->nr_to_write <= 0; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } if (!scanned && !done) { From 590a2b5f0a9b740e415e0d52bd8a0f87fc15b87b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:33 -0800 Subject: [PATCH 247/505] ceph: convert ceph_writepages_start() to use filemap_get_folios_tag() Convert function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Also some minor renaming for consistency. 
Link: https://lkml.kernel.org/r/20230104211448.4804-9-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Jeff Layton Signed-off-by: Andrew Morton --- fs/ceph/addr.c | 58 ++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c74871e37c9..905268bf9741 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -792,7 +792,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; - struct pagevec pvec; + struct folio_batch fbatch; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; @@ -821,7 +821,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - pagevec_init(&pvec); + folio_batch_init(&fbatch); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; @@ -869,7 +869,7 @@ retry: while (!done && index <= end) { int num_ops = 0, op_idx; - unsigned i, pvec_pages, max_pages, locked_pages = 0; + unsigned i, nr_folios, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; struct page *page; pgoff_t strip_unit_end = 0; @@ -879,13 +879,13 @@ retry: max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_DIRTY); - dout("pagevec_lookup_range_tag got %d\n", pvec_pages); - if (!pvec_pages && !locked_pages) + nr_folios = filemap_get_folios_tag(mapping, &index, + end, PAGECACHE_TAG_DIRTY, &fbatch); + dout("pagevec_lookup_range_tag got %d\n", nr_folios); + if (!nr_folios && !locked_pages) break; - for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { - page = pvec.pages[i]; + for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { + page = &fbatch.folios[i]->page; dout("? %p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ @@ -995,7 +995,7 @@ get_more_pages: len = 0; } - /* note position of first page in pvec */ + /* note position of first page in fbatch */ dout("%p will write page %p idx %lu\n", inode, page, page->index); @@ -1005,30 +1005,30 @@ get_more_pages: fsc->write_congested = true; pages[locked_pages++] = page; - pvec.pages[i] = NULL; + fbatch.folios[i] = NULL; len += thp_size(page); } /* did we get anything? */ if (!locked_pages) - goto release_pvec_pages; + goto release_folios; if (i) { unsigned j, n = 0; - /* shift unused page to beginning of pvec */ - for (j = 0; j < pvec_pages; j++) { - if (!pvec.pages[j]) + /* shift unused page to beginning of fbatch */ + for (j = 0; j < nr_folios; j++) { + if (!fbatch.folios[j]) continue; if (n < j) - pvec.pages[n] = pvec.pages[j]; + fbatch.folios[n] = fbatch.folios[j]; n++; } - pvec.nr = n; + fbatch.nr = n; - if (pvec_pages && i == pvec_pages && + if (nr_folios && i == nr_folios && locked_pages < max_pages) { - dout("reached end pvec, trying for more\n"); - pagevec_release(&pvec); + dout("reached end fbatch, trying for more\n"); + folio_batch_release(&fbatch); goto get_more_pages; } } @@ -1164,10 +1164,10 @@ new_request: if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) done = true; -release_pvec_pages: - dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, - pvec.nr ? 
pvec.pages[0] : NULL); - pagevec_release(&pvec); +release_folios: + dout("folio_batch release on %d folios (%p)\n", (int)fbatch.nr, + fbatch.nr ? fbatch.folios[0] : NULL); + folio_batch_release(&fbatch); } if (should_loop && !done) { @@ -1184,15 +1184,17 @@ release_pvec_pages: unsigned i, nr; index = 0; while ((index <= end) && - (nr = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK))) { + (nr = filemap_get_folios_tag(mapping, &index, + (pgoff_t)-1, + PAGECACHE_TAG_WRITEBACK, + &fbatch))) { for (i = 0; i < nr; i++) { - page = pvec.pages[i]; + page = &fbatch.folios[i]->page; if (page_snap_context(page) != snapc) continue; wait_on_page_writeback(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } From 4cda80f3a7a53a0bc66cd9f16f7872524cfdd87d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:34 -0800 Subject: [PATCH 248/505] cifs: convert wdata_alloc_and_fillpages() to use filemap_get_folios_tag() This is in preparation for the removal of find_get_pages_range_tag(). Now also supports the use of large folios. Since tofind might be larger than the max number of folios in a folio_batch (15), we loop through filling in wdata->pages pulling more batches until we either reach tofind pages or run out of folios. This function may not return all pages in the last found folio before tofind pages are reached. Link: https://lkml.kernel.org/r/20230104211448.4804-10-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Paulo Alcantara (SUSE) Cc: Tom Talpey Signed-off-by: Andrew Morton --- fs/cifs/file.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 22dfc1f8b4f1..8cdd2f67af24 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2527,14 +2527,40 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, unsigned int *found_pages) { struct cifs_writedata *wdata; - + struct folio_batch fbatch; + unsigned int i, idx, p, nr; wdata = cifs_writedata_alloc((unsigned int)tofind, cifs_writev_complete); if (!wdata) return NULL; - *found_pages = find_get_pages_range_tag(mapping, index, end, - PAGECACHE_TAG_DIRTY, tofind, wdata->pages); + folio_batch_init(&fbatch); + *found_pages = 0; + +again: + nr = filemap_get_folios_tag(mapping, index, end, + PAGECACHE_TAG_DIRTY, &fbatch); + if (!nr) + goto out; /* No dirty pages left in the range */ + + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; + + idx = 0; + p = folio_nr_pages(folio); +add_more: + wdata->pages[*found_pages] = folio_page(folio, idx); + folio_get(folio); + if (++*found_pages == tofind) { + folio_batch_release(&fbatch); + goto out; + } + if (++idx < p) + goto add_more; + } + folio_batch_release(&fbatch); + goto again; +out: return wdata; } From 50ead2537441f7df8d493e1085da76034ea92cf1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:35 -0800 Subject: [PATCH 249/505] ext4: convert mpage_prepare_extent_to_map() to use filemap_get_folios_tag() Convert the function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). Now supports large folios. This change removes 11 calls to compound_head(). 
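
The "Now supports large folios" note has one recurring consequence, visible throughout the hunk below: every place that used to step forward by a single page must now step by folio_nr_pages().  A tiny, hypothetical helper makes the change explicit (illustrative only):

#include <linux/mm.h>

/*
 * Hypothetical helper, for illustration only: with large folios in the
 * batch, index and budget bookkeeping moves in folio-sized steps.
 */
static pgoff_t example_advance(struct folio *folio, long *nr_to_write)
{
	/* old, page-based code:  next = page->index + 1;  (*nr_to_write)--; */
	pgoff_t next = folio->index + folio_nr_pages(folio);

	*nr_to_write -= folio_nr_pages(folio);
	return next;
}

This is exactly what the done_index, next_page and nr_to_write accounting in the btrfs and ext4 hunks now does.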
Link: https://lkml.kernel.org/r/20230104211448.4804-11-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- fs/ext4/inode.c | 65 ++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9d9f414f99fe..fb6cd994e59a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2595,8 +2595,8 @@ static bool ext4_page_nomap_can_writeout(struct page *page) static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct pagevec pvec; - unsigned int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; @@ -2610,18 +2610,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - - pagevec_init(&pvec); + folio_batch_init(&fbatch); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply @@ -2635,10 +2634,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; /* If we can't merge this page, we are done. */ - if (mpd->map.m_len > 0 && mpd->next_page != page->index) + if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; - lock_page(page); + folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which @@ -2646,16 +2645,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ - if (!PageDirty(page) || - (PageWriteback(page) && + if (!folio_test_dirty(folio) || + (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || - unlikely(page->mapping != mapping)) { - unlock_page(page); + unlikely(folio->mapping != mapping)) { + folio_unlock(folio); continue; } - wait_on_page_writeback(page); - BUG_ON(PageWriteback(page)); + folio_wait_writeback(folio); + BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in @@ -2666,49 +2665,49 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ - if (!page_has_buffers(page)) { - ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index); - ClearPageDirty(page); - unlock_page(page); + if (!folio_buffers(folio)) { + ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); + folio_clear_dirty(folio); + folio_unlock(folio); continue; } if (mpd->map.m_len == 0) - mpd->first_page = page->index; - mpd->next_page = page->index + 1; + mpd->first_page = folio->index; + mpd->next_page = folio->index + folio_nr_pages(folio); /* * Writeout for transaction commit where we cannot * modify metadata is simple. Just submit the page. 
*/ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(page)) { - err = mpage_submit_page(mpd, page); + if (ext4_page_nomap_can_writeout(&folio->page)) { + err = mpage_submit_page(mpd, &folio->page); if (err < 0) goto out; } else { - unlock_page(page); - mpd->first_page++; + folio_unlock(folio); + mpd->first_page += folio_nr_pages(folio); } } else { /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << + lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); - head = page_buffers(page); + head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, - lblk); + lblk); if (err <= 0) goto out; err = 0; } - left--; + left -= folio_nr_pages(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; return 0; out: - pagevec_release(&pvec); + folio_batch_release(&fbatch); return err; } From e6e46e1eb7cea179b9b31a62a0bbac6ba24bd050 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:36 -0800 Subject: [PATCH 250/505] f2fs: convert f2fs_fsync_node_pages() to use filemap_get_folios_tag() Convert function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-12-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/node.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dde4c0458704..3e0362794e27 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1731,12 +1731,12 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, unsigned int *seq_id) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nr_pages; + int nr_folios; int nwritten = 0; if (atomic) { @@ -1745,20 +1745,21 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return PTR_ERR_OR_ZERO(last_page); } retry: - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); - pagevec_release(&pvec); + folio_batch_release(&fbatch); ret = -EIO; goto out; } @@ -1824,7 +1825,7 @@ continue_unlock: break; } } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (ret || marked) From a40a4ad1186a37671070786b8143b16377899b5d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:37 -0800 Subject: [PATCH 251/505] f2fs: convert f2fs_flush_inline_data() to use filemap_get_folios_tag() Convert function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_tag(). 
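
One detail worth spelling out for these f2fs conversions: filemap_get_folios_tag() takes a reference on every folio it returns, and those references are only dropped by folio_batch_release(), so every way out of the loop has to release the batch first (as the cp_error path in the f2fs_fsync_node_pages() hunk above does).  A rough sketch with made-up names:

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

/* Hypothetical per-folio work, standing in for the real f2fs logic. */
static int example_process(struct folio *folio)
{
	return 0;
}

static int example_scan(struct address_space *mapping)
{
	struct folio_batch fbatch;
	pgoff_t index = 0;
	unsigned int i, nr;
	int err;

	folio_batch_init(&fbatch);

	/* (pgoff_t)-1 means "no upper bound": scan the whole mapping */
	while ((nr = filemap_get_folios_tag(mapping, &index, (pgoff_t)-1,
					    PAGECACHE_TAG_DIRTY, &fbatch))) {
		for (i = 0; i < nr; i++) {
			err = example_process(fbatch.folios[i]);
			if (err) {
				/* early exit: still must drop the references */
				folio_batch_release(&fbatch);
				return err;
			}
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
	return 0;
}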
Link: https://lkml.kernel.org/r/20230104211448.4804-13-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/node.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3e0362794e27..1c5dc7a3207e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1890,17 +1890,18 @@ static bool flush_dirty_inode(struct page *page) void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) { pgoff_t index = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while ((nr_pages = pagevec_lookup_tag(&pvec, - NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; if (!IS_DNODE(page)) continue; @@ -1927,7 +1928,7 @@ continue_unlock: } unlock_page(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } From 7525486affa518c9e7ffc9b9dbc966021041ebde Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:38 -0800 Subject: [PATCH 252/505] f2fs: convert f2fs_sync_node_pages() to use filemap_get_folios_tag() Convert function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-14-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/node.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1c5dc7a3207e..51e9f286f53a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1938,23 +1938,24 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, bool do_balance, enum iostat_type io_type) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; int step = 0; int nwritten = 0; int ret = 0; - int nr_pages, done = 0; + int nr_folios, done = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); next_step: index = 0; - while (!done && (nr_pages = pagevec_lookup_tag(&pvec, - NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + while (!done && (nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), + &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; bool submitted = false; /* give a priority to WB_SYNC threads */ @@ -2029,7 +2030,7 @@ write_node: if (--wbc->nr_to_write == 0) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (wbc->nr_to_write == 0) { From 1cd98ee747cff120ee9b93988ddb7315d8d8f8e7 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:39 -0800 Subject: [PATCH 253/505] f2fs: convert f2fs_write_cache_pages() to use filemap_get_folios_tag() Convert the function to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Also modified f2fs_all_cluster_page_ready to take in a folio_batch instead of pagevec. This does NOT support large folios. The function currently only utilizes folios of size 1 so this shouldn't cause any issues right now. 
This version of the patch limits the number of pages fetched to F2FS_ONSTACK_PAGES. If that ever happens, update the start index here since filemap_get_folios_tag() updates the index to be after the last found folio, not necessarily the last used page. Link: https://lkml.kernel.org/r/20230104211448.4804-15-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/data.c | 84 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 97e816590cd9..b02c5b384204 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2957,6 +2957,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0, retry = 0; struct page *pages[F2FS_ONSTACK_PAGES]; + struct folio_batch fbatch; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; @@ -2977,6 +2978,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .private = NULL, }; #endif + int nr_folios, p, idx; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ @@ -2987,6 +2989,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int submitted = 0; int i; + folio_batch_init(&fbatch); + if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); @@ -3012,13 +3016,38 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { - nr_pages = find_get_pages_range_tag(mapping, &index, end, - tag, F2FS_ONSTACK_PAGES, pages); - if (nr_pages == 0) + nr_pages = 0; +again: + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) { + if (nr_pages) + goto write; break; + } + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + idx = 0; + p = folio_nr_pages(folio); +add_more: + pages[nr_pages] = folio_page(folio, idx); + folio_get(folio); + if (++nr_pages == F2FS_ONSTACK_PAGES) { + index = folio->index + idx + 1; + folio_batch_release(&fbatch); + goto write; + } + if (++idx < p) + goto add_more; + } + folio_batch_release(&fbatch); + goto again; +write: for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; + struct folio *folio = page_folio(page); bool need_readd; readd: need_readd = false; @@ -3035,7 +3064,7 @@ readd: } if (!f2fs_cluster_can_merge_page(&cc, - page->index)) { + folio->index)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); if (!ret) @@ -3044,27 +3073,28 @@ readd: } if (unlikely(f2fs_cp_error(sbi))) - goto lock_page; + goto lock_folio; if (!f2fs_cluster_is_empty(&cc)) - goto lock_page; + goto lock_folio; if (f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, true)) - goto lock_page; + goto lock_folio; ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, - page->index, &fsdata); + folio->index, &fsdata); if (ret2 < 0) { ret = ret2; done = 1; break; } else if (ret2 && (!f2fs_compress_write_end(inode, - fsdata, page->index, 1) || + fsdata, folio->index, 1) || !f2fs_all_cluster_page_ready(&cc, - pages, i, nr_pages, false))) { + pages, i, nr_pages, + false))) { retry = 1; break; } @@ -3077,46 +3107,47 @@ readd: break; } #ifdef CONFIG_F2FS_FS_COMPRESSION -lock_page: +lock_folio: #endif - done_index = page->index; + done_index = folio->index; retry_write: - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { 
continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, + f2fs_wait_on_page_writeback( + &folio->page, DATA, true, true); else goto continue_unlock; } - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { - get_page(page); - f2fs_compress_ctx_add_page(&cc, page); + folio_get(folio); + f2fs_compress_ctx_add_page(&cc, &folio->page); continue; } #endif - ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, - 0, true); + ret = f2fs_write_single_data_page(&folio->page, + &submitted, &bio, &last_block, + wbc, io_type, 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); + folio_unlock(folio); #ifdef CONFIG_F2FS_FS_COMPRESSION result: #endif @@ -3140,7 +3171,8 @@ result: } goto next; } - done_index = page->index + 1; + done_index = folio->index + + folio_nr_pages(folio); done = 1; break; } From 4f4a4f0febe6009a4cdc8acac52cc5dc980f185c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:40 -0800 Subject: [PATCH 254/505] f2fs: convert last_fsync_dnode() to use filemap_get_folios_tag() Convert to use a folio_batch instead of pagevec. This is in preparation for the removal of find_get_pages_range_tag(). Link: https://lkml.kernel.org/r/20230104211448.4804-16-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/node.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 51e9f286f53a..cf997356d9f9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1518,23 +1518,24 @@ iput_out: static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; struct page *last_page = NULL; - int nr_pages; + int nr_folios; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); - pagevec_release(&pvec); + folio_batch_release(&fbatch); return ERR_PTR(-EIO); } @@ -1565,7 +1566,7 @@ continue_unlock: last_page = page; unlock_page(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } return last_page; From 580e7a4926089ea735fa09d42030d90e21537f7f Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:41 -0800 Subject: [PATCH 255/505] f2fs: convert f2fs_sync_meta_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 5 calls to compound_head(). Initially the function was checking if the previous page index is truly the previous page i.e. 1 index behind the current page. 
To convert to folios and maintain this check we need to make the check folio->index != prev + folio_nr_pages(previous folio) since we don't know how many pages are in a folio. At index i == 0 the check is guaranteed to succeed, so to workaround indexing bounds we can simply ignore the check for that specific index. This makes the initial assignment of prev trivial, so I removed that as well. Also modify a comment in commit_checkpoint for consistency. Link: https://lkml.kernel.org/r/20230104211448.4804-17-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Chao Yu Signed-off-by: Andrew Morton --- fs/f2fs/checkpoint.c | 49 +++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 56f7d0d6a8b2..5a5515d83a1b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -395,59 +395,62 @@ long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, prev = ULONG_MAX; - struct pagevec pvec; + struct folio_batch fbatch; long nwritten = 0; - int nr_pages; + int nr_folios; struct writeback_control wbc = { .for_reclaim = 0, }; struct blk_plug plug; - pagevec_init(&pvec); + folio_batch_init(&fbatch); blk_start_plug(&plug); - while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(mapping, &index, + (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - if (prev == ULONG_MAX) - prev = page->index - 1; - if (nr_to_write != LONG_MAX && page->index != prev + 1) { - pagevec_release(&pvec); + if (nr_to_write != LONG_MAX && i != 0 && + folio->index != prev + + folio_nr_pages(fbatch.folios[i-1])) { + folio_batch_release(&fbatch); goto stop; } - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - f2fs_wait_on_page_writeback(page, META, true, true); + f2fs_wait_on_page_writeback(&folio->page, META, + true, true); - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - if (__f2fs_write_meta_page(page, &wbc, io_type)) { - unlock_page(page); + if (__f2fs_write_meta_page(&folio->page, &wbc, + io_type)) { + folio_unlock(folio); break; } - nwritten++; - prev = page->index; + nwritten += folio_nr_pages(folio); + prev = folio->index; if (unlikely(nwritten >= nr_to_write)) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } stop: @@ -1403,7 +1406,7 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, }; /* - * pagevec_lookup_tag and lock_page again will take + * filemap_get_folios_tag and lock_page again will take * some extra time. Therefore, f2fs_update_meta_pages and * f2fs_sync_meta_pages are combined in this function. */ From 87ed37e66dfd08f6d692969cbd39282a359a2f7d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:42 -0800 Subject: [PATCH 256/505] gfs2: convert gfs2_write_cache_jdata() to use filemap_get_folios_tag() Convert function to use folios throughout. 
This is in preparation for the removal of find_get_pgaes_range_tag(). This change removes 8 calls to compound_head(). Also had to modify and rename gfs2_write_jdata_pagevec() to take in and utilize folio_batch rather than pagevec and use folios rather than pages. gfs2_write_jdata_batch() now supports large folios. Link: https://lkml.kernel.org/r/20230104211448.4804-18-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Andreas Gruenbacher Signed-off-by: Andrew Morton --- fs/gfs2/aops.c | 64 +++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index e782b4f1d104..0a47068f9acc 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -195,67 +195,71 @@ static int gfs2_writepages(struct address_space *mapping, } /** - * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * gfs2_write_jdata_batch - Write back a folio batch's worth of folios * @mapping: The mapping * @wbc: The writeback control - * @pvec: The vector of pages - * @nr_pages: The number of pages to write + * @fbatch: The batch of folios * @done_index: Page index * * Returns: non-zero if loop should terminate, zero otherwise */ -static int gfs2_write_jdata_pagevec(struct address_space *mapping, +static int gfs2_write_jdata_batch(struct address_space *mapping, struct writeback_control *wbc, - struct pagevec *pvec, - int nr_pages, + struct folio_batch *fbatch, pgoff_t *done_index) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); + unsigned nrblocks; int i; int ret; + int nr_pages = 0; + int nr_folios = folio_batch_count(fbatch); + + for (i = 0; i < nr_folios; i++) + nr_pages += folio_nr_pages(fbatch->folios[i]); + nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) return ret; - for(i = 0; i < nr_pages; i++) { - struct page *page = pvec->pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch->folios[i]; - *done_index = page->index; + *done_index = folio->index; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + folio_wait_writeback(folio); else goto continue_unlock; } - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) + BUG_ON(folio_test_writeback(folio)); + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(inode)); - ret = __gfs2_jdata_writepage(page, wbc); + ret = __gfs2_jdata_writepage(&folio->page, wbc); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); + folio_unlock(folio); ret = 0; } else { @@ -268,7 +272,8 @@ continue_unlock: * not be suitable for data integrity * writeout). 
*/ - *done_index = page->index + 1; + *done_index = folio->index + + folio_nr_pages(folio); ret = 1; break; } @@ -305,8 +310,8 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, { int ret = 0; int done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; pgoff_t writeback_index; pgoff_t index; pgoff_t end; @@ -315,7 +320,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int range_whole = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -341,17 +346,18 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) break; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, &done_index); + ret = gfs2_write_jdata_batch(mapping, wbc, &fbatch, + &done_index); if (ret) done = 1; if (ret > 0) ret = 0; - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } From 5ee4b25cb7302ae2c62fab7adc1529d9f497bc6d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:43 -0800 Subject: [PATCH 257/505] nilfs2: convert nilfs_lookup_dirty_data_buffers() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 4 calls to compound_head(). Link: https://lkml.kernel.org/r/20230104211448.4804-19-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f7a14ed12a66..c17db9a0c665 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -680,7 +680,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = 0, last = ULONG_MAX; size_t ndirties = 0; int i; @@ -694,23 +694,26 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: if (unlikely(index > last) || - !pagevec_lookup_range_tag(&pvec, mapping, &index, last, - PAGECACHE_TAG_DIRTY)) + !filemap_get_folios_tag(mapping, &index, last, + PAGECACHE_TAG_DIRTY, &fbatch)) return ndirties; - for (i = 0; i < pagevec_count(&pvec); i++) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { struct buffer_head *bh, *head; - struct page *page = pvec.pages[i]; + struct folio *folio = fbatch.folios[i]; - lock_page(page); - if (!page_has_buffers(page)) - create_empty_buffers(page, i_blocksize(inode), 0); - unlock_page(page); + folio_lock(folio); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, i_blocksize(inode), 0); + head = folio_buffers(folio); + } + folio_unlock(folio); - bh = head = page_buffers(page); + bh = head; do { if (!buffer_dirty(bh) || buffer_async_write(bh)) continue; @@ -718,13 +721,13 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, list_add_tail(&bh->b_assoc_buffers, listp); ndirties++; if 
(unlikely(ndirties >= nlimit)) { - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); return ndirties; } } while (bh = bh->b_this_page, bh != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; } From a2458658316959a1756d06c2f64ba8a6f316f9de Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:44 -0800 Subject: [PATCH 258/505] nilfs2: convert nilfs_lookup_dirty_node_buffers() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 1 call to compound_head(). Link: https://lkml.kernel.org/r/20230104211448.4804-20-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c17db9a0c665..19446a8243d7 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -737,20 +737,19 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, { struct nilfs_inode_info *ii = NILFS_I(inode); struct inode *btnc_inode = ii->i_assoc_inode; - struct pagevec pvec; + struct folio_batch fbatch; struct buffer_head *bh, *head; unsigned int i; pgoff_t index = 0; if (!btnc_inode) return; + folio_batch_init(&fbatch); - pagevec_init(&pvec); - - while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - bh = head = page_buffers(pvec.pages[i]); + while (filemap_get_folios_tag(btnc_inode->i_mapping, &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh) && !buffer_async_write(bh)) { @@ -761,7 +760,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, bh = bh->b_this_page; } while (bh != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } From 41f3f3b5373e9b7372a0ecf4814c01f62600c124 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:45 -0800 Subject: [PATCH 259/505] nilfs2: convert nilfs_btree_lookup_dirty_buffers() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 1 call to compound_head(). 
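
The nilfs2 conversions in this run all walk the buffer ring attached to each folio.  The shape of that walk, including the "no buffers attached yet" case handled in nilfs_lookup_dirty_data_buffers(), is roughly the following (illustrative only; the caller is assumed to hold the folio lock):

#include <linux/buffer_head.h>

static void example_walk_buffers(struct folio *folio, unsigned int blocksize)
{
	struct buffer_head *bh, *head;

	head = folio_buffers(folio);
	if (!head) {
		/* create_empty_buffers() still takes the head page here */
		create_empty_buffers(&folio->page, blocksize, 0);
		head = folio_buffers(folio);
	}

	bh = head;
	do {
		/* per-buffer work, e.g. collect bh if buffer_dirty(bh) */
	} while ((bh = bh->b_this_page) != head);
}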
Link: https://lkml.kernel.org/r/20230104211448.4804-21-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b5f997e5e670..2681a449edc1 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2150,7 +2150,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; struct address_space *btcache = btnc_inode->i_mapping; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; - struct pagevec pvec; + struct folio_batch fbatch; struct buffer_head *bh, *head; pgoff_t index = 0; int level, i; @@ -2160,19 +2160,19 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, level++) INIT_LIST_HEAD(&lists[level]); - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while (pagevec_lookup_tag(&pvec, btcache, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - bh = head = page_buffers(pvec.pages[i]); + while (filemap_get_folios_tag(btcache, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh)) nilfs_btree_add_dirty_buffer(btree, lists, bh); } while ((bh = bh->b_this_page) != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } From d4a16d31334e0a1dd948dfb2977f241805fd0e14 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:46 -0800 Subject: [PATCH 260/505] nilfs2: convert nilfs_copy_dirty_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 8 calls to compound_head(). 
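
The "removes N calls to compound_head()" counts in these changelogs all come from the same place: the page-based flag helpers have to look up the head page on every call, while folio-based code already holds it.  A hypothetical pair of helpers makes the difference visible (illustrative only):

#include <linux/page-flags.h>

/* Roughly what a page-based test costs: a head-page lookup per call. */
static inline bool example_page_dirty(struct page *page)
{
	return folio_test_dirty(page_folio(page));	/* hidden compound_head() */
}

/* Folio-based callers skip the lookup entirely. */
static inline bool example_folio_dirty(struct folio *folio)
{
	return folio_test_dirty(folio);
}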
Link: https://lkml.kernel.org/r/20230104211448.4804-22-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 39b7eea2642a..d921542a9593 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -240,42 +240,43 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) int nilfs_copy_dirty_pages(struct address_space *dmap, struct address_space *smap) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; int err = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) + if (!filemap_get_folios_tag(smap, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) return 0; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i], *dpage; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i], *dfolio; - lock_page(page); - if (unlikely(!PageDirty(page))) - NILFS_PAGE_BUG(page, "inconsistent dirty state"); + folio_lock(folio); + if (unlikely(!folio_test_dirty(folio))) + NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); - dpage = grab_cache_page(dmap, page->index); - if (unlikely(!dpage)) { + dfolio = filemap_grab_folio(dmap, folio->index); + if (unlikely(!dfolio)) { /* No empty page is added to the page cache */ err = -ENOMEM; - unlock_page(page); + folio_unlock(folio); break; } - if (unlikely(!page_has_buffers(page))) - NILFS_PAGE_BUG(page, + if (unlikely(!folio_buffers(folio))) + NILFS_PAGE_BUG(&folio->page, "found empty page in dat page cache"); - nilfs_copy_page(dpage, page, 1); - __set_page_dirty_nobuffers(dpage); + nilfs_copy_page(&dfolio->page, &folio->page, 1); + filemap_dirty_folio(folio_mapping(dfolio), dfolio); - unlock_page(dpage); - put_page(dpage); - unlock_page(page); + folio_unlock(dfolio); + folio_put(dfolio); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (likely(!err)) From 243c5ea4f783922f46779f9071b1948e1c1d0291 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:47 -0800 Subject: [PATCH 261/505] nilfs2: convert nilfs_clear_dirty_pages() to use filemap_get_folios_tag() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_range_tag(). This change removes 2 calls to compound_head(). 
Link: https://lkml.kernel.org/r/20230104211448.4804-23-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index d921542a9593..41ccd43cd979 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -358,22 +358,22 @@ repeat: */ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while (pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (filemap_get_folios_tag(mapping, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; - lock_page(page); - nilfs_clear_dirty_page(page, silent); - unlock_page(page); + folio_lock(folio); + nilfs_clear_dirty_page(&folio->page, silent); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } From c5792d9384113de4085dfbce6940e2a853debb67 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 4 Jan 2023 13:14:48 -0800 Subject: [PATCH 262/505] filemap: remove find_get_pages_range_tag() All callers to find_get_pages_range_tag(), find_get_pages_tag(), pagevec_lookup_range_tag(), and pagevec_lookup_tag() have been removed. Link: https://lkml.kernel.org/r/20230104211448.4804-24-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 10 ------- include/linux/pagevec.h | 8 ------ mm/filemap.c | 60 ----------------------------------------- mm/swap.c | 10 ------- 4 files changed, 88 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bb3c1d51b1cb..9f1081683771 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -741,16 +741,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); -unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, xa_mark_t tag, unsigned int nr_pages, - struct page **pages); -static inline unsigned find_get_pages_tag(struct address_space *mapping, - pgoff_t *index, xa_mark_t tag, unsigned int nr_pages, - struct page **pages) -{ - return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, - nr_pages, pages); -} struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 2a6f61a0c10a..f582f7213ea5 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -26,14 +26,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -unsigned pagevec_lookup_range_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag); -static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, xa_mark_t tag) -{ - return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); -} static inline void pagevec_init(struct pagevec *pvec) { diff --git a/mm/filemap.c 
b/mm/filemap.c index 85adbcf2d9a7..31bf18ec6d01 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2337,66 +2337,6 @@ out: } EXPORT_SYMBOL(filemap_get_folios_tag); -/** - * find_get_pages_range_tag - Find and return head pages matching @tag. - * @mapping: the address_space to search - * @index: the starting page index - * @end: The final page index (inclusive) - * @tag: the tag index - * @nr_pages: the maximum number of pages - * @pages: where the resulting pages are placed - * - * Like find_get_pages_range(), except we only return head pages which are - * tagged with @tag. @index is updated to the index immediately after the - * last page we return, ready for the next iteration. - * - * Return: the number of pages which were found. - */ -unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, xa_mark_t tag, unsigned int nr_pages, - struct page **pages) -{ - XA_STATE(xas, &mapping->i_pages, *index); - struct folio *folio; - unsigned ret = 0; - - if (unlikely(!nr_pages)) - return 0; - - rcu_read_lock(); - while ((folio = find_get_entry(&xas, end, tag))) { - /* - * Shadow entries should never be tagged, but this iteration - * is lockless so there is a window for page reclaim to evict - * a page we saw tagged. Skip over it. - */ - if (xa_is_value(folio)) - continue; - - pages[ret] = &folio->page; - if (++ret == nr_pages) { - *index = folio->index + folio_nr_pages(folio); - goto out; - } - } - - /* - * We come here when we got to @end. We take care to not overflow the - * index @index as it confuses some of the callers. This breaks the - * iteration when there is a page at index -1 but that is already - * broken anyway. - */ - if (end == (pgoff_t)-1) - *index = (pgoff_t)-1; - else - *index = end + 1; -out: - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(find_get_pages_range_tag); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: diff --git a/mm/swap.c b/mm/swap.c index 42d67f9baa8c..5e4f92700c16 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1115,16 +1115,6 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } -unsigned pagevec_lookup_range_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag) -{ - pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, - PAGEVEC_SIZE, pvec->pages); - return pagevec_count(pvec); -} -EXPORT_SYMBOL(pagevec_lookup_range_tag); - /* * Perform any setup for the swap system */ From 6bc56a4d855303705802c5ede4625973637484c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:09 +0000 Subject: [PATCH 263/505] mm: add vma_alloc_zeroed_movable_folio() Replace alloc_zeroed_user_highpage_movable(). The main difference is returning a folio containing a single page instead of returning the page, but take the opportunity to rename the function to match other allocation functions a little better and rewrite the documentation to place more emphasis on the zeroing rather than the highmem aspect. 
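
A caller-side before/after, with a made-up helper, shows what the rename and the folio return type buy; the real conversions are in the memory.c hunks further down:

#include <linux/highmem.h>
#include <linux/mm.h>

/* Hypothetical fault-path helper, for illustration only. */
static vm_fault_t example_alloc_zeroed(struct vm_area_struct *vma,
				       unsigned long addr, struct page **pagep)
{
	struct folio *folio;

	/* old: page = alloc_zeroed_user_highpage_movable(vma, addr); */
	folio = vma_alloc_zeroed_movable_folio(vma, addr);
	if (!folio)
		return VM_FAULT_OOM;

	/* always a single, already-zeroed page */
	*pagep = &folio->page;
	return 0;
}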
Link: https://lkml.kernel.org/r/20230116191813.2145215-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 5 ++--- arch/arm64/include/asm/page.h | 4 ++-- arch/arm64/mm/fault.c | 4 ++-- arch/ia64/include/asm/page.h | 14 ++++++-------- arch/m68k/include/asm/page_no.h | 5 ++--- arch/s390/include/asm/page.h | 5 ++--- arch/x86/include/asm/page.h | 5 ++--- include/linux/highmem.h | 33 ++++++++++++++++----------------- mm/memory.c | 16 ++++++++++------ 9 files changed, 44 insertions(+), 47 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 8f3f5eecba28..bc5256fba8f0 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -17,9 +17,8 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) extern void copy_page(void * _to, void * _from); #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 993a27ea6f54..2312e6ee595f 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -29,9 +29,9 @@ void copy_user_highpage(struct page *to, struct page *from, void copy_highpage(struct page *to, struct page *from); #define __HAVE_ARCH_COPY_HIGHPAGE -struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio void tag_clear_highpage(struct page *to); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 596f46dabe4e..f4cb0f85ccf4 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -925,7 +925,7 @@ NOKPROBE_SYMBOL(do_debug_exception); /* * Used during anonymous page fault handling. 
*/ -struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO; @@ -938,7 +938,7 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, if (vma->vm_flags & VM_MTE) flags |= __GFP_ZEROTAGS; - return alloc_page_vma(flags, vma, vaddr); + return vma_alloc_folio(flags, 0, vma, vaddr, false); } void tag_clear_highpage(struct page *page) diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index 1b990466d540..ba0b365cf2b2 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -82,17 +82,15 @@ do { \ } while (0) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ ({ \ - struct page *page = alloc_page_vma( \ - GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr); \ - if (page) \ - flush_dcache_page(page); \ - page; \ + struct folio *folio = vma_alloc_folio( \ + GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false); \ + if (folio) \ + flush_dcache_folio(folio); \ + folio; \ }) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE - #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #include diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index c9d0d84158a4..abd2c3aeb015 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -13,9 +13,8 @@ extern unsigned long memory_end; #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) #define __pa(vaddr) ((unsigned long)(vaddr)) #define __va(paddr) ((void *)((unsigned long)(paddr))) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 61dea67bb9c7..8a2a3b5d1e29 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -73,9 +73,8 @@ static inline void copy_page(void *to, void *from) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) /* * These are used to make use of C type-checking.. 
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 9cc82f305f4b..d18e5c332cb9 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -34,9 +34,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, copy_page(to, from); } -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d7097b8158f2..e22509420ac6 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -207,31 +207,30 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif -#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#ifndef vma_alloc_zeroed_movable_folio /** - * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move - * @vma: The VMA the page is to be allocated for - * @vaddr: The virtual address the page will be inserted into + * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. + * @vma: The VMA the page is to be allocated for. + * @vaddr: The virtual address the page will be inserted into. * - * Returns: The allocated and zeroed HIGHMEM page + * This function will allocate a page suitable for inserting into this + * VMA at this virtual address. It may be allocated from highmem or + * the movable zone. An architecture may provide its own implementation. * - * This function will allocate a page for a VMA that the caller knows will - * be able to migrate in the future using move_pages() or reclaimed - * - * An architecture may override this function by defining - * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE and providing their own - * implementation. + * Return: A folio containing one allocated and zeroed page or NULL if + * we are out of memory. */ -static inline struct page * -alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +static inline +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); + struct folio *folio; - if (page) - clear_user_highpage(page, vaddr); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false); + if (folio) + clear_user_highpage(&folio->page, vaddr); - return page; + return folio; } #endif diff --git a/mm/memory.c b/mm/memory.c index 87b33b4967c2..b6358ffbccaa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3056,10 +3056,12 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) goto oom; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, - vmf->address); - if (!new_page) + struct folio *new_folio; + + new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + if (!new_folio) goto oom; + new_page = &new_folio->page; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); @@ -3995,6 +3997,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; + struct folio *folio; vm_fault_t ret = 0; pte_t entry; @@ -4044,11 +4047,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Allocate our own private page. 
*/ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, vmf->address); - if (!page) + folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + if (!folio) goto oom; - if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) + page = &folio->page; + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; cgroup_throttle_swaprate(page, GFP_KERNEL); From cb3184deef10fdc7658fb366189864c89ad118c9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:10 +0000 Subject: [PATCH 264/505] mm: convert do_anonymous_page() to use a folio Removes six calls to compound_head(); some inline and some external. Link: https://lkml.kernel.org/r/20230116191813.2145215-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b6358ffbccaa..950e5a4f2cf1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3996,7 +3996,6 @@ out_release: static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page; struct folio *folio; vm_fault_t ret = 0; pte_t entry; @@ -4051,19 +4050,18 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!folio) goto oom; - page = &folio->page; if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; - cgroup_throttle_swaprate(page, GFP_KERNEL); + cgroup_throttle_swaprate(&folio->page, GFP_KERNEL); /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); - entry = mk_pte(page, vma->vm_page_prot); + entry = mk_pte(&folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -4082,13 +4080,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - put_page(page); + folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(page, vma); + folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_add_lru_vma(folio, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -4098,10 +4096,10 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: - put_page(page); + folio_put(folio); goto unlock; oom_free_page: - put_page(page); + folio_put(folio); oom: return VM_FAULT_OOM; } From 28d41a4863316321bb5aa616bd82d65c84fc0f8b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:11 +0000 Subject: [PATCH 265/505] mm: convert wp_page_copy() to use folios Use new_folio instead of new_page throughout, because we allocated it and know it's an order-0 folio. Most old_page uses become old_folio, but use vmf->page where we need the precise page. 
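
The "use vmf->page where we need the precise page" rule comes down to a couple of lines.  The sketch below is illustrative only; the PTE locking, TLB flushing and the rest of the real function are omitted:

#include <linux/mm.h>
#include <linux/rmap.h>

/*
 * rmap wants the exact page that was mapped, while the reference count
 * lives on the folio (which may be larger than one page).
 */
static void example_drop_old_page(struct vm_fault *vmf)
{
	struct folio *old_folio = page_folio(vmf->page);

	page_remove_rmap(vmf->page, vmf->vma, false);	/* precise page */
	folio_put(old_folio);				/* whole folio */
}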
Link: https://lkml.kernel.org/r/20230116191813.2145215-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 65 ++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 950e5a4f2cf1..720fbb4771de 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3043,8 +3043,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; - struct page *old_page = vmf->page; - struct page *new_page = NULL; + struct folio *old_folio = NULL; + struct folio *new_folio = NULL; pte_t entry; int page_copied = 0; struct mmu_notifier_range range; @@ -3052,23 +3052,22 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_start(); + if (vmf->page) + old_folio = page_folio(vmf->page); if (unlikely(anon_vma_prepare(vma))) goto oom; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { - struct folio *new_folio; - new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); if (!new_folio) goto oom; - new_page = &new_folio->page; } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); - if (!new_page) + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + vmf->address, false); + if (!new_folio) goto oom; - ret = __wp_page_copy_user(new_page, old_page, vmf); + ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (ret) { /* * COW failed, if the fault was solved by other, @@ -3077,21 +3076,21 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * from the second attempt. * The -EHWPOISON case will not be retried. */ - put_page(new_page); - if (old_page) - put_page(old_page); + folio_put(new_folio); + if (old_folio) + folio_put(old_folio); delayacct_wpcopy_end(); return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } - kmsan_copy_page_meta(new_page, old_page); + kmsan_copy_page_meta(&new_folio->page, vmf->page); } - if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) + if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL)) goto oom_free_new; - cgroup_throttle_swaprate(new_page, GFP_KERNEL); + cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL); - __SetPageUptodate(new_page); + __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, @@ -3103,16 +3102,16 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { - if (old_page) { - if (!PageAnon(old_page)) { - dec_mm_counter(mm, mm_counter_file(old_page)); + if (old_folio) { + if (!folio_test_anon(old_folio)) { + dec_mm_counter(mm, mm_counter_file(&old_folio->page)); inc_mm_counter(mm, MM_ANONPAGES); } } else { inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(new_page, vma->vm_page_prot); + entry = mk_pte(&new_folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3131,8 +3130,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. 
*/ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); - page_add_new_anon_rmap(new_page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_add_new_anon_rmap(new_folio, vma, vmf->address); + folio_add_lru_vma(new_folio, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -3141,7 +3140,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) BUG_ON(unshare && pte_write(entry)); set_pte_at_notify(mm, vmf->address, vmf->pte, entry); update_mmu_cache(vma, vmf->address, vmf->pte); - if (old_page) { + if (old_folio) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another @@ -3164,18 +3163,18 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, vma, false); + page_remove_rmap(vmf->page, vma, false); } /* Free the old page.. */ - new_page = old_page; + new_folio = old_folio; page_copied = 1; } else { update_mmu_tlb(vma, vmf->address, vmf->pte); } - if (new_page) - put_page(new_page); + if (new_folio) + folio_put(new_folio); pte_unmap_unlock(vmf->pte, vmf->ptl); /* @@ -3183,19 +3182,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * the above ptep_clear_flush_notify() did already call it. */ mmu_notifier_invalidate_range_only_end(&range); - if (old_page) { + if (old_folio) { if (page_copied) - free_swap_cache(old_page); - put_page(old_page); + free_swap_cache(&old_folio->page); + folio_put(old_folio); } delayacct_wpcopy_end(); return 0; oom_free_new: - put_page(new_page); + folio_put(new_folio); oom: - if (old_page) - put_page(old_page); + if (old_folio) + folio_put(old_folio); delayacct_wpcopy_end(); return VM_FAULT_OOM; From edf5047058395c89a912783ea29ec8f9e53be414 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:12 +0000 Subject: [PATCH 266/505] mm: use a folio in copy_pte_range() Allocate an order-0 folio instead of a page and pass it all the way down the call chain. Removes dozens of calls to compound_head(). Link: https://lkml.kernel.org/r/20230116191813.2145215-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 720fbb4771de..20a5c8087a5a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -863,13 +863,13 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct page **prealloc, struct page *page) + struct folio **prealloc, struct page *page) { - struct page *new_page; + struct folio *new_folio; pte_t pte; - new_page = *prealloc; - if (!new_page) + new_folio = *prealloc; + if (!new_folio) return -EAGAIN; /* @@ -877,14 +877,14 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * over and copy the page & arm it. 
*/ *prealloc = NULL; - copy_user_highpage(new_page, page, addr, src_vma); - __SetPageUptodate(new_page); - page_add_new_anon_rmap(new_page, dst_vma, addr); - lru_cache_add_inactive_or_unevictable(new_page, dst_vma); - rss[mm_counter(new_page)]++; + copy_user_highpage(&new_folio->page, page, addr, src_vma); + __folio_mark_uptodate(new_folio); + folio_add_new_anon_rmap(new_folio, dst_vma, addr); + folio_add_lru_vma(new_folio, dst_vma); + rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(new_page, dst_vma->vm_page_prot); + pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, *src_pte)) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -900,7 +900,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma static inline int copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct page **prealloc) + struct folio **prealloc) { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; @@ -922,11 +922,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); } - rss[mm_counter(page)]++; + rss[MM_ANONPAGES]++; } else if (page) { get_page(page); page_dup_file_rmap(page, false); - rss[mm_counter(page)]++; + rss[mm_counter_file(page)]++; } /* @@ -954,23 +954,22 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, return 0; } -static inline struct page * -page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, - unsigned long addr) +static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm, + struct vm_area_struct *vma, unsigned long addr) { - struct page *new_page; + struct folio *new_folio; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); - if (!new_page) + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + if (!new_folio) return NULL; - if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) { - put_page(new_page); + if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) { + folio_put(new_folio); return NULL; } - cgroup_throttle_swaprate(new_page, GFP_KERNEL); + cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL); - return new_page; + return new_folio; } static int @@ -986,7 +985,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, int progress, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; - struct page *prealloc = NULL; + struct folio *prealloc = NULL; again: progress = 0; @@ -1056,7 +1055,7 @@ again: * will allocate page according to address). This * could only happen if one pinned pte changed. */ - put_page(prealloc); + folio_put(prealloc); prealloc = NULL; } progress += 8; @@ -1093,7 +1092,7 @@ again: goto again; out: if (unlikely(prealloc)) - put_page(prealloc); + folio_put(prealloc); return ret; } From 14ddee4126fecff5c5c0a84940ba34f0bfe3e708 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:18:13 +0000 Subject: [PATCH 267/505] mm: use a folio in copy_present_pte() We still have to keep the page around because we need to know which page in the folio we're copying, but we can replace five implict calls to compound_head() with one. 
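Where the five implicit calls come from: each legacy page helper re-derives the head page internally, roughly like this (simplified from the page-flags and mm headers; the exact bodies may differ):

/* legacy helpers, simplified: each call hides its own compound_head() */
static __always_inline bool PageAnon(struct page *page)
{
        return folio_test_anon(page_folio(page));
}

static inline void get_page(struct page *page)
{
        folio_get(page_folio(page));
}

Doing folio = page_folio(page) once in copy_present_pte() and using the folio variants afterwards leaves exactly one head-page lookup, while the page pointer is kept for the per-page pieces (rmap duplication, PageAnonExclusive).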
Link: https://lkml.kernel.org/r/20230116191813.2145215-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 20a5c8087a5a..029f838587d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -906,25 +906,28 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long vm_flags = src_vma->vm_flags; pte_t pte = *src_pte; struct page *page; + struct folio *folio; page = vm_normal_page(src_vma, addr, pte); - if (page && PageAnon(page)) { + if (page) + folio = page_folio(page); + if (page && folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ - get_page(page); + folio_get(folio); if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) { - /* Page maybe pinned, we have to copy. */ - put_page(page); + /* Page may be pinned, we have to copy. */ + folio_put(folio); return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); } rss[MM_ANONPAGES]++; } else if (page) { - get_page(page); + folio_get(folio); page_dup_file_rmap(page, false); rss[mm_counter_file(page)]++; } @@ -937,7 +940,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } - VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page)); + VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page)); /* * If it's a shared mapping, mark it clean in From 9cfb816b1c6c99f4b3c1d4a0fb096162cd17ec71 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:25:06 +0000 Subject: [PATCH 268/505] mm/fs: convert inode_attach_wb() to take a folio Patch series "Writeback folio conversions". Remove more calls to compound_head() by passing folios around instead of pages. This patch (of 2): The only caller of inode_attach_wb() which doesn't pass NULL already has a folio, so convert the whole call-chain to take folios. 
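Only the dirtying path has a folio in hand; every other caller passes NULL and the wb is keyed off the current task instead. A hedged usage sketch (the two call sites are illustrative, not new code added by this patch):

/* page-cache dirtying path: the folio being dirtied selects the memcg */
inode_attach_wb(inode, folio);

/* paths without a folio: fall back to %current's memcg */
inode_attach_wb(inode, NULL);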
Link: https://lkml.kernel.org/r/20230116192507.2146150-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230116192507.2146150-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/fs-writeback.c | 6 +++--- include/linux/writeback.h | 12 ++++++------ mm/page-writeback.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6fba5a52127b..12f60f1ed2a0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -237,7 +237,7 @@ void wb_wait_for_completion(struct wb_completion *done) static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; -void __inode_attach_wb(struct inode *inode, struct page *page) +void __inode_attach_wb(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; @@ -245,8 +245,8 @@ void __inode_attach_wb(struct inode *inode, struct page *page) if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; - if (page) { - memcg_css = mem_cgroup_css_from_page(page); + if (folio) { + memcg_css = mem_cgroup_css_from_page(&folio->page); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2554b71765e9..3f1491b07474 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -207,7 +207,7 @@ static inline void wait_on_inode(struct inode *inode) #include #include -void __inode_attach_wb(struct inode *inode, struct page *page); +void __inode_attach_wb(struct inode *inode, struct folio *folio); void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); @@ -222,16 +222,16 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb * @inode: inode of interest - * @page: page being dirtied (may be NULL) + * @folio: folio being dirtied (may be NULL) * * If @inode doesn't have its wb, associate it with the wb matching the - * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * memcg of @folio or, if @folio is NULL, %current. May be called w/ or w/o * @inode->i_lock. 
*/ -static inline void inode_attach_wb(struct inode *inode, struct page *page) +static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { if (!inode->i_wb) - __inode_attach_wb(inode, page); + __inode_attach_wb(inode, folio); } /** @@ -290,7 +290,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) #else /* CONFIG_CGROUP_WRITEBACK */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) +static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5e892f20bed7..92b90d2ab513 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2652,7 +2652,7 @@ static void folio_account_dirtied(struct folio *folio, struct bdi_writeback *wb; long nr = folio_nr_pages(folio); - inode_attach_wb(inode, &folio->page); + inode_attach_wb(inode, folio); wb = inode_to_wb(inode); __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); From 75376c6fb93b99e94192cfff48222d11819ee917 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:25:07 +0000 Subject: [PATCH 269/505] mm: convert mem_cgroup_css_from_page() to mem_cgroup_css_from_folio() Only one caller doesn't have a folio, so move the page_folio() call to that one caller from mem_cgroup_css_from_folio(). Link: https://lkml.kernel.org/r/20230116192507.2146150-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/fs-writeback.c | 6 ++++-- include/linux/memcontrol.h | 2 +- mm/memcontrol.c | 12 +++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 12f60f1ed2a0..195dc23e0d83 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -246,7 +246,7 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) struct cgroup_subsys_state *memcg_css; if (folio) { - memcg_css = mem_cgroup_css_from_page(&folio->page); + memcg_css = mem_cgroup_css_from_folio(folio); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ @@ -859,6 +859,7 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes) { + struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -871,7 +872,8 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - css = mem_cgroup_css_from_page(page); + folio = page_folio(page); + css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) return; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e605fc885f08..35478695cabf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -890,7 +890,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); +struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index faeea84964aa..893427aded01 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -350,21 +350,19 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /** - * mem_cgroup_css_from_page - css of the memcg associated with a page - * @page: page of interest + * mem_cgroup_css_from_folio - 
css of the memcg associated with a folio + * @folio: folio of interest * * If memcg is bound to the default hierarchy, css of the memcg associated - * with @page is returned. The returned css remains associated with @page + * with @folio is returned. The returned css remains associated with @folio * until it is released. * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) { - struct mem_cgroup *memcg; - - memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; From 90c9d13a47d45f2f16530c4d62af2fa4d74dfd16 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:24 +0000 Subject: [PATCH 270/505] mm: remove page_evictable() Patch series "Remove leftover mlock/munlock page wrappers". We no longer need the various mlock page functions as all callers have folios. This patch (of 4): This function now has no users. Also update the unevictable-lru documentation to discuss folios instead of pages (mostly). [akpm@linux-foundation.org: fix Documentation/mm/unevictable-lru.rst underlining] Link: https://lkml.kernel.org/r/20230117145106.585b277b@canb.auug.org.au Link: https://lkml.kernel.org/r/20230116192827.2146732-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230116192827.2146732-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 91 ++++++++++++++-------------- mm/internal.h | 11 ---- 2 files changed, 47 insertions(+), 55 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 2a90d0721dd9..552c63eef954 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -12,7 +12,7 @@ Introduction This document describes the Linux memory manager's "Unevictable LRU" infrastructure and the use of this to manage several types of "unevictable" -pages. +folios. The document attempts to provide the overall rationale behind this mechanism and the rationale for some of the design decisions that drove the @@ -27,8 +27,8 @@ The Unevictable LRU =================== The Unevictable LRU facility adds an additional LRU list to track unevictable -pages and to hide these pages from vmscan. This mechanism is based on a patch -by Larry Woodman of Red Hat to address several scalability problems with page +folios and to hide these folios from vmscan. This mechanism is based on a patch +by Larry Woodman of Red Hat to address several scalability problems with folio reclaim in Linux. The problems have been observed at customer sites on large memory x86_64 systems. @@ -52,40 +52,41 @@ The infrastructure may also be able to handle other conditions that make pages unevictable, either by definition or by circumstance, in the future. -The Unevictable LRU Page List ------------------------------ +The Unevictable LRU Folio List +------------------------------ -The Unevictable LRU page list is a lie. It was never an LRU-ordered list, but a -companion to the LRU-ordered anonymous and file, active and inactive page lists; -and now it is not even a page list. But following familiar convention, here in -this document and in the source, we often imagine it as a fifth LRU page list. +The Unevictable LRU folio list is a lie. 
It was never an LRU-ordered +list, but a companion to the LRU-ordered anonymous and file, active and +inactive folio lists; and now it is not even a folio list. But following +familiar convention, here in this document and in the source, we often +imagine it as a fifth LRU folio list. The Unevictable LRU infrastructure consists of an additional, per-node, LRU list -called the "unevictable" list and an associated page flag, PG_unevictable, to -indicate that the page is being managed on the unevictable list. +called the "unevictable" list and an associated folio flag, PG_unevictable, to +indicate that the folio is being managed on the unevictable list. The PG_unevictable flag is analogous to, and mutually exclusive with, the -PG_active flag in that it indicates on which LRU list a page resides when +PG_active flag in that it indicates on which LRU list a folio resides when PG_lru is set. -The Unevictable LRU infrastructure maintains unevictable pages as if they were +The Unevictable LRU infrastructure maintains unevictable folios as if they were on an additional LRU list for a few reasons: - (1) We get to "treat unevictable pages just like we treat other pages in the + (1) We get to "treat unevictable folios just like we treat other folios in the system - which means we get to use the same code to manipulate them, the same code to isolate them (for migrate, etc.), the same code to keep track of the statistics, etc..." [Rik van Riel] - (2) We want to be able to migrate unevictable pages between nodes for memory + (2) We want to be able to migrate unevictable folios between nodes for memory defragmentation, workload management and memory hotplug. The Linux kernel - can only migrate pages that it can successfully isolate from the LRU + can only migrate folios that it can successfully isolate from the LRU lists (or "Movable" pages: outside of consideration here). If we were to - maintain pages elsewhere than on an LRU-like list, where they can be - detected by isolate_lru_page(), we would prevent their migration. + maintain folios elsewhere than on an LRU-like list, where they can be + detected by folio_isolate_lru(), we would prevent their migration. -The unevictable list does not differentiate between file-backed and anonymous, -swap-backed pages. This differentiation is only important while the pages are, -in fact, evictable. +The unevictable list does not differentiate between file-backed and +anonymous, swap-backed folios. This differentiation is only important +while the folios are, in fact, evictable. The unevictable list benefits from the "arrayification" of the per-node LRU lists and statistics originally proposed and posted by Christoph Lameter. @@ -158,7 +159,7 @@ These are currently used in three places in the kernel: Detecting Unevictable Pages --------------------------- -The function page_evictable() in mm/internal.h determines whether a page is +The function folio_evictable() in mm/internal.h determines whether a folio is evictable or not using the query function outlined above [see section :ref:`Marking address spaces unevictable `] to check the AS_UNEVICTABLE flag. @@ -167,7 +168,7 @@ For address spaces that are so marked after being populated (as SHM regions might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate the page tables for the region as does, for example, mlock(), nor need it make any special effort to push any pages in the SHM_LOCK'd area to the unevictable -list. Instead, vmscan will do this if and when it encounters the pages during +list. 
Instead, vmscan will do this if and when it encounters the folios during a reclamation scan. On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan @@ -176,41 +177,43 @@ condition is keeping them unevictable. If an unevictable region is destroyed, the pages are also "rescued" from the unevictable list in the process of freeing them. -page_evictable() also checks for mlocked pages by testing an additional page -flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is -faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED. +folio_evictable() also checks for mlocked folios by calling +folio_test_mlocked(), which is set when a folio is faulted into a +VM_LOCKED VMA, or found in a VMA being VM_LOCKED. -Vmscan's Handling of Unevictable Pages --------------------------------------- +Vmscan's Handling of Unevictable Folios +--------------------------------------- -If unevictable pages are culled in the fault path, or moved to the unevictable -list at mlock() or mmap() time, vmscan will not encounter the pages until they +If unevictable folios are culled in the fault path, or moved to the unevictable +list at mlock() or mmap() time, vmscan will not encounter the folios until they have become evictable again (via munlock() for example) and have been "rescued" from the unevictable list. However, there may be situations where we decide, -for the sake of expediency, to leave an unevictable page on one of the regular +for the sake of expediency, to leave an unevictable folio on one of the regular active/inactive LRU lists for vmscan to deal with. vmscan checks for such -pages in all of the shrink_{active|inactive|page}_list() functions and will -"cull" such pages that it encounters: that is, it diverts those pages to the +folios in all of the shrink_{active|inactive|page}_list() functions and will +"cull" such folios that it encounters: that is, it diverts those folios to the unevictable list for the memory cgroup and node being scanned. -There may be situations where a page is mapped into a VM_LOCKED VMA, but the -page is not marked as PG_mlocked. Such pages will make it all the way to -shrink_active_list() or shrink_page_list() where they will be detected when -vmscan walks the reverse map in folio_referenced() or try_to_unmap(). The page -is culled to the unevictable list when it is released by the shrinker. +There may be situations where a folio is mapped into a VM_LOCKED VMA, +but the folio does not have the mlocked flag set. Such folios will make +it all the way to shrink_active_list() or shrink_page_list() where they +will be detected when vmscan walks the reverse map in folio_referenced() +or try_to_unmap(). The folio is culled to the unevictable list when it +is released by the shrinker. -To "cull" an unevictable page, vmscan simply puts the page back on the LRU list -using putback_lru_page() - the inverse operation to isolate_lru_page() - after -dropping the page lock. Because the condition which makes the page unevictable -may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the -unevictable state of a page before placing it on the unevictable list. +To "cull" an unevictable folio, vmscan simply puts the folio back on +the LRU list using folio_putback_lru() - the inverse operation to +folio_isolate_lru() - after dropping the folio lock. 
Because the +condition which makes the folio unevictable may change once the folio +is unlocked, __pagevec_lru_add_fn() will recheck the unevictable state +of a folio before placing it on the unevictable list. MLOCKED Pages ============= -The unevictable page list is also useful for mlock(), in addition to ramfs and +The unevictable folio list is also useful for mlock(), in addition to ramfs and SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in NOMMU situations, all mappings are effectively mlocked. diff --git a/mm/internal.h b/mm/internal.h index 2d09a7a0600a..74bc1fe45711 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -159,17 +159,6 @@ static inline bool folio_evictable(struct folio *folio) return ret; } -static inline bool page_evictable(struct page *page) -{ - bool ret; - - /* Prevent address_space of inode and swap cache from being freed */ - rcu_read_lock(); - ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); - rcu_read_unlock(); - return ret; -} - /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. From 7efecffb8e7968c4a6c53177b0053ca4765fe233 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:25 +0000 Subject: [PATCH 271/505] mm: remove mlock_vma_page() All callers now have a folio and can call mlock_vma_folio(). Update the documentation to refer to mlock_vma_folio(). Link: https://lkml.kernel.org/r/20230116192827.2146732-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 6 +++--- mm/internal.h | 10 +--------- mm/mlock.c | 4 ++-- mm/rmap.c | 4 ++-- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 552c63eef954..9257235fe904 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -311,7 +311,7 @@ do end up getting faulted into this VM_LOCKED VMA, they will be handled in the fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. For each PTE (or PMD) being faulted into a VMA, the page add rmap function -calls mlock_vma_page(), which calls mlock_folio() when the VMA is VM_LOCKED +calls mlock_vma_folio(), which calls mlock_folio() when the VMA is VM_LOCKED (unless it is a PTE mapping of a part of a transparent huge page). Or when it is a newly allocated anonymous page, folio_add_lru_vma() calls mlock_new_folio() instead: similar to mlock_folio(), but can make better @@ -413,7 +413,7 @@ However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA, before mlocking any pages already present, if one of those pages were migrated before mlock_pte_range() reached it, it would get counted twice in mlock_count. To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO, -so that mlock_vma_page() will skip it. +so that mlock_vma_folio() will skip it. To complete page migration, we place the old and new pages back onto the LRU afterwards. The "unneeded" page - old page on success, new page on failure - @@ -552,6 +552,6 @@ and node unevictable list. rmap's folio_referenced_one(), called via vmscan's shrink_active_list() or shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), -check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page() +check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_folio() to correct them. 
Such pages are culled to the unevictable list when released by the shrinker. diff --git a/mm/internal.h b/mm/internal.h index 74bc1fe45711..0b74105ea363 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -518,7 +518,7 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); /* - * mlock_vma_page() and munlock_vma_page(): + * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * @@ -547,12 +547,6 @@ static inline void mlock_vma_folio(struct folio *folio, mlock_folio(folio); } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - mlock_vma_folio(page_folio(page), vma, compound); -} - void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, @@ -656,8 +650,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void mlock_new_folio(struct folio *folio) { } diff --git a/mm/mlock.c b/mm/mlock.c index 9e9c8be58277..b680f11879c3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -370,9 +370,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, /* * There is a slight chance that concurrent page migration, * or page reclaim finding a page of this now-VM_LOCKED vma, - * will call mlock_vma_page() and raise page's mlock_count: + * will call mlock_vma_folio() and raise page's mlock_count: * double counting, leaving the page unevictable indefinitely. - * Communicate this danger to mlock_vma_page() with VM_IO, + * Communicate this danger to mlock_vma_folio() with VM_IO, * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. * mmap_lock is held in write mode here, so this weird * combination should not be visible to other mmap_lock users; diff --git a/mm/rmap.c b/mm/rmap.c index 0d07c500fc86..33e15181ae73 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1260,7 +1260,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __page_check_anon_rmap(page, vma, address); } - mlock_vma_page(page, vma, compound); + mlock_vma_folio(folio, vma, compound); } /** @@ -1351,7 +1351,7 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); - mlock_vma_page(page, vma, compound); + mlock_vma_folio(folio, vma, compound); } /** From 672aa27d0bd241759376e62b78abb8aae1792479 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:26 +0000 Subject: [PATCH 272/505] mm: remove munlock_vma_page() All callers now have a folio and can call munlock_vma_folio(). Update the documentation to refer to munlock_vma_folio(). 
Link: https://lkml.kernel.org/r/20230116192827.2146732-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 4 ++-- kernel/events/uprobes.c | 1 - mm/internal.h | 8 -------- mm/rmap.c | 12 ++++++------ 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 9257235fe904..34b8b098c5bc 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -486,7 +486,7 @@ Before the unevictable/mlock changes, mlocking did not mark the pages in any way, so unmapping them required no processing. For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +munlock_vma_folio(), which calls munlock_folio() when the VMA is VM_LOCKED (unless it was a PTE mapping of a part of a transparent huge page). munlock_page() uses the mlock pagevec to batch up work to be done under @@ -510,7 +510,7 @@ which had been Copied-On-Write from the file pages now being truncated. Mlocked pages can be munlocked and deleted in this way: like with munmap(), for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +munlock_vma_folio(), which calls munlock_folio() when the VMA is VM_LOCKED (unless it was a PTE mapping of a part of a transparent huge page). However, if there is a racing munlock(), since mlock_vma_pages_range() starts diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 29f36d2ae129..1a3904e0179c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -22,7 +22,6 @@ #include /* folio_free_swap */ #include /* user_enable_single_step */ #include /* notifier mechanism */ -#include "../../mm/internal.h" /* munlock_vma_page */ #include #include #include diff --git a/mm/internal.h b/mm/internal.h index 0b74105ea363..ce462bf145b4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -548,7 +548,6 @@ static inline void mlock_vma_folio(struct folio *folio, } void munlock_folio(struct folio *folio); - static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { @@ -557,11 +556,6 @@ static inline void munlock_vma_folio(struct folio *folio, munlock_folio(folio); } -static inline void munlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - munlock_vma_folio(page_folio(page), vma, compound); -} void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); @@ -650,8 +644,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void munlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } static inline void mlock_new_folio(struct folio *folio) { } static inline bool need_mlock_drain(int cpu) { return false; } static inline void mlock_drain_local(void) { } diff --git a/mm/rmap.c b/mm/rmap.c index 33e15181ae73..0b5abdda1e6b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1431,14 +1431,14 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, } /* - * It would be tidy to reset PageAnon mapping when fully unmapped, - * but that might overwrite a racing page_add_anon_rmap - * which increments mapcount after us but sets mapping - * before us: so leave the reset 
to free_pages_prepare, - * and remember that it's only reliable while mapped. + * It would be tidy to reset folio_test_anon mapping when fully + * unmapped, but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping before us: + * so leave the reset to free_pages_prepare, and remember that + * it's only reliable while mapped. */ - munlock_vma_page(page, vma, compound); + munlock_vma_folio(folio, vma, compound); } /* From e0650a41f7d024b72669a2a2db846ef70281abd8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:27 +0000 Subject: [PATCH 273/505] mm: clean up mlock_page / munlock_page references in comments Change documentation and comments that refer to now-renamed functions. Link: https://lkml.kernel.org/r/20230116192827.2146732-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 30 +++++++++++++++------------- mm/memory-failure.c | 2 +- mm/swap.c | 4 ++-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 34b8b098c5bc..53e59433497a 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -298,7 +298,7 @@ treated as a no-op and mlock_fixup() simply returns. If the VMA passes some filtering as described in "Filtering Special VMAs" below, mlock_fixup() will attempt to merge the VMA with its neighbors or split off a subset of the VMA if the range does not cover the entire VMA. Any pages -already present in the VMA are then marked as mlocked by mlock_page() via +already present in the VMA are then marked as mlocked by mlock_folio() via mlock_pte_range() via walk_page_range() via mlock_vma_pages_range(). Before returning from the system call, do_mlock() or mlockall() will call @@ -373,20 +373,21 @@ Because of the VMA filtering discussed above, VM_LOCKED will not be set in any "special" VMAs. So, those VMAs will be ignored for munlock. If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the -specified range. All pages in the VMA are then munlocked by munlock_page() via +specified range. All pages in the VMA are then munlocked by munlock_folio() via mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same function used when mlocking a VMA range, with new flags for the VMA indicating that it is munlock() being performed. -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PG_mlocked and clears -PG_unevictable, moving the page from unevictable state to inactive LRU. +munlock_folio() uses the mlock pagevec to batch up work to be done +under lru_lock by __munlock_folio(). __munlock_folio() decrements the +folio's mlock_count, and when that reaches 0 it clears the mlocked flag +and clears the unevictable flag, moving the folio from unevictable state +to the inactive LRU. -But in practice that may not work ideally: the page may not yet have reached +But in practice that may not work ideally: the folio may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. 
In those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked +that the folio will be rescued to an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a VM_LOCKED VMA. @@ -489,15 +490,16 @@ For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls munlock_vma_folio(), which calls munlock_folio() when the VMA is VM_LOCKED (unless it was a PTE mapping of a part of a transparent huge page). -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PG_mlocked and clears -PG_unevictable, moving the page from unevictable state to inactive LRU. +munlock_folio() uses the mlock pagevec to batch up work to be done +under lru_lock by __munlock_folio(). __munlock_folio() decrements the +folio's mlock_count, and when that reaches 0 it clears the mlocked flag +and clears the unevictable flag, moving the folio from unevictable state +to the inactive LRU. -But in practice that may not work ideally: the page may not yet have reached +But in practice that may not work ideally: the folio may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. In those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked +that the folio will be rescued to an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a VM_LOCKED VMA. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ba0bbfc074ee..38f84bff8bdf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2167,7 +2167,7 @@ try_again: } /* - * __munlock_pagevec may clear a writeback page's LRU flag without + * __munlock_folio() may clear a writeback page's LRU flag without * page_lock. We need wait writeback completion for this page or it * may trigger vfs BUG while evict inode. */ diff --git a/mm/swap.c b/mm/swap.c index 5e4f92700c16..2a51faa34e64 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -201,7 +201,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think - * not, because __munlock_page() only clears the mlocked flag + * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily @@ -216,7 +216,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? - * But that leaves __mlock_page() in doubt whether another + * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. From 5b4bd90f9ac76136c7148684b12276d4ae2d64a2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:29:59 +0000 Subject: [PATCH 274/505] rmap: add folio parameter to __page_set_anon_rmap() Avoid the compound_head() call in PageAnon() by passing in the folio that all callers have. 
Also save me from wondering whether page->mapping can ever be overwritten on a tail page (I don't think it can, but I'm not 100% sure). Link: https://lkml.kernel.org/r/20230116192959.2147032-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton --- mm/rmap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 0b5abdda1e6b..43760d622040 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1131,19 +1131,20 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) /** * __page_set_anon_rmap - set up new anonymous rmap - * @page: Page or Hugepage to add to rmap + * @folio: Folio which contains page. + * @page: Page to add to rmap. * @vma: VM area to add page to. * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ -static void __page_set_anon_rmap(struct page *page, +static void __page_set_anon_rmap(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); - if (PageAnon(page)) + if (folio_test_anon(folio)) goto out; /* @@ -1155,14 +1156,14 @@ static void __page_set_anon_rmap(struct page *page, anon_vma = anon_vma->root; /* - * page_idle does a lockless/optimistic rmap scan on page->mapping. + * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); - page->index = linear_page_index(vma, address); + WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); + folio->index = linear_page_index(vma, address); out: if (exclusive) SetPageAnonExclusive(page); @@ -1254,7 +1255,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, if (likely(!folio_test_ksm(folio))) { /* address might be in next vma when migration races vma_adjust */ if (first) - __page_set_anon_rmap(page, vma, address, + __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); else __page_check_anon_rmap(page, vma, address); @@ -1297,7 +1298,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - __page_set_anon_rmap(&folio->page, vma, address, 1); + __page_set_anon_rmap(folio, &folio->page, vma, address, 1); } /** @@ -2528,7 +2529,7 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) - __page_set_anon_rmap(page, vma, address, + __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); } @@ -2541,6 +2542,6 @@ void hugepage_add_new_anon_rmap(struct page *page, /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); - __page_set_anon_rmap(page, vma, address, 1); + __page_set_anon_rmap(folio, page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ From 8808ecab3afc18958a9216911cd7967017e7057c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:39:39 +0000 Subject: [PATCH 275/505] filemap: convert filemap_map_pmd() 
to take a folio Patch series "Some more filemap folio conversions". Three more places which could easily be converted to folios. The third one fixes a minor bug in readahead_expand(), but it's only a performance bug and there are few users of readahead_expand(), so I don't think it's worth backporting. This patch (of 3): Save a few calls to compound_head(). We specify exactly which page from the folio to use by passing in start_pgoff, which means this will work for a folio which is larger than PMD size. The rest of the VM isn't prepared for that yet, but now this function is. Link: https://lkml.kernel.org/r/20230116193941.2148487-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230116193941.2148487-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton --- mm/filemap.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 31bf18ec6d01..b6b7efc9abc0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3259,22 +3259,24 @@ out_retry: } EXPORT_SYMBOL(filemap_fault); -static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) +static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, + pgoff_t start) { struct mm_struct *mm = vmf->vma->vm_mm; /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return true; } - if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { + if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { + struct page *page = folio_file_page(folio, start); vm_fault_t ret = do_set_pmd(vmf, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ - unlock_page(page); + folio_unlock(folio); return true; } } @@ -3284,8 +3286,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) /* See comment in handle_pte_fault() */ if (pmd_devmap_trans_unstable(vmf->pmd)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return true; } @@ -3368,7 +3370,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!folio) goto out; - if (filemap_map_pmd(vmf, &folio->page)) { + if (filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } From eff3b364b496e01ec789f3e15a51f9a3589e2a23 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:39:40 +0000 Subject: [PATCH 276/505] filemap: convert filemap_range_has_page() to use a folio The folio isn't returned from this function, so this is an entirely internal change. 
Link: https://lkml.kernel.org/r/20230116193941.2148487-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton --- mm/filemap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index b6b7efc9abc0..c915ded191f0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -470,7 +470,7 @@ EXPORT_SYMBOL(filemap_flush); bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - struct page *page; + struct folio *folio; XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; @@ -479,11 +479,11 @@ bool filemap_range_has_page(struct address_space *mapping, rcu_read_lock(); for (;;) { - page = xas_find(&xas, max); - if (xas_retry(&xas, page)) + folio = xas_find(&xas, max); + if (xas_retry(&xas, folio)) continue; /* Shadow entries don't count */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; /* * We don't need to try to pin this page; we're about to @@ -494,7 +494,7 @@ bool filemap_range_has_page(struct address_space *mapping, } rcu_read_unlock(); - return page != NULL; + return folio != NULL; } EXPORT_SYMBOL(filemap_range_has_page); From 11a980420719712f419dbb325940907f5d1afbdd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:39:41 +0000 Subject: [PATCH 277/505] readahead: convert readahead_expand() to use a folio Replace the uses of page with a folio. Also add a missing test for workingset in the leading edge expansion. Link: https://lkml.kernel.org/r/20230116193941.2148487-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Signed-off-by: Andrew Morton --- mm/readahead.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index b10f0cf81d80..47afbca1d122 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -801,21 +801,25 @@ void readahead_expand(struct readahead_control *ractl, /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; - struct page *page = xa_load(&mapping->i_pages, index); + struct folio *folio = xa_load(&mapping->i_pages, index); - if (page && !xa_is_value(page)) - return; /* Page apparently present */ + if (folio && !xa_is_value(folio)) + return; /* Folio apparently present */ - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) return; - if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { - put_page(page); + if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { + folio_put(folio); return; } - + if (unlikely(folio_test_workingset(folio)) && + !ractl->_workingset) { + ractl->_workingset = true; + psi_memstall_enter(&ractl->_pflags); + } ractl->_nr_pages++; - ractl->_index = page->index; + ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); @@ -824,19 +828,20 @@ void readahead_expand(struct readahead_control *ractl, /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; - struct page *page = xa_load(&mapping->i_pages, index); + struct folio *folio = xa_load(&mapping->i_pages, index); - if (page && !xa_is_value(page)) - return; /* Page apparently present */ + if (folio && !xa_is_value(folio)) + return; /* Folio apparently present */ - page = __page_cache_alloc(gfp_mask); - if 
(!page) + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) return; - if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { - put_page(page); + if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { + folio_put(folio); return; } - if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { + if (unlikely(folio_test_workingset(folio)) && + !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } From 98001fd63d59d2f99c90db823d322de91ff7d771 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 16 Jan 2023 16:43:32 +0000 Subject: [PATCH 278/505] mm/secretmem: remove redundant initiialization of pointer file The pointer file is being initialized with a value that is never read, it is being re-assigned later on. Clean up code by removing the redundant initialization. Link: https://lkml.kernel.org/r/20230116164332.79500-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/secretmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 04c3ac9448a1..be3fff86ba00 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -190,7 +190,7 @@ static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { - struct file *file = ERR_PTR(-ENOMEM); + struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); From 64517d6e1291b5e942b00c53674ecf33f918313f Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Mon, 16 Jan 2023 14:23:47 +0800 Subject: [PATCH 279/505] mm/damon/core: skip apply schemes if empty Sometimes there is no scheme in damon's context, for example just use damo record to monitor workload's data access pattern. If current damon context doesn't have any scheme in the list, kdamond has no need to iterate over list of all targets and regions but do nothing. So, skip apply schemes when ctx->schemes is empty. Link: https://lkml.kernel.org/r/20230116062347.1148553-1-huaisheng.ye@intel.com Signed-off-by: Huaisheng Ye Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 1bf0654ae189..2db8c53491ca 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1269,7 +1269,8 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) break; - kdamond_apply_schemes(ctx); + if (!list_empty(&ctx->schemes)) + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->ops.reset_aggregated) From 7ec7096b8577d3b899c1dae456a414f2d08c7ddb Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 17 Jan 2023 20:46:17 +0000 Subject: [PATCH 280/505] mm/page_ext: init page_ext early if there are no deferred struct pages page_ext must be initialized after all struct pages are initialized. Therefore, page_ext is initialized after page_alloc_init_late(), and can optionally be initialized earlier via early_page_ext kernel parameter which as a side effect also disables deferred struct pages. Allow to automatically init page_ext early when there are no deferred struct pages in order to be able to use page_ext during kernel boot and track for example page allocations early. 
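The ordering constraint is easier to see with the two init paths side by side. A condensed, illustrative restatement of the init/main.c hunks below (the function names here are stand-ins, not the real boot entry points):

bool deferred_struct_pages;             /* set by memmap_init_range() when it defers */

static void early_boot(void)            /* ~ mm_init(), right after vmalloc_init() */
{
        if (!deferred_struct_pages)     /* every struct page is already initialised */
                page_ext_init();
}

static void late_boot(void)             /* ~ kernel_init_freeable() */
{
        page_alloc_init_late();         /* deferred struct pages finish here */
        if (deferred_struct_pages)
                page_ext_init();        /* only now is it safe */
}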
[pasha.tatashin@soleen.com: fix build with CONFIG_PAGE_EXTENSION=n] Link: https://lkml.kernel.org/r/20230118155251.2522985-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20230117204617.1553748-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: Mike Rapoport (IBM) Acked-by: Vlastimil Babka Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Li Zhe Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 2 ++ init/main.c | 6 +++--- mm/page_alloc.c | 6 +++++- mm/page_ext.c | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 67314f648aeb..bc2e39090a1f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -29,6 +29,8 @@ struct page_ext_operations { bool need_shared_flags; }; +extern bool deferred_struct_pages; + #ifdef CONFIG_PAGE_EXTENSION /* diff --git a/init/main.c b/init/main.c index e1c3911d7c70..64cd2ff051c4 100644 --- a/init/main.c +++ b/init/main.c @@ -855,8 +855,8 @@ static void __init mm_init(void) pgtable_init(); debug_objects_mem_init(); vmalloc_init(); - /* Should be run after vmap initialization */ - if (early_page_ext_enabled()) + /* If no deferred init page_ext now, as vmap is fully initialized */ + if (!deferred_struct_pages) page_ext_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); @@ -1628,7 +1628,7 @@ static noinline void __init kernel_init_freeable(void) padata_init(); page_alloc_init_late(); /* Initialize page ext after all struct pages are initialized. */ - if (!early_page_ext_enabled()) + if (deferred_struct_pages) page_ext_init(); do_basic_setup(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0cfad30fb44c..ecb9e9acfe7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -430,6 +430,8 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +bool deferred_struct_pages __meminitdata; + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * During boot we initialize deferred pages on-demand, as needed, but once @@ -6803,8 +6805,10 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; - if (defer_init(nid, pfn, zone_end_pfn)) + if (defer_init(nid, pfn, zone_end_pfn)) { + deferred_struct_pages = true; break; + } } page = pfn_to_page(pfn); diff --git a/mm/page_ext.c b/mm/page_ext.c index e2c22ffdbb81..dc1626be458b 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -92,7 +92,7 @@ unsigned long page_ext_size; static unsigned long total_usage; static struct page_ext *lookup_page_ext(const struct page *page); -bool early_page_ext; +bool early_page_ext __meminitdata; static int __init setup_early_page_ext(char *str) { early_page_ext = true; From 076cf7ea67010d11a97912423f4cdbeff1bd1f5f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 5 Jan 2023 13:55:06 +0530 Subject: [PATCH 281/505] mm/page_alloc: use deferred_pages_enabled() wherever applicable Instead of directly accessing static deferred_pages, replace such instances with the helper deferred_pages_enabled(). No functional change is intended. 
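For reference, the helper is a thin wrapper around the same static key (roughly as defined in mm/page_alloc.c; the stub used when CONFIG_DEFERRED_STRUCT_PAGE_INIT is off simply returns false), which is why no functional change is expected:

static inline bool deferred_pages_enabled(void)
{
        return static_branch_unlikely(&deferred_pages);
}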
Link: https://lkml.kernel.org/r/20230105082506.241529-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ecb9e9acfe7f..717f12e83b85 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4288,7 +4288,7 @@ retry: * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } @@ -4337,7 +4337,7 @@ try_this_zone: } else { #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } From 6260ae3583456808dceb4d78077e6388c49ca6d7 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 18 Jan 2023 09:52:07 +0900 Subject: [PATCH 282/505] zsmalloc: rework zspage chain size selection Patch series "zsmalloc: make zspage chain size configurable". Computers are bad at division. We currently decide the best zspage chain size (max number of physical pages per-zspage) by looking at a `used percentage` value. This is not enough as we lose precision during usage percentage calculations For example, let's look at size class 208: pages per zspage wasted bytes used% 1 144 96 2 80 99 3 16 99 4 160 99 Current algorithm will select 2 page per zspage configuration, as it's the first one to reach 99%. However, 3 pages per zspage waste less memory. Change algorithm and select zspage configuration that has lowest wasted value. Link: https://lkml.kernel.org/r/20230118005210.2814763-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20230118005210.2814763-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 56 +++++++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9d27d9b00bce..00ab4cca49e4 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -822,42 +822,6 @@ out: return newfg; } -/* - * We have to decide on how many pages to link together - * to form a zspage for each size class. This is important - * to reduce wastage due to unusable space left at end of - * each zspage which is given as: - * wastage = Zp % class_size - * usage = Zp - wastage - * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... - * - * For example, for size class of 3/8 * PAGE_SIZE, we should - * link together 3 PAGE_SIZE sized pages to form a zspage - * since then we can perfectly fit in 8 such objects. 
- */ -static int get_pages_per_zspage(int class_size) -{ - int i, max_usedpc = 0; - /* zspage order which gives maximum used size per KB */ - int max_usedpc_order = 1; - - for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { - int zspage_size; - int waste, usedpc; - - zspage_size = i * PAGE_SIZE; - waste = zspage_size % class_size; - usedpc = (zspage_size - waste) * 100 / zspage_size; - - if (usedpc > max_usedpc) { - max_usedpc = usedpc; - max_usedpc_order = i; - } - } - - return max_usedpc_order; -} - static struct zspage *get_zspage(struct page *page) { struct zspage *zspage = (struct zspage *)page_private(page); @@ -2401,6 +2365,24 @@ static int zs_register_shrinker(struct zs_pool *pool) pool->name); } +static int calculate_zspage_chain_size(int class_size) +{ + int i, min_waste = INT_MAX; + int chain_size = 1; + + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { + int waste; + + waste = (i * PAGE_SIZE) % class_size; + if (waste < min_waste) { + min_waste = waste; + chain_size = i; + } + } + + return chain_size; +} + /** * zs_create_pool - Creates an allocation pool to work from. * @name: pool name to be created @@ -2445,7 +2427,7 @@ struct zs_pool *zs_create_pool(const char *name) size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; - pages_per_zspage = get_pages_per_zspage(size); + pages_per_zspage = calculate_zspage_chain_size(size); objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* From e1d1f3546913ae0af9da31df8183a6f3da0cd590 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 18 Jan 2023 09:52:08 +0900 Subject: [PATCH 283/505] zsmalloc: skip chain size calculation for pow_of_2 classes If a class size is power of 2 then it wastes no memory and the best configuration is 1 physical page per-zspage. Link: https://lkml.kernel.org/r/20230118005210.2814763-3-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 00ab4cca49e4..7b904f9bed70 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2370,6 +2370,9 @@ static int calculate_zspage_chain_size(int class_size) int i, min_waste = INT_MAX; int chain_size = 1; + if (is_power_of_2(class_size)) + return chain_size; + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { int waste; From 4ff93b292c0b91b9a7c9fb54643f122bbe59d8d0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 18 Jan 2023 09:52:09 +0900 Subject: [PATCH 284/505] zsmalloc: make zspage chain size configurable Remove hard coded limit on the maximum number of physical pages per-zspage. This will allow tuning of zsmalloc pool as zspage chain size changes `pages per-zspage` and `objects per-zspage` characteristics of size classes which also affects size classes clustering (the way size classes are merged). 
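
The effect of the new selection policy on a concrete class is easy to reproduce. The following standalone C program (a sketch, assuming 4 KiB pages and the pre-series chain limit of 4) mirrors the waste-minimising loop of calculate_zspage_chain_size() shown above for class size 208:

  #include <limits.h>
  #include <stdio.h>

  #define MODEL_PAGE_SIZE 4096    /* assumption: 4 KiB pages */
  #define MODEL_MAX_PAGES 4       /* assumption: chain limit of 4 */

  /* Same idea as calculate_zspage_chain_size(): pick the least-waste chain. */
  static int chain_size_for(int class_size)
  {
          int i, min_waste = INT_MAX, chain = 1;

          for (i = 1; i <= MODEL_MAX_PAGES; i++) {
                  int waste = (i * MODEL_PAGE_SIZE) % class_size;

                  printf("%d page(s): %d bytes wasted\n", i, waste);
                  if (waste < min_waste) {
                          min_waste = waste;
                          chain = i;
                  }
          }
          return chain;
  }

  int main(void)
  {
          /*
           * Prints 144, 80, 16, 160 and selects 3 pages per zspage for
           * class size 208; the old used-percentage heuristic stopped at 2.
           */
          printf("chosen chain size: %d\n", chain_size_for(208));
          return 0;
  }

With the chain limit made configurable here (and raised later in the series), MODEL_MAX_PAGES simply grows and more size classes get a tighter fit.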
Link: https://lkml.kernel.org/r/20230118005210.2814763-4-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- Documentation/mm/zsmalloc.rst | 168 ++++++++++++++++++++++++++++++++++ mm/Kconfig | 19 ++++ mm/zsmalloc.c | 12 +-- 3 files changed, 191 insertions(+), 8 deletions(-) diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst index 6e79893d6132..40323c9b39d8 100644 --- a/Documentation/mm/zsmalloc.rst +++ b/Documentation/mm/zsmalloc.rst @@ -80,3 +80,171 @@ Similarly, we assign zspage to: * ZS_ALMOST_FULL when n > N / f * ZS_EMPTY when n == 0 * ZS_FULL when n == N + + +Internals +========= + +zsmalloc has 255 size classes, each of which can hold a number of zspages. +Each zspage can contain up to ZSMALLOC_CHAIN_SIZE physical (0-order) pages. +The optimal zspage chain size for each size class is calculated during the +creation of the zsmalloc pool (see calculate_zspage_chain_size()). + +As an optimization, zsmalloc merges size classes that have similar +characteristics in terms of the number of pages per zspage and the number +of objects that each zspage can store. + +For instance, consider the following size classes::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 94 1536 0 0 0 0 0 3 0 + 100 1632 0 0 0 0 0 2 0 + ... + + +Size classes #95-99 are merged with size class #100. This means that when we +need to store an object of size, say, 1568 bytes, we end up using size class +#100 instead of size class #96. Size class #100 is meant for objects of size +1632 bytes, so each object of size 1568 bytes wastes 1632-1568=64 bytes. + +Size class #100 consists of zspages with 2 physical pages each, which can +hold a total of 5 objects. If we need to store 13 objects of size 1568, we +end up allocating three zspages, or 6 physical pages. + +However, if we take a closer look at size class #96 (which is meant for +objects of size 1568 bytes) and trace `calculate_zspage_chain_size()`, we +find that the most optimal zspage configuration for this class is a chain +of 5 physical pages::: + + pages per zspage wasted bytes used% + 1 960 76 + 2 352 95 + 3 1312 89 + 4 704 95 + 5 96 99 + +This means that a class #96 configuration with 5 physical pages can store 13 +objects of size 1568 in a single zspage, using a total of 5 physical pages. +This is more efficient than the class #100 configuration, which would use 6 +physical pages to store the same number of objects. + +As the zspage chain size for class #96 increases, its key characteristics +such as pages per-zspage and objects per-zspage also change. This leads to +dewer class mergers, resulting in a more compact grouping of classes, which +reduces memory wastage. + +Let's take a closer look at the bottom of `/sys/kernel/debug/zsmalloc/zramX/classes`::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 202 3264 0 0 0 0 0 4 0 + 254 4096 0 0 0 0 0 1 0 + ... + +Size class #202 stores objects of size 3264 bytes and has a maximum of 4 pages +per zspage. Any object larger than 3264 bytes is considered huge and belongs +to size class #254, which stores each object in its own physical page (objects +in huge classes do not share pages). + +Increasing the size of the chain of zspages also results in a higher watermark +for the huge size class and fewer huge classes overall. This allows for more +efficient storage of large objects. 
+ +For zspage chain size of 8, huge class watermark becomes 3632 bytes::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 202 3264 0 0 0 0 0 4 0 + 211 3408 0 0 0 0 0 5 0 + 217 3504 0 0 0 0 0 6 0 + 222 3584 0 0 0 0 0 7 0 + 225 3632 0 0 0 0 0 8 0 + 254 4096 0 0 0 0 0 1 0 + ... + +For zspage chain size of 16, huge class watermark becomes 3840 bytes::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 202 3264 0 0 0 0 0 4 0 + 206 3328 0 0 0 0 0 13 0 + 207 3344 0 0 0 0 0 9 0 + 208 3360 0 0 0 0 0 14 0 + 211 3408 0 0 0 0 0 5 0 + 212 3424 0 0 0 0 0 16 0 + 214 3456 0 0 0 0 0 11 0 + 217 3504 0 0 0 0 0 6 0 + 219 3536 0 0 0 0 0 13 0 + 222 3584 0 0 0 0 0 7 0 + 223 3600 0 0 0 0 0 15 0 + 225 3632 0 0 0 0 0 8 0 + 228 3680 0 0 0 0 0 9 0 + 230 3712 0 0 0 0 0 10 0 + 232 3744 0 0 0 0 0 11 0 + 234 3776 0 0 0 0 0 12 0 + 235 3792 0 0 0 0 0 13 0 + 236 3808 0 0 0 0 0 14 0 + 238 3840 0 0 0 0 0 15 0 + 254 4096 0 0 0 0 0 1 0 + ... + +Overall the combined zspage chain size effect on zsmalloc pool configuration::: + + pages per zspage number of size classes (clusters) huge size class watermark + 4 69 3264 + 5 86 3408 + 6 93 3504 + 7 112 3584 + 8 123 3632 + 9 140 3680 + 10 143 3712 + 11 159 3744 + 12 164 3776 + 13 180 3792 + 14 183 3808 + 15 188 3840 + 16 191 3840 + + +A synthetic test +---------------- + +zram as a build artifacts storage (Linux kernel compilation). + +* `CONFIG_ZSMALLOC_CHAIN_SIZE=4` + + zsmalloc classes stats::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + Total 13 51 413836 412973 159955 3 + + zram mm_stat::: + + 1691783168 628083717 655175680 0 655175680 60 0 34048 34049 + + +* `CONFIG_ZSMALLOC_CHAIN_SIZE=8` + + zsmalloc classes stats::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + Total 18 87 414852 412978 156666 0 + + zram mm_stat::: + + 1691803648 627793930 641703936 0 641703936 60 0 33591 33591 + +Using larger zspage chains may result in using fewer physical pages, as seen +in the example where the number of physical pages used decreased from 159955 +to 156666, at the same time maximum zsmalloc pool memory usage went down from +655175680 to 641703936 bytes. + +However, this advantage may be offset by the potential for increased system +memory pressure (as some zspages have larger chain sizes) in cases where there +is heavy internal fragmentation and zspool compaction is unable to relocate +objects and release zspages. In these cases, it is recommended to decrease +the limit on the size of the zspage chains (as specified by the +CONFIG_ZSMALLOC_CHAIN_SIZE option). diff --git a/mm/Kconfig b/mm/Kconfig index 39df30dcabe3..83b1d278b31c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -191,6 +191,25 @@ config ZSMALLOC_STAT information to userspace via debugfs. If unsure, say N. +config ZSMALLOC_CHAIN_SIZE + int "Maximum number of physical pages per-zspage" + default 4 + range 4 16 + depends on ZSMALLOC + help + This option sets the upper limit on the number of physical pages + that a zmalloc page (zspage) can consist of. The optimal zspage + chain size is calculated for each size class during the + initialization of the pool. + + Changing this option can alter the characteristics of size classes, + such as the number of pages per zspage and the number of objects + per zspage. 
This can also result in different configurations of + the pool, as zsmalloc merges size classes with similar + characteristics. + + For more information, see zsmalloc documentation. + menu "SLAB allocator options" choice diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7b904f9bed70..3aed46ab7e6c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -73,13 +73,6 @@ */ #define ZS_ALIGN 8 -/* - * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) - * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. - */ -#define ZS_MAX_ZSPAGE_ORDER 2 -#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) - #define ZS_HANDLE_SIZE (sizeof(unsigned long)) /* @@ -136,10 +129,13 @@ #define HUGE_BITS 1 #define FULLNESS_BITS 2 #define CLASS_BITS 8 -#define ISOLATED_BITS 3 +#define ISOLATED_BITS 5 #define MAGIC_VAL_BITS 8 #define MAX(a, b) ((a) >= (b) ? (a) : (b)) + +#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL)) + /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) From b46402fa894f88ddb42a2e841618a8f42f57d16d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 18 Jan 2023 09:52:10 +0900 Subject: [PATCH 285/505] zsmalloc: set default zspage chain size to 8 This changes key characteristics (pages per-zspage and objects per-zspage) of a number of size classes which in results in different pool configuration. With zspage chain size of 8 we have more size clases clusters (123) and higher huge size class watermark (3632 bytes). Please read zsmalloc documentation for more details. Link: https://lkml.kernel.org/r/20230118005210.2814763-5-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 83b1d278b31c..4751031f3f05 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -193,7 +193,7 @@ config ZSMALLOC_STAT config ZSMALLOC_CHAIN_SIZE int "Maximum number of physical pages per-zspage" - default 4 + default 8 range 4 16 depends on ZSMALLOC help From 04bac040bc71b4b37550eed5854f34ca161756f9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 18 Jan 2023 09:40:39 -0800 Subject: [PATCH 286/505] mm/hugetlb: convert get_hwpoison_huge_page() to folios Straightforward conversion of get_hwpoison_huge_page() to get_hwpoison_hugetlb_folio(). 
Reduces two references to a head page in memory-failure.c [arnd@arndb.de: fix get_hwpoison_hugetlb_folio() stub] Link: https://lkml.kernel.org/r/20230119111920.635260-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20230118174039.14247-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Signed-off-by: Arnd Bergmann Acked-by: Naoya Horiguchi Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 10 +++++----- mm/memory-failure.c | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cf60fe741c1d..a51e6daacac6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -172,7 +172,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); +int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void putback_active_hugepage(struct page *page); @@ -418,7 +418,7 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list) return -EBUSY; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) +static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 291ad4cb02f9..0f9df0143772 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7268,18 +7268,18 @@ unlock: return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) +int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { int ret = 0; *hugetlb = false; spin_lock_irq(&hugetlb_lock); - if (PageHeadHuge(page)) { + if (folio_test_hugetlb(folio)) { *hugetlb = true; - if (HPageFreed(page)) + if (folio_test_hugetlb_freed(folio)) ret = 0; - else if (HPageMigratable(page) || unpoison) - ret = get_page_unless_zero(page); + else if (folio_test_hugetlb_migratable(folio) || unpoison) + ret = folio_try_get(folio); else ret = -EBUSY; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 38f84bff8bdf..0a382191737f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1257,28 +1257,28 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) static int __get_hwpoison_page(struct page *page, unsigned long flags) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb, false); + ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false); if (hugetlb) return ret; /* - * This check prevents from calling get_page_unless_zero() for any - * unsupported type of page in order to reduce the risk of unexpected - * races caused by taking a page refcount. + * This check prevents from calling folio_try_get() for any + * unsupported type of folio in order to reduce the risk of unexpected + * races caused by taking a folio refcount. 
*/ - if (!HWPoisonHandlable(head, flags)) + if (!HWPoisonHandlable(&folio->page, flags)) return -EBUSY; - if (get_page_unless_zero(head)) { - if (head == compound_head(page)) + if (folio_try_get(folio)) { + if (folio == page_folio(page)) return 1; pr_info("%#lx cannot catch tail\n", page_to_pfn(page)); - put_page(head); + folio_put(folio); } return 0; @@ -1347,11 +1347,11 @@ out: static int __get_unpoison_page(struct page *page) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb, true); + ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true); if (hugetlb) return ret; From 5649d113ffce9f532a9ecc5ab96a93e02efbf283 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 18 Jan 2023 20:13:03 +0800 Subject: [PATCH 287/505] swap_state: update shadow_nodes for anonymous page Shadow_nodes is for shadow nodes reclaiming of workingset handling, it is updated when page cache add or delete since long time ago workingset only supported page cache. But when workingset supports anonymous page detection, we missied updating shadow nodes for it. This caused that shadow nodes of anonymous page will never be reclaimd by scan_shadow_nodes() even they use much memory and system memory is tense. So update shadow_nodes of anonymous page when swap cache is add or delete by calling xas_set_update(..workingset_update_node). Link: https://lkml.kernel.org/r/202301182013032211005@zte.com.cn Fixes: aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU") Signed-off-by: Yang Yang Reviewed-by: Ran Xiaokai Cc: Bagas Sanjaya Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/xarray.h | 3 ++- mm/swap_state.c | 6 ++++++ mm/workingset.c | 21 +++++++++++++-------- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 44dd6d6e01bc..741703b45f61 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1643,7 +1643,8 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index, * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. - * This is advanced functionality and is only needed by the page cache. + * This is advanced functionality and is only needed by the page + * cache and swap cache. 
*/ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { diff --git a/mm/swap_state.c b/mm/swap_state.c index cb9aaa00951d..7a003d8abb37 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -94,6 +94,8 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, unsigned long i, nr = folio_nr_pages(folio); void *old; + xas_set_update(&xas, workingset_update_node); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); @@ -145,6 +147,8 @@ void __delete_from_swap_cache(struct folio *folio, pgoff_t idx = swp_offset(entry); XA_STATE(xas, &address_space->i_pages, idx); + xas_set_update(&xas, workingset_update_node); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); @@ -252,6 +256,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, struct address_space *address_space = swap_address_space(entry); XA_STATE(xas, &address_space->i_pages, curr); + xas_set_update(&xas, workingset_update_node); + xa_lock_irq(&address_space->i_pages); xas_for_each(&xas, old, end) { if (!xa_is_value(old)) diff --git a/mm/workingset.c b/mm/workingset.c index f194d13beabb..00c6f4d9d9be 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -657,11 +657,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out; } - if (!spin_trylock(&mapping->host->i_lock)) { - xa_unlock(&mapping->i_pages); - spin_unlock_irq(lru_lock); - ret = LRU_RETRY; - goto out; + /* For page cache we need to hold i_lock */ + if (mapping->host != NULL) { + if (!spin_trylock(&mapping->host->i_lock)) { + xa_unlock(&mapping->i_pages); + spin_unlock_irq(lru_lock); + ret = LRU_RETRY; + goto out; + } } list_lru_isolate(lru, item); @@ -683,9 +686,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, out_invalid: xa_unlock_irq(&mapping->i_pages); - if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); - spin_unlock(&mapping->host->i_lock); + if (mapping->host != NULL) { + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); + } ret = LRU_REMOVED_RETRY; out: cond_resched(); From 148aa87e4f631e98d926d006604116fd2b2f3a93 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Wed, 18 Jan 2023 17:05:23 +0900 Subject: [PATCH 288/505] mm/cma: fix potential memory loss on cma_declare_contiguous_nid Suppose memblock_alloc_range_nid() with highmem_start succeeds when cma_declare_contiguous_nid is called with !fixed on a 32-bit system with PHYS_ADDR_T_64BIT enabled with memblock.bottom_up == false. But the next trial to memblock_alloc_range_nid() to allocate in [SIZE_4G, limits) nullifies former successfully allocated addr and it retries memblock_alloc_ragne_nid(). In this situation, the first successfully allocated address area is lost. Change the order of allocation (SIZE_4G, high_memory and base) and check whether the allocated succeeded to prevent potential memory loss. 
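
The failure mode is easier to see in a compressed model. Below is a small standalone C sketch (try_range() is an invented stand-in for memblock_alloc_range_nid(), which returns 0 on failure) contrasting the old flow with the reordered, guarded one:

  #include <stdbool.h>
  #include <stdio.h>

  /* Stand-in for memblock_alloc_range_nid(); 0 means the attempt failed. */
  static unsigned long long try_range(const char *range, bool ok)
  {
          printf("  try %-22s -> %s\n", range, ok ? "ok" : "fail");
          return ok ? 0x40000000ULL : 0;
  }

  int main(void)
  {
          unsigned long long addr;

          /*
           * Old order: highmem attempt first, then an unconditional
           * bottom-up [4G, limit) attempt that overwrites addr even
           * when the first attempt had already succeeded.
           */
          addr = try_range("[highmem_start, limit)", true);
          addr = try_range("[4G, limit)", false);        /* clobbers addr */
          printf("old order loses the first allocation, addr = %#llx\n\n", addr);

          /* New order: [4G, limit) first, later attempts only while !addr. */
          addr = try_range("[4G, limit)", false);
          if (!addr)
                  addr = try_range("[highmem_start, limit)", true);
          if (!addr)
                  addr = try_range("[base, limit)", true);
          printf("new order keeps it, addr = %#llx\n", addr);
          return 0;
  }

The reserved-but-forgotten region in the old flow is the "potential memory loss" the patch title refers to.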
Link: https://lkml.kernel.org/r/20230118080523.44522-1-ppbuk5246@gmail.com Signed-off-by: Levi Yun Cc: Laurent Pinchart Cc: Marek Szyprowski Cc: Joonsoo Kim Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/cma.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index a75b17b03b66..a7263aa02c92 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -321,18 +321,6 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, } else { phys_addr_t addr = 0; - /* - * All pages in the reserved area must come from the same zone. - * If the requested region crosses the low/high memory boundary, - * try allocating from high memory first and fall back to low - * memory in case of failure. - */ - if (base < highmem_start && limit > highmem_start) { - addr = memblock_alloc_range_nid(size, alignment, - highmem_start, limit, nid, true); - limit = highmem_start; - } - /* * If there is enough memory, try a bottom-up allocation first. * It will place the new cma area close to the start of the node @@ -350,6 +338,18 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, } #endif + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. + */ + if (!addr && base < highmem_start && limit > highmem_start) { + addr = memblock_alloc_range_nid(size, alignment, + highmem_start, limit, nid, true); + limit = highmem_start; + } + if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, limit, nid, true); From d0634a622be35df2dfa80dc14ee1482ae1889cb2 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Tue, 17 Jan 2023 21:54:03 -0500 Subject: [PATCH 289/505] Documentation: mm: use `s/higmem/highmem/` fix typo for highmem We should use highmem replace higmem. Link: https://lkml.kernel.org/r/20230118025403.1531-1-wangdeming@inspur.com Signed-off-by: Deming Wang Reviewed-by: Ira Weiny Cc: "Fabio M. De Francesco" Cc: Jonathan Corbet Cc: Mike Rapoport (IBM) Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- Documentation/mm/highmem.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst index e691a06fb337..4503868b0865 100644 --- a/Documentation/mm/highmem.rst +++ b/Documentation/mm/highmem.rst @@ -83,7 +83,7 @@ list shows them in order of preference of use. for pages which are known to not come from ZONE_HIGHMEM. However, it is always safe to use kmap_local_page() / kunmap_local(). - While it is significantly faster than kmap(), for the higmem case it + While it is significantly faster than kmap(), for the highmem case it comes with restrictions about the pointers validity. Contrary to kmap() mappings, the local mappings are only valid in the context of the caller and cannot be handed to other contexts. This implies that users must From e6d2c436ff693869e83a65e61643b922e193e162 Mon Sep 17 00:00:00 2001 From: "Herton R. Krzesinski" Date: Mon, 16 Jan 2023 19:49:21 -0300 Subject: [PATCH 290/505] tools/mm: allow users to provide additional cflags/ldflags Right now there is no way to provide additional cflags/ldflags when building tools/vm binaries. And using eg. make CFLAGS= will override the CFLAGS being set in the Makefile, making the build fail since it requires the include of the ../lib dir (for libapi). 
This change then allows you to specify: CFLAGS= LDFLAGS= make V=1 -C tools/vm And the options will be correctly appended as can be seen from the make output. Link: https://lkml.kernel.org/r/20230116224921.4106324-1-herton@redhat.com Signed-off-by: Herton R. Krzesinski Cc: Don Zickus Cc: Justin Forbes Cc: Vlastimil Babka Cc: Scott Weaver Signed-off-by: Andrew Morton --- tools/mm/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 9860622cbb15..6c1da51f4177 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -8,8 +8,8 @@ TARGETS=page-types slabinfo page_owner_sort LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapi.a -CFLAGS = -Wall -Wextra -I../lib/ -LDFLAGS = $(LIBS) +CFLAGS += -Wall -Wextra -I../lib/ +LDFLAGS += $(LIBS) all: $(TARGETS) From b507808ebce23561d4ff8c2aa1fb949fe402bc61 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 19 Jan 2023 16:03:43 +0000 Subject: [PATCH 291/505] mm: implement memory-deny-write-execute as a prctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: In-kernel support for memory-deny-write-execute (MDWE)", v2. The background to this is that systemd has a configuration option called MemoryDenyWriteExecute [2], implemented as a SECCOMP BPF filter. Its aim is to prevent a user task from inadvertently creating an executable mapping that is (or was) writeable. Since such BPF filter is stateless, it cannot detect mappings that were previously writeable but subsequently changed to read-only. Therefore the filter simply rejects any mprotect(PROT_EXEC). The side-effect is that on arm64 with BTI support (Branch Target Identification), the dynamic loader cannot change an ELF section from PROT_EXEC to PROT_EXEC|PROT_BTI using mprotect(). For libraries, it can resort to unmapping and re-mapping but for the main executable it does not have a file descriptor. The original bug report in the Red Hat bugzilla - [3] - and subsequent glibc workaround for libraries - [4]. This series adds in-kernel support for this feature as a prctl PR_SET_MDWE, that is inherited on fork(). The prctl denies PROT_WRITE | PROT_EXEC mappings. Like the systemd BPF filter it also denies adding PROT_EXEC to mappings. However unlike the BPF filter it only denies it if the mapping didn't previous have PROT_EXEC. This allows to PROT_EXEC -> PROT_EXEC | PROT_BTI with mprotect(), which is a problem with the BPF filter. This patch (of 2): The aim of such policy is to prevent a user task from creating an executable mapping that is also writeable. An example of mmap() returning -EACCESS if the policy is enabled: mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0); Similarly, mprotect() would return -EACCESS below: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC); The BPF filter that systemd MDWE uses is stateless, and disallows mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to be enabled if it was already PROT_EXEC, which allows the following case: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI); where PROT_BTI enables branch tracking identification on arm64. 
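
The new prctl can be exercised from userspace with a few lines of C (a sketch; the PR_SET_MDWE and PR_MDWE_REFUSE_EXEC_GAIN values are taken from the uapi hunk below and defined locally in case the libc headers do not carry them yet):

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <sys/prctl.h>
  #include <unistd.h>

  #ifndef PR_SET_MDWE
  #define PR_SET_MDWE              65
  #define PR_MDWE_REFUSE_EXEC_GAIN  1
  #endif

  int main(void)
  {
          long page = sysconf(_SC_PAGESIZE);
          void *p;

          if (prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L))
                  perror("PR_SET_MDWE (kernel without MDWE support?)");

          /* Case (a): denied once MDWE is enabled. */
          p = mmap(NULL, page, PROT_READ | PROT_WRITE | PROT_EXEC,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          printf("mmap(W|X): %s\n",
                 p == MAP_FAILED ? strerror(errno) : "allowed");

          /* Case (d): a mapping that already had PROT_EXEC may keep it. */
          p = mmap(NULL, page, PROT_READ | PROT_EXEC,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (p != MAP_FAILED)
                  printf("mprotect(R|X on R|X map): %s\n",
                         mprotect(p, page, PROT_READ | PROT_EXEC) ?
                         strerror(errno) : "allowed");
          return 0;
  }

Since the flag is inherited across fork() and cannot be cleared once set, a throwaway child process is the safest place to experiment with it.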
Link: https://lkml.kernel.org/r/20230119160344.54358-1-joey.gouly@arm.com Link: https://lkml.kernel.org/r/20230119160344.54358-2-joey.gouly@arm.com Signed-off-by: Joey Gouly Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Alexander Viro Cc: Jeremy Linton Cc: Kees Cook Cc: Lennart Poettering Cc: Mark Brown Cc: nd Cc: Shuah Khan Cc: Szabolcs Nagy Cc: Topi Miettinen Cc: Zbigniew Jędrzejewski-Szmek Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mman.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/sched/coredump.h | 6 +++++- include/uapi/linux/prctl.h | 6 ++++++ kernel/sys.c | 33 +++++++++++++++++++++++++++++++++ mm/mmap.c | 10 ++++++++++ mm/mprotect.c | 5 +++++ 6 files changed, 93 insertions(+), 1 deletion(-) diff --git a/include/linux/mman.h b/include/linux/mman.h index 58b3abd457a3..cee1e4b566d8 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags) } unsigned long vm_commit_limit(void); + +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + */ +static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags) +{ + if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return false; + + if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) + return true; + + if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) + return true; + + return false; +} + #endif /* _LINUX_MMAN_H */ diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 8270ad7ae14c..0e17ae7fbfd3 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm) * lifecycle of this mm, just for simplicity. 
*/ #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ + +#define MMF_HAS_MDWE 28 +#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) + #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK) + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index a5e06dcbba13..1312a137f7fb 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -281,6 +281,12 @@ struct prctl_mm_map { # define PR_SME_VL_LEN_MASK 0xffff # define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ +/* Memory deny write / execute */ +#define PR_SET_MDWE 65 +# define PR_MDWE_REFUSE_EXEC_GAIN 1 + +#define PR_GET_MDWE 66 + #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 diff --git a/kernel/sys.c b/kernel/sys.c index 5fd54bf0e886..b3cab94545ed 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2348,6 +2348,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg3 || arg4 || arg5) + return -EINVAL; + + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + return -EINVAL; + + if (bits & PR_MDWE_REFUSE_EXEC_GAIN) + set_bit(MMF_HAS_MDWE, ¤t->mm->flags); + else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return -EPERM; /* Cannot unset the flag */ + + return 0; +} + +static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ? + PR_MDWE_REFUSE_EXEC_GAIN : 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2623,6 +2650,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_MDWE: + error = prctl_set_mdwe(arg2, arg3, arg4, arg5); + break; + case PR_GET_MDWE: + error = prctl_get_mdwe(arg2, arg3, arg4, arg5); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; diff --git a/mm/mmap.c b/mm/mmap.c index 335ba3df9898..ffc0815cd7fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2669,6 +2669,16 @@ cannot_expand: vma_set_anonymous(vma); } + if (map_deny_write_exec(vma, vma->vm_flags)) { + error = -EACCES; + if (file) + goto close_and_free_vma; + else if (vma->vm_file) + goto unmap_and_free_vma; + else + goto free_vma; + } + /* Allow architectures to sanity-check the vm_flags */ if (!arch_validate_flags(vma->vm_flags)) { error = -EINVAL; diff --git a/mm/mprotect.c b/mm/mprotect.c index 6ecdf0671b81..6a22f3ad9b84 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -799,6 +799,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } + if (map_deny_write_exec(vma, newflags)) { + error = -EACCES; + goto out; + } + /* Allow architectures to sanity-check the new flags */ if (!arch_validate_flags(newflags)) { error = -EINVAL; From 4cf1fe34fd18b752ae2403927277715d4444f331 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Jan 2023 16:03:44 +0000 Subject: [PATCH 292/505] kselftest: vm: add tests for memory-deny-write-execute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add some tests to cover the new PR_SET_MDWE prctl. 
Link: https://lkml.kernel.org/r/20230119160344.54358-3-joey.gouly@arm.com Co-developed-by: Joey Gouly Signed-off-by: Joey Gouly Signed-off-by: Kees Cook Cc: Shuah Khan Cc: Alexander Viro Cc: Catalin Marinas Cc: Jeremy Linton Cc: Lennart Poettering Cc: Mark Brown Cc: nd Cc: Szabolcs Nagy Cc: Topi Miettinen Cc: Zbigniew Jędrzejewski-Szmek Cc: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/mdwe_test.c | 197 +++++++++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 tools/testing/selftests/mm/mdwe_test.c diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 0a44d77f8437..d90cdc06aa59 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -60,6 +60,7 @@ TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test TEST_GEN_FILES += ksm_tests TEST_GEN_PROGS += ksm_functional_tests +TEST_GEN_PROGS += mdwe_test ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c new file mode 100644 index 000000000000..f466a099f1bf --- /dev/null +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifdef __aarch64__ +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#ifndef __aarch64__ +# define PROT_BTI 0 +#endif + +TEST(prctl_flags) +{ + EXPECT_LT(prctl(PR_SET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 0L, 7L), 0); + + EXPECT_LT(prctl(PR_GET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 0L, 7L), 0); +} + +FIXTURE(mdwe) +{ + void *p; + int flags; + size_t size; + pid_t pid; +}; + +FIXTURE_VARIANT(mdwe) +{ + bool enabled; + bool forked; +}; + +FIXTURE_VARIANT_ADD(mdwe, stock) +{ + .enabled = false, + .forked = false, +}; + +FIXTURE_VARIANT_ADD(mdwe, enabled) +{ + .enabled = true, + .forked = false, +}; + +FIXTURE_VARIANT_ADD(mdwe, forked) +{ + .enabled = true, + .forked = true, +}; + +FIXTURE_SETUP(mdwe) +{ + int ret, status; + + self->p = NULL; + self->flags = MAP_SHARED | MAP_ANONYMOUS; + self->size = getpagesize(); + + if (!variant->enabled) + return; + + ret = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L); + ASSERT_EQ(ret, 0) { + TH_LOG("PR_SET_MDWE failed or unsupported"); + } + + ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L); + ASSERT_EQ(ret, 1); + + if (variant->forked) { + self->pid = fork(); + ASSERT_GE(self->pid, 0) { + TH_LOG("fork failed\n"); + } + + if (self->pid > 0) { + ret = waitpid(self->pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + exit(WEXITSTATUS(status)); + } + } +} + +FIXTURE_TEARDOWN(mdwe) +{ + if (self->p && self->p != MAP_FAILED) + munmap(self->p, self->size); +} + +TEST_F(mdwe, mmap_READ_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + EXPECT_NE(self->p, MAP_FAILED); +} + +TEST_F(mdwe, mmap_WRITE_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_WRITE | PROT_EXEC, self->flags, 0, 0); + if (variant->enabled) { + EXPECT_EQ(self->p, MAP_FAILED); + } else { + EXPECT_NE(self->p, MAP_FAILED); + } +} + +TEST_F(mdwe, 
mprotect_stay_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC); + EXPECT_EQ(ret, 0); +} + +TEST_F(mdwe, mprotect_add_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC); + if (variant->enabled) { + EXPECT_LT(ret, 0); + } else { + EXPECT_EQ(ret, 0); + } +} + +TEST_F(mdwe, mprotect_WRITE_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_WRITE, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_WRITE | PROT_EXEC); + if (variant->enabled) { + EXPECT_LT(ret, 0); + } else { + EXPECT_EQ(ret, 0); + } +} + +TEST_F(mdwe, mmap_FIXED) +{ + void *p, *p2; + + p2 = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + p = mmap(self->p + self->size, self->size, PROT_READ | PROT_EXEC, + self->flags | MAP_FIXED, 0, 0); + if (variant->enabled) { + EXPECT_EQ(p, MAP_FAILED); + } else { + EXPECT_EQ(p, self->p); + } +} + +TEST_F(mdwe, arm64_BTI) +{ + int ret; + +#ifdef __aarch64__ + if (!(getauxval(AT_HWCAP2) & HWCAP2_BTI)) +#endif + SKIP(return, "HWCAP2_BTI not supported"); + + self->p = mmap(NULL, self->size, PROT_EXEC, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_EXEC | PROT_BTI); + EXPECT_EQ(ret, 0); +} + +TEST_HARNESS_MAIN From 6061e740822530a4ef443548b19c4e0bc6342c7a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 18 Jan 2023 23:01:10 -0500 Subject: [PATCH 293/505] mm/kmemleak: simplify kmemleak_cond_resched() usage Patch series "mm/kmemleak: Simplify kmemleak_cond_resched() & fix UAF", v2. It was found that a KASAN use-after-free error was reported in the kmemleak_scan() function. After further examination, it is believe that even though a reference is taken from the current object, it does not prevent the object pointed to by the next pointer from going away after a cond_resched(). To fix that, additional flags are added to make sure that the current object won't be removed from the object_list during the duration of the cond_resched() to ensure the validity of the next pointer. While making the change, I also simplify the current usage of kmemleak_cond_resched() to make it easier to understand. This patch (of 2): The presence of a pinned argument and the 64k loop count make kmemleak_cond_resched() a bit more complex to read. The pinned argument is used only by first kmemleak_scan() loop. Simplify the usage of kmemleak_cond_resched() by removing the pinned argument and always do a get_object()/put_object() sequence. In addition, the 64k loop is removed by using need_resched() to decide if kmemleak_cond_resched() should be called. 
Link: https://lkml.kernel.org/r/20230119040111.350923-1-longman@redhat.com Link: https://lkml.kernel.org/r/20230119040111.350923-2-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Catalin Marinas Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/kmemleak.c | 48 ++++++++++++------------------------------------ 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 55dc8b8b0616..69327b71fcf9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1472,22 +1472,17 @@ static void scan_gray_list(void) /* * Conditionally call resched() in an object iteration loop while making sure * that the given object won't go away without RCU read lock by performing a - * get_object() if !pinned. - * - * Return: false if can't do a cond_resched() due to get_object() failure - * true otherwise + * get_object() if necessaary. */ -static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +static void kmemleak_cond_resched(struct kmemleak_object *object) { - if (!pinned && !get_object(object)) - return false; + if (!get_object(object)) + return; /* Try next object */ rcu_read_unlock(); cond_resched(); rcu_read_lock(); - if (!pinned) - put_object(object); - return true; + put_object(object); } /* @@ -1501,15 +1496,12 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop_cnt = 0; jiffies_last_scan = jiffies; /* prepare the kmemleak_object's */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - bool obj_pinned = false; - raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1535,19 +1527,13 @@ static void kmemleak_scan(void) /* reset the reference count (whiten the object) */ object->count = 0; - if (color_gray(object) && get_object(object)) { + if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); - obj_pinned = true; - } raw_spin_unlock_irq(&object->lock); - /* - * Do a cond_resched() every 64k objects to avoid soft lockup. - */ - if (!(++loop_cnt & 0xffff) && - !kmemleak_cond_resched(object, obj_pinned)) - loop_cnt--; /* Try again on next object */ + if (need_resched()) + kmemleak_cond_resched(object); } rcu_read_unlock(); @@ -1614,14 +1600,9 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); - loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { - /* - * Do a cond_resched() every 64k objects to avoid soft lockup. - */ - if (!(++loop_cnt & 0xffff) && - !kmemleak_cond_resched(object, false)) - loop_cnt--; /* Try again on next object */ + if (need_resched()) + kmemleak_cond_resched(object); /* * This is racy but we can save the overhead of lock/unlock @@ -1656,14 +1637,9 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); - loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { - /* - * Do a cond_resched() every 64k objects to avoid soft lockup. 
- */ - if (!(++loop_cnt & 0xffff) && - !kmemleak_cond_resched(object, false)) - loop_cnt--; /* Try again on next object */ + if (need_resched()) + kmemleak_cond_resched(object); /* * This is racy but we can save the overhead of lock/unlock From 782e4179535971c3574c367bfaaefea8970b3e0b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 18 Jan 2023 23:01:11 -0500 Subject: [PATCH 294/505] mm/kmemleak: fix UAF bug in kmemleak_scan() Commit 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") fixes soft lockup problem in kmemleak_scan() by periodically doing a cond_resched(). It does take a reference of the current object before doing it. Unfortunately, if the object has been deleted from the object_list, the next object pointed to by its next pointer may no longer be valid after coming back from cond_resched(). This can result in use-after-free and other nasty problem. Fix this problem by adding a del_state flag into kmemleak_object structure to synchronize the object deletion process between kmemleak_cond_resched() and __remove_object() to make sure that the object remained in the object_list in the duration of the cond_resched() call. Link: https://lkml.kernel.org/r/20230119040111.350923-3-longman@redhat.com Fixes: 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") Signed-off-by: Waiman Long Reviewed-by: Catalin Marinas Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/kmemleak.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 69327b71fcf9..d9b242cfdb1c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -13,11 +13,12 @@ * * The following locks and mutexes are used by kmemleak: * - * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and - * accesses to the object_tree_root (or object_phys_tree_root). The - * object_list is the main list holding the metadata (struct kmemleak_object) - * for the allocated memory blocks. The object_tree_root and object_phys_tree_root - * are red black trees used to look-up metadata based on a pointer to the + * - kmemleak_lock (raw_spinlock_t): protects the object_list as well as + * del_state modifications and accesses to the object_tree_root (or + * object_phys_tree_root). The object_list is the main list holding the + * metadata (struct kmemleak_object) for the allocated memory blocks. + * The object_tree_root and object_phys_tree_root are red + * black trees used to look-up metadata based on a pointer to the * corresponding memory block. The object_phys_tree_root is for objects * allocated with physical address. 
The kmemleak_object structures are * added to the object_list and object_tree_root (or object_phys_tree_root) @@ -148,6 +149,7 @@ struct kmemleak_object { struct rcu_head rcu; /* object_list lockless traversal */ /* object usage count; object freed when use_count == 0 */ atomic_t use_count; + unsigned int del_state; /* deletion state */ unsigned long pointer; size_t size; /* pass surplus references to this pointer */ @@ -177,6 +179,11 @@ struct kmemleak_object { /* flag set for object allocated with physical address */ #define OBJECT_PHYS (1 << 4) +/* set when __remove_object() called */ +#define DELSTATE_REMOVED (1 << 0) +/* set to temporarily prevent deletion from object_list */ +#define DELSTATE_NO_DELETE (1 << 1) + #define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ #define HEX_ROW_SIZE 16 @@ -571,7 +578,9 @@ static void __remove_object(struct kmemleak_object *object) rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ? &object_phys_tree_root : &object_tree_root); - list_del_rcu(&object->object_list); + if (!(object->del_state & DELSTATE_NO_DELETE)) + list_del_rcu(&object->object_list); + object->del_state |= DELSTATE_REMOVED; } /* @@ -643,6 +652,7 @@ static void __create_object(unsigned long ptr, size_t size, object->count = 0; /* white color initially */ object->jiffies = jiffies; object->checksum = 0; + object->del_state = 0; /* task information */ if (in_hardirq()) { @@ -1479,9 +1489,22 @@ static void kmemleak_cond_resched(struct kmemleak_object *object) if (!get_object(object)) return; /* Try next object */ + raw_spin_lock_irq(&kmemleak_lock); + if (object->del_state & DELSTATE_REMOVED) + goto unlock_put; /* Object removed */ + object->del_state |= DELSTATE_NO_DELETE; + raw_spin_unlock_irq(&kmemleak_lock); + rcu_read_unlock(); cond_resched(); rcu_read_lock(); + + raw_spin_lock_irq(&kmemleak_lock); + if (object->del_state & DELSTATE_REMOVED) + list_del_rcu(&object->object_list); + object->del_state &= ~DELSTATE_NO_DELETE; +unlock_put: + raw_spin_unlock_irq(&kmemleak_lock); put_object(object); } From 6b3f013bb90e737b06c7955571407190b4c760ce Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jan 2023 01:38:29 +0000 Subject: [PATCH 295/505] mm/damon: update comments in damon.h for damon_attrs Patch series "mm/damon: misc fixes". This patchset contains three miscellaneous simple fixes for DAMON online tuning. This patch (of 3): Commit cbeaa77b0449 ("mm/damon/core: use a dedicated struct for monitoring attributes") moved monitoring intervals from damon_ctx to a new struct, damon_attrs, but a comment in the header file has not updated for the change. Update it. Link: https://lkml.kernel.org/r/20230119013831.1911-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230119013831.1911-2-sj@kernel.org Fixes: cbeaa77b0449 ("mm/damon/core: use a dedicated struct for monitoring attributes") Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index dfb245bb3053..d5d4d19928e0 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -354,10 +354,10 @@ struct damon_ctx; * users should register the low level operations for their target address * space and usecase via the &damon_ctx.ops. 
Then, the monitoring thread * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting - * the monitoring, @update after each &damon_ctx.ops_update_interval, and + * the monitoring, @update after each &damon_attrs.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each - * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each - * &damon_ctx.aggr_interval. + * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after + * each &damon_attrs.aggr_interval. * * Each &struct damon_operations instance having valid @id can be registered * via damon_register_ops() and selected by damon_select_ops() later. From 2f5bef5a590be4bf4111ee8f49d97a8613a3e980 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jan 2023 01:38:30 +0000 Subject: [PATCH 296/505] mm/damon/core: update monitoring results for new monitoring attributes region->nr_accesses is the number of sampling intervals in the last aggregation interval that access to the region has found, and region->age is the number of aggregation intervals that its access pattern has maintained. Hence, the real meaning of the two fields' values is depending on current sampling and aggregation intervals. This means the values need to be updated for every sampling and/or aggregation intervals updates. As DAMON core doesn't, it is a duty of in-kernel DAMON framework applications like DAMON sysfs interface, or the userspace users. Handling it in userspace or in-kernel DAMON application is complicated, inefficient, and repetitive compared to doing the update in DAMON core. Do the update in DAMON core. Link: https://lkml.kernel.org/r/20230119013831.1911-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 2db8c53491ca..d9ef62047bf5 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -465,6 +465,76 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +static unsigned int damon_age_for_new_attrs(unsigned int age, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return age * old_attrs->aggr_interval / new_attrs->aggr_interval; +} + +/* convert access ratio in bp (per 10,000) to nr_accesses */ +static unsigned int damon_accesses_bp_to_nr_accesses( + unsigned int accesses_bp, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return accesses_bp * max_nr_accesses / 10000; +} + +/* convert nr_accesses to access ratio in bp (per 10,000) */ +static unsigned int damon_nr_accesses_to_accesses_bp( + unsigned int nr_accesses, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return nr_accesses * 10000 / max_nr_accesses; +} + +static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return damon_accesses_bp_to_nr_accesses( + damon_nr_accesses_to_accesses_bp( + nr_accesses, old_attrs), + new_attrs); +} + +static void damon_update_monitoring_result(struct damon_region *r, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses, + old_attrs, new_attrs); + r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs); +} + +/* + * region->nr_accesses is the 
number of sampling intervals in the last + * aggregation interval that access to the region has found, and region->age is + * the number of aggregation intervals that its access pattern has maintained. + * For the reason, the real meaning of the two fields depend on current + * sampling interval and aggregation interval. This function updates + * ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs. + */ +static void damon_update_monitoring_results(struct damon_ctx *ctx, + struct damon_attrs *new_attrs) +{ + struct damon_attrs *old_attrs = &ctx->attrs; + struct damon_target *t; + struct damon_region *r; + + /* if any interval is zero, simply forgive conversion */ + if (!old_attrs->sample_interval || !old_attrs->aggr_interval || + !new_attrs->sample_interval || + !new_attrs->aggr_interval) + return; + + damon_for_each_target(t, ctx) + damon_for_each_region(r, t) + damon_update_monitoring_result( + r, old_attrs, new_attrs); +} + /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context @@ -482,6 +552,7 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; + damon_update_monitoring_results(ctx, attrs); ctx->attrs = *attrs; return 0; } From f4c978b6594b7452ef22a2fcff376debafcf25eb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Jan 2023 01:38:31 +0000 Subject: [PATCH 297/505] mm/damon/core-test: add a test for damon_update_monitoring_results() Add a simple unit test for damon_update_monitoring_results() function. Link: https://lkml.kernel.org/r/20230119013831.1911-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 3db9b7368756..fae64d32b925 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -289,6 +289,35 @@ static void damon_test_set_regions(struct kunit *test) damon_destroy_target(t); } +static void damon_test_update_monitoring_result(struct kunit *test) +{ + struct damon_attrs old_attrs = { + .sample_interval = 10, .aggr_interval = 1000,}; + struct damon_attrs new_attrs; + struct damon_region *r = damon_new_region(3, 7); + + r->nr_accesses = 15; + r->age = 20; + + new_attrs = (struct damon_attrs){ + .sample_interval = 100, .aggr_interval = 10000,}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 15); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 1000}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 100}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 20); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -299,6 +328,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_split_regions_of), KUNIT_CASE(damon_test_ops_registration), KUNIT_CASE(damon_test_set_regions), + KUNIT_CASE(damon_test_update_monitoring_result), {}, }; From b2db9ef2c0926ba86898136704cdc757c7a5a60a Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Thu, 19 Jan 2023 09:22:24 +0800 Subject: [PATCH 
298/505] mm: move KMEMLEAK's Kconfig items from lib to mm Have the kmemleak's source code and Kconfig items be in the same directory. Link: https://lkml.kernel.org/r/1674091345-14799-1-git-send-email-zhaoyang.huang@unisoc.com Signed-off-by: Zhaoyang Huang Acked-by: Mike Rapoport (IBM) Acked-by: Vlastimil Babka Cc: ke.wang Cc: Mirsad Goran Todorovac Cc: Nathan Chancellor Cc: Peter Zijlstra (Intel) Cc: Catalin Marinas Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 71 ---------------------------------------------- mm/Kconfig.debug | 72 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 71 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 139758854ce6..958087475edb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -743,77 +743,6 @@ config SHRINKER_DEBUG visibility into the kernel memory shrinkers subsystem. Disable it to avoid an extra memory footprint. -config HAVE_DEBUG_KMEMLEAK - bool - -config DEBUG_KMEMLEAK - bool "Kernel memory leak detector" - depends on DEBUG_KERNEL && HAVE_DEBUG_KMEMLEAK - select DEBUG_FS - select STACKTRACE if STACKTRACE_SUPPORT - select KALLSYMS - select CRC32 - select STACKDEPOT - select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF - help - Say Y here if you want to enable the memory leak - detector. The memory allocation/freeing is traced in a way - similar to the Boehm's conservative garbage collector, the - difference being that the orphan objects are not freed but - only shown in /sys/kernel/debug/kmemleak. Enabling this - feature will introduce an overhead to memory - allocations. See Documentation/dev-tools/kmemleak.rst for more - details. - - Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances - of finding leaks due to the slab objects poisoning. - - In order to access the kmemleak file, debugfs needs to be - mounted (usually at /sys/kernel/debug). - -config DEBUG_KMEMLEAK_MEM_POOL_SIZE - int "Kmemleak memory pool size" - depends on DEBUG_KMEMLEAK - range 200 1000000 - default 16000 - help - Kmemleak must track all the memory allocations to avoid - reporting false positives. Since memory may be allocated or - freed before kmemleak is fully initialised, use a static pool - of metadata objects to track such callbacks. After kmemleak is - fully initialised, this memory pool acts as an emergency one - if slab allocations fail. - -config DEBUG_KMEMLEAK_TEST - tristate "Simple test for the kernel memory leak detector" - depends on DEBUG_KMEMLEAK && m - help - This option enables a module that explicitly leaks memory. - - If unsure, say N. - -config DEBUG_KMEMLEAK_DEFAULT_OFF - bool "Default kmemleak to off" - depends on DEBUG_KMEMLEAK - help - Say Y here to disable kmemleak by default. It can then be enabled - on the command line via kmemleak=on. - -config DEBUG_KMEMLEAK_AUTO_SCAN - bool "Enable kmemleak auto scan thread on boot up" - default y - depends on DEBUG_KMEMLEAK - help - Depending on the cpu, kmemleak scan may be cpu intensive and can - stall user tasks at times. This option enables/disables automatic - kmemleak scan at boot up. - - Say N here to disable kmemleak auto scan thread to stop automatic - scanning. Disabling this option disables automatic reporting of - memory leaks. - - If unsure, say Y. 
- config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL && !IA64 diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index d62f48131952..c3547a373c9c 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -207,3 +207,75 @@ config PTDUMP_DEBUGFS kernel. If in doubt, say N. + +config HAVE_DEBUG_KMEMLEAK + bool + +config DEBUG_KMEMLEAK + bool "Kernel memory leak detector" + depends on DEBUG_KERNEL && HAVE_DEBUG_KMEMLEAK + select DEBUG_FS + select STACKTRACE if STACKTRACE_SUPPORT + select KALLSYMS + select CRC32 + select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF + help + Say Y here if you want to enable the memory leak + detector. The memory allocation/freeing is traced in a way + similar to the Boehm's conservative garbage collector, the + difference being that the orphan objects are not freed but + only shown in /sys/kernel/debug/kmemleak. Enabling this + feature will introduce an overhead to memory + allocations. See Documentation/dev-tools/kmemleak.rst for more + details. + + Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances + of finding leaks due to the slab objects poisoning. + + In order to access the kmemleak file, debugfs needs to be + mounted (usually at /sys/kernel/debug). + +config DEBUG_KMEMLEAK_MEM_POOL_SIZE + int "Kmemleak memory pool size" + depends on DEBUG_KMEMLEAK + range 200 1000000 + default 16000 + help + Kmemleak must track all the memory allocations to avoid + reporting false positives. Since memory may be allocated or + freed before kmemleak is fully initialised, use a static pool + of metadata objects to track such callbacks. After kmemleak is + fully initialised, this memory pool acts as an emergency one + if slab allocations fail. + +config DEBUG_KMEMLEAK_TEST + tristate "Simple test for the kernel memory leak detector" + depends on DEBUG_KMEMLEAK && m + help + This option enables a module that explicitly leaks memory. + + If unsure, say N. + +config DEBUG_KMEMLEAK_DEFAULT_OFF + bool "Default kmemleak to off" + depends on DEBUG_KMEMLEAK + help + Say Y here to disable kmemleak by default. It can then be enabled + on the command line via kmemleak=on. + +config DEBUG_KMEMLEAK_AUTO_SCAN + bool "Enable kmemleak auto scan thread on boot up" + default y + depends on DEBUG_KMEMLEAK + help + Depending on the cpu, kmemleak scan may be cpu intensive and can + stall user tasks at times. This option enables/disables automatic + kmemleak scan at boot up. + + Say N here to disable kmemleak auto scan thread to stop automatic + scanning. Disabling this option disables automatic reporting of + memory leaks. + + If unsure, say Y. + From 7b8144e63d84716f16a1b929e0c7e03ae5c4d5c1 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:21 +0000 Subject: [PATCH 299/505] mm: multi-gen LRU: section for working set protection Patch series "mm: multi-gen LRU: improve". This patch series improves a few MGLRU functions, collects related functions, and adds additional documentation. This patch (of 7): Add a section for working set protection in the code and the design doc. The admin doc already contains its usage. Link: https://lkml.kernel.org/r/20230118001827.1040870-1-talumbau@google.com Link: https://lkml.kernel.org/r/20230118001827.1040870-2-talumbau@google.com Signed-off-by: T.J. 
Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 15 +++++++++++++++ mm/vmscan.c | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index d8f721f98868..6e1483e70fdc 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -141,6 +141,21 @@ loop has detected outlying refaults from the tier this page is in. To this end, the feedback loop uses the first tier as the baseline, for the reason stated earlier. +Working set protection +---------------------- +Each generation is timestamped at birth. If ``lru_gen_min_ttl`` is +set, an ``lruvec`` is protected from the eviction when its oldest +generation was born within ``lru_gen_min_ttl`` milliseconds. In other +words, it prevents the working set of ``lru_gen_min_ttl`` milliseconds +from getting evicted. The OOM killer is triggered if this working set +cannot be kept in memory. + +This time-based approach has the following advantages: + +1. It is easier to configure because it is agnostic to applications + and memory sizes. +2. It is more reliable because it is directly wired to the OOM killer. + Summary ------- The multi-gen LRU can be disassembled into the following parts: diff --git a/mm/vmscan.c b/mm/vmscan.c index 394ff4962cbc..a741765896b6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4475,6 +4475,10 @@ done: return true; } +/****************************************************************************** + * working set protection + ******************************************************************************/ + static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; From db19a43d9b3a8876552f00f656008206ef9a5efa Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:22 +0000 Subject: [PATCH 300/505] mm: multi-gen LRU: section for rmap/PT walk feedback Add a section for lru_gen_look_around() in the code and the design doc. Link: https://lkml.kernel.org/r/20230118001827.1040870-3-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 14 ++++++++++++++ mm/vmscan.c | 4 ++++ 2 files changed, 18 insertions(+) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index 6e1483e70fdc..bd988a142bc2 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -156,6 +156,20 @@ This time-based approach has the following advantages: and memory sizes. 2. It is more reliable because it is directly wired to the OOM killer. +Rmap/PT walk feedback +--------------------- +Searching the rmap for PTEs mapping each page on an LRU list (to test +and clear the accessed bit) can be expensive because pages from +different VMAs (PA space) are not cache friendly to the rmap (VA +space). For workloads mostly using mapped pages, searching the rmap +can incur the highest CPU cost in the reclaim path. + +``lru_gen_look_around()`` exploits spatial locality to reduce the +trips into the rmap. It scans the adjacent PTEs of a young PTE and +promotes hot pages. If the scan was done cacheline efficiently, it +adds the PMD entry pointing to the PTE table to the Bloom filter. This +forms a feedback loop between the eviction and the aging. 
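To make the look-around idea above concrete, here is a small self-contained userspace sketch. It is illustrative only and not kernel code; WINDOW, look_around() and the plain arrays are hypothetical stand-ins for MIN_LRU_BATCH, lru_gen_look_around() and the page tables. The sketch scans a fixed window around a "young" hit, promotes the entries found accessed, and, when the window turns out to be dense, flags the containing chunk so a later pass rescans it, which models the feedback loop between the eviction and the aging described above.

#include <stdbool.h>
#include <stdio.h>

#define WINDOW    8        /* entries scanned around a hit, akin to MIN_LRU_BATCH */
#define NENTRIES  64

static bool accessed[NENTRIES];                /* toy "accessed" bits */
static int generation[NENTRIES];               /* toy per-entry generation */
static bool rescan_chunk[NENTRIES / WINDOW];   /* toy stand-in for the Bloom filter hint */

/* Entry 'hit' was just found young; look around it and promote its hot neighbors. */
static void look_around(int hit, int new_gen)
{
	int start = hit - WINDOW / 2;
	int young = 0;

	if (start < 0)
		start = 0;
	if (start + WINDOW > NENTRIES)
		start = NENTRIES - WINDOW;

	for (int i = start; i < start + WINDOW; i++) {
		if (!accessed[i])
			continue;
		accessed[i] = false;        /* "clear the accessed bit" */
		generation[i] = new_gen;    /* "promote" the hot entry */
		young++;
	}

	/* dense window: tell the aging that this chunk is worth a full rescan */
	if (young >= WINDOW / 2)
		rescan_chunk[hit / WINDOW] = true;
}

int main(void)
{
	for (int i = 20; i < 26; i++)
		accessed[i] = true;

	look_around(22, 5);
	printf("entry 22 promoted to generation %d\n", generation[22]);
	printf("chunk %d flagged for rescan: %d\n", 22 / WINDOW, rescan_chunk[22 / WINDOW]);
	return 0;
}

In the kernel the rescan hint is the PMD entry stored in a Bloom filter rather than a boolean array, which keeps the memory overhead bounded; that data structure is documented in the next patch.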
+ Summary ------- The multi-gen LRU can be disassembled into the following parts: diff --git a/mm/vmscan.c b/mm/vmscan.c index a741765896b6..eb9263bf6806 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4569,6 +4569,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } } +/****************************************************************************** + * rmap/PT walk feedback + ******************************************************************************/ + /* * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If From ccbbbb85945d8f0255aa9dbc1b617017e2294f2c Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:23 +0000 Subject: [PATCH 301/505] mm: multi-gen LRU: section for Bloom filters Move Bloom filters code into a dedicated section. Improve the design doc to explain Bloom filter usage and connection between aging and eviction in their use. Link: https://lkml.kernel.org/r/20230118001827.1040870-4-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 16 +++ mm/vmscan.c | 180 +++++++++++++++--------------- 2 files changed, 108 insertions(+), 88 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index bd988a142bc2..770b5d539856 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -170,6 +170,22 @@ promotes hot pages. If the scan was done cacheline efficiently, it adds the PMD entry pointing to the PTE table to the Bloom filter. This forms a feedback loop between the eviction and the aging. +Bloom Filters +------------- +Bloom filters are a space and memory efficient data structure for set +membership test, i.e., test if an element is not in the set or may be +in the set. + +In the eviction path, specifically, in ``lru_gen_look_around()``, if a +PMD has a sufficient number of hot pages, its address is placed in the +filter. In the aging path, set membership means that the PTE range +will be scanned for young pages. + +Note that Bloom filters are probabilistic on set membership. If a test +is false positive, the cost is an additional scan of a range of PTEs, +which may yield hot pages anyway. Parameters of the filter itself can +control the false positive rate in the limit. + Summary ------- The multi-gen LRU can be disassembled into the following parts: diff --git a/mm/vmscan.c b/mm/vmscan.c index eb9263bf6806..1be9120349f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3233,6 +3233,98 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; } +/****************************************************************************** + * Bloom filters + ******************************************************************************/ + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. + * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it produces a new generation. 
For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1. It's not necessary to keep both filters all the time. The spare one can be + * freed after the RCU grace period and reallocated if needed again. + * 2. And when reallocating, it's worth scaling its size according to the number + * of inserted entries in the other filter, to reduce the memory overhead on + * small systems and false positives on large systems. + * 3. Jenkins' hash function is an alternative to Knuth's. + */ +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return true; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} + +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = lruvec->mm_state.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} + /****************************************************************************** * mm_struct list ******************************************************************************/ @@ -3352,94 +3444,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm) } #endif -/* - * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when - * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of - * bits in a bitmap, k is the number of hash functions and n is the number of - * inserted items. - * - * Page table walkers use one of the two filters to reduce their search space. - * To get rid of non-leaf entries that no longer have enough leaf entries, the - * aging uses the double-buffering technique to flip to the other filter each - * time it produces a new generation. For non-leaf entries that have enough - * leaf entries, the aging carries them over to the next generation in - * walk_pmd_range(); the eviction also report them when walking the rmap - * in lru_gen_look_around(). - * - * For future optimizations: - * 1. It's not necessary to keep both filters all the time. The spare one can be - * freed after the RCU grace period and reallocated if needed again. - * 2. 
And when reallocating, it's worth scaling its size according to the number - * of inserted entries in the other filter, to reduce the memory overhead on - * small systems and false positives on large systems. - * 3. Jenkins' hash function is an alternative to Knuth's. - */ -#define BLOOM_FILTER_SHIFT 15 - -static inline int filter_gen_from_seq(unsigned long seq) -{ - return seq % NR_BLOOM_FILTERS; -} - -static void get_item_key(void *item, int *key) -{ - u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); - - BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); - - key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); - key[1] = hash >> BLOOM_FILTER_SHIFT; -} - -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) -{ - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = lruvec->mm_state.filters[gen]; - if (filter) { - bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); - return; - } - - filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - WRITE_ONCE(lruvec->mm_state.filters[gen], filter); -} - -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ - int key[2]; - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = READ_ONCE(lruvec->mm_state.filters[gen]); - if (!filter) - return; - - get_item_key(item, key); - - if (!test_bit(key[0], filter)) - set_bit(key[0], filter); - if (!test_bit(key[1], filter)) - set_bit(key[1], filter); -} - -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ - int key[2]; - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = READ_ONCE(lruvec->mm_state.filters[gen]); - if (!filter) - return true; - - get_item_key(item, key); - - return test_bit(key[0], filter) && test_bit(key[1], filter); -} - static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) { int i; From 36c7b4db7c942ae9e1b111f0c6b468c8b2e33842 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:24 +0000 Subject: [PATCH 302/505] mm: multi-gen LRU: section for memcg LRU Move memcg LRU code into a dedicated section. Improve the design doc to outline its architecture. Link: https://lkml.kernel.org/r/20230118001827.1040870-5-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/multigen_lru.rst | 33 +++- include/linux/mm_inline.h | 17 -- include/linux/mmzone.h | 13 +- mm/memcontrol.c | 8 +- mm/vmscan.c | 250 +++++++++++++++++------------- 5 files changed, 178 insertions(+), 143 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index 770b5d539856..5f1f6ecbb79b 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -186,9 +186,40 @@ is false positive, the cost is an additional scan of a range of PTEs, which may yield hot pages anyway. Parameters of the filter itself can control the false positive rate in the limit. +Memcg LRU +--------- +An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, +since each node and memcg combination has an LRU of folios (see +``mem_cgroup_lruvec()``). Its goal is to improve the scalability of +global reclaim, which is critical to system-wide memory overcommit in +data centers. Note that memcg LRU only applies to global reclaim. + +The basic structure of an memcg LRU can be understood by an analogy to +the active/inactive LRU (of folios): + +1. 
It has the young and the old (generations), i.e., the counterparts + to the active and the inactive; +2. The increment of ``max_seq`` triggers promotion, i.e., the + counterpart to activation; +3. Other events trigger similar operations, e.g., offlining an memcg + triggers demotion, i.e., the counterpart to deactivation. + +In terms of global reclaim, it has two distinct features: + +1. Sharding, which allows each thread to start at a random memcg (in + the old generation) and improves parallelism; +2. Eventual fairness, which allows direct reclaim to bail out at will + and reduces latency without affecting fairness over some time. + +In terms of traversing memcgs during global reclaim, it improves the +best-case complexity from O(n) to O(1) and does not affect the +worst-case complexity O(n). Therefore, on average, it has a sublinear +complexity. + Summary ------- -The multi-gen LRU can be disassembled into the following parts: +The multi-gen LRU (of folios) can be disassembled into the following +parts: * Generations * Rmap walks diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 26dcbda07e92..de1e622dd366 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -122,18 +122,6 @@ static inline bool lru_gen_in_fault(void) return current->in_lru_fault; } -#ifdef CONFIG_MEMCG -static inline int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return READ_ONCE(lruvec->lrugen.seg); -} -#else -static inline int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return 0; -} -#endif - static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; @@ -309,11 +297,6 @@ static inline bool lru_gen_in_fault(void) return false; } -static inline int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return 0; -} - static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 815c7c2edf45..977be526c939 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -368,15 +368,6 @@ struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -/* see the comment on MEMCG_NR_GENS */ -enum { - MEMCG_LRU_NOP, - MEMCG_LRU_HEAD, - MEMCG_LRU_TAIL, - MEMCG_LRU_OLD, - MEMCG_LRU_YOUNG, -}; - #ifdef CONFIG_LRU_GEN enum { @@ -557,7 +548,7 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); -void lru_gen_rotate_memcg(struct lruvec *lruvec, int op); +void lru_gen_soft_reclaim(struct lruvec *lruvec); #else /* !CONFIG_MEMCG */ @@ -608,7 +599,7 @@ static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } -static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 893427aded01..17335459d8dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -476,12 +476,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) struct mem_cgroup_tree_per_node *mctz; if (lru_gen_enabled()) { - struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; - - /* see the comment on MEMCG_NR_GENS */ - if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) - lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); - + if (soft_limit_excess(memcg)) + 
lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); return; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 1be9120349f8..796d4ca65e97 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4705,6 +4705,148 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) mem_cgroup_unlock_pages(); } +/****************************************************************************** + * memcg LRU + ******************************************************************************/ + +/* see the comment on MEMCG_NR_GENS */ +enum { + MEMCG_LRU_NOP, + MEMCG_LRU_HEAD, + MEMCG_LRU_TAIL, + MEMCG_LRU_OLD, + MEMCG_LRU_YOUNG, +}; + +#ifdef CONFIG_MEMCG + +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return READ_ONCE(lruvec->lrugen.seg); +} + +static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ + int seg; + int old, new; + int bin = get_random_u32_below(MEMCG_NR_BINS); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + seg = 0; + new = old = lruvec->lrugen.gen; + + /* see the comment on MEMCG_NR_GENS */ + if (op == MEMCG_LRU_HEAD) + seg = MEMCG_LRU_HEAD; + else if (op == MEMCG_LRU_TAIL) + seg = MEMCG_LRU_TAIL; + else if (op == MEMCG_LRU_OLD) + new = get_memcg_gen(pgdat->memcg_lru.seq); + else if (op == MEMCG_LRU_YOUNG) + new = get_memcg_gen(pgdat->memcg_lru.seq + 1); + else + VM_WARN_ON_ONCE(true); + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + else + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + + pgdat->memcg_lru.nr_memcgs[old]--; + pgdat->memcg_lru.nr_memcgs[new]++; + + lruvec->lrugen.gen = new; + WRITE_ONCE(lruvec->lrugen.seg, seg); + + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); +} + +void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + int bin = get_random_u32_below(MEMCG_NR_BINS); + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = get_memcg_gen(pgdat->memcg_lru.seq); + + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); + pgdat->memcg_lru.nr_memcgs[gen]++; + + lruvec->lrugen.gen = gen; + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); + } +} + +void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = lruvec->lrugen.gen; + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + pgdat->memcg_lru.nr_memcgs[gen]--; + + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +void lru_gen_soft_reclaim(struct lruvec *lruvec) +{ + /* see the comment on 
MEMCG_NR_GENS */ + if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); +} + +#else /* !CONFIG_MEMCG */ + +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; +} + +#endif + /****************************************************************************** * the eviction ******************************************************************************/ @@ -5397,53 +5539,6 @@ done: pgdat->kswapd_failures = 0; } -#ifdef CONFIG_MEMCG -void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) -{ - int seg; - int old, new; - int bin = get_random_u32_below(MEMCG_NR_BINS); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - - spin_lock(&pgdat->memcg_lru.lock); - - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); - - seg = 0; - new = old = lruvec->lrugen.gen; - - /* see the comment on MEMCG_NR_GENS */ - if (op == MEMCG_LRU_HEAD) - seg = MEMCG_LRU_HEAD; - else if (op == MEMCG_LRU_TAIL) - seg = MEMCG_LRU_TAIL; - else if (op == MEMCG_LRU_OLD) - new = get_memcg_gen(pgdat->memcg_lru.seq); - else if (op == MEMCG_LRU_YOUNG) - new = get_memcg_gen(pgdat->memcg_lru.seq + 1); - else - VM_WARN_ON_ONCE(true); - - hlist_nulls_del_rcu(&lruvec->lrugen.list); - - if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) - hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); - else - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); - - pgdat->memcg_lru.nr_memcgs[old]--; - pgdat->memcg_lru.nr_memcgs[new]++; - - lruvec->lrugen.gen = new; - WRITE_ONCE(lruvec->lrugen.seg, seg); - - if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) - WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - - spin_unlock(&pgdat->memcg_lru.lock); -} -#endif - /****************************************************************************** * state change ******************************************************************************/ @@ -6086,67 +6181,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) } } -void lru_gen_online_memcg(struct mem_cgroup *memcg) -{ - int gen; - int nid; - int bin = get_random_u32_below(MEMCG_NR_BINS); - - for_each_node(nid) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct lruvec *lruvec = get_lruvec(memcg, nid); - - spin_lock(&pgdat->memcg_lru.lock); - - VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); - - gen = get_memcg_gen(pgdat->memcg_lru.seq); - - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); - pgdat->memcg_lru.nr_memcgs[gen]++; - - lruvec->lrugen.gen = gen; - - spin_unlock(&pgdat->memcg_lru.lock); - } -} - -void lru_gen_offline_memcg(struct mem_cgroup *memcg) -{ - int nid; - - for_each_node(nid) { - struct lruvec *lruvec = get_lruvec(memcg, nid); - - lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); - } -} - -void lru_gen_release_memcg(struct mem_cgroup *memcg) -{ - int gen; - int nid; - - for_each_node(nid) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct lruvec *lruvec = get_lruvec(memcg, nid); - - spin_lock(&pgdat->memcg_lru.lock); - - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); - - gen = lruvec->lrugen.gen; - - hlist_nulls_del_rcu(&lruvec->lrugen.list); - pgdat->memcg_lru.nr_memcgs[gen]--; - - if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) - WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - - spin_unlock(&pgdat->memcg_lru.lock); - } -} - #endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) From 
37cc99979d04cca677c0ad5c0acd1149ec165d1b Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:25 +0000 Subject: [PATCH 303/505] mm: multi-gen LRU: improve lru_gen_exit_memcg() Add warnings and poison ->next. Link: https://lkml.kernel.org/r/20230118001827.1040870-6-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 796d4ca65e97..c2e6ad53447b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6168,12 +6168,17 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) int i; int nid; + VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); + for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); + lruvec->lrugen.list.next = LIST_POISON1; + for (i = 0; i < NR_BLOOM_FILTERS; i++) { bitmap_free(lruvec->mm_state.filters[i]); lruvec->mm_state.filters[i] = NULL; From b5ff4133617d0eced35b685da0bd0929dd9fabb7 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:26 +0000 Subject: [PATCH 304/505] mm: multi-gen LRU: improve walk_pmd_range() Improve readability of walk_pmd_range() and walk_pmd_range_locked(). Link: https://lkml.kernel.org/r/20230118001827.1040870-7-talumbau@google.com Signed-off-by: T.J. Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c2e6ad53447b..ff3b4aa3c31f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3999,8 +3999,8 @@ restart: } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, - struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; pmd_t *pmd; @@ -4013,18 +4013,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area VM_WARN_ON_ONCE(pud_leaf(*pud)); /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ - if (*start == -1) { - *start = next; + if (*first == -1) { + *first = addr; + bitmap_zero(bitmap, MIN_LRU_BATCH); return; } - i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); + i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); if (i && i <= MIN_LRU_BATCH) { __set_bit(i - 1, bitmap); return; } - pmd = pmd_offset(pud, *start); + pmd = pmd_offset(pud, *first); ptl = pmd_lockptr(args->mm, pmd); if (!spin_trylock(ptl)) @@ -4035,15 +4036,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area do { unsigned long pfn; struct folio *folio; - unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; + + /* don't round down the first address */ + addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first; pfn = get_pmd_pfn(pmd[i], vma, addr); if (pfn == -1) goto next; if (!pmd_trans_huge(pmd[i])) { - if (arch_has_hw_nonleaf_pmd_young() && - get_cap(LRU_GEN_NONLEAF_YOUNG)) + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } @@ -4072,12 +4074,11 @@ next: arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: - *start = -1; - bitmap_zero(bitmap, MIN_LRU_BATCH); + *first = -1; } #else -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, - struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { } #endif @@ -4090,9 +4091,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, unsigned long next; unsigned long addr; struct vm_area_struct *vma; - unsigned long pos = -1; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; + unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -4131,18 +4132,17 @@ restart: if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) continue; - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } #endif walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (arch_has_hw_nonleaf_pmd_young() && - get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { if (!pmd_young(val)) continue; - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) @@ -4159,7 +4159,7 @@ restart: update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); } - walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) goto restart; From abf086721a2f1e6897c57796f7268df1b194c750 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 18 Jan 2023 00:18:27 +0000 Subject: [PATCH 305/505] mm: multi-gen LRU: simplify lru_gen_look_around() Update the folio generation in place with or without current->reclaim_state->mm_walk. The LRU lock is held for longer, if mm_walk is NULL and the number of folios to update is more than PAGEVEC_SIZE. This causes a measurable regression from the LRU lock contention during a microbencmark. But a tiny regression is not worth the complexity. Link: https://lkml.kernel.org/r/20230118001827.1040870-8-talumbau@google.com Signed-off-by: T.J. 
Alumbaugh Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 73 +++++++++++++++++------------------------------------ 1 file changed, 23 insertions(+), 50 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index ff3b4aa3c31f..ac51150d2d36 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4587,13 +4587,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; - pte_t *pte; unsigned long start; unsigned long end; - unsigned long addr; struct lru_gen_mm_walk *walk; int young = 0; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + pte_t *pte = pvmw->pte; + unsigned long addr = pvmw->address; struct folio *folio = pfn_folio(pvmw->pfn); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); @@ -4610,25 +4609,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; - start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); - end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + start = max(addr & PMD_MASK, pvmw->vma->vm_start); + end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { - if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; - else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) start = end - MIN_LRU_BATCH * PAGE_SIZE; else { - start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; - end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; } } - pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; - rcu_read_lock(); arch_enter_lazy_mmu_mode(); + pte -= (addr - start) / PAGE_SIZE; + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; @@ -4653,56 +4655,27 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) !folio_test_swapcache(folio))) folio_mark_dirty(folio); + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + + continue; + } + old_gen = folio_lru_gen(folio); if (old_gen < 0) folio_set_referenced(folio); else if (old_gen != new_gen) - __set_bit(i, bitmap); + folio_activate(folio); } arch_leave_lazy_mmu_mode(); - rcu_read_unlock(); + mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ if (suitable_to_scan(i, young)) update_bloom_filter(lruvec, max_seq, pvmw->pmd); - - if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); - folio_activate(folio); - } - return; - } - - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) - return; - - if (!walk) { - spin_lock_irq(&lruvec->lru_lock); - new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); - } - - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); - if (folio_memcg_rcu(folio) != memcg) - continue; - - old_gen = folio_update_gen(folio, new_gen); - if (old_gen < 0 || old_gen == new_gen) - 
continue; - - if (walk) - update_batch_size(walk, folio, old_gen, new_gen); - else - lru_gen_update_size(lruvec, folio, old_gen, new_gen); - } - - if (!walk) - spin_unlock_irq(&lruvec->lru_lock); - - mem_cgroup_unlock_pages(); } /****************************************************************************** From 44b8f8bf2438bfee3aceae4d647a7460213ff340 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Fri, 20 Jan 2023 03:46:20 +0000 Subject: [PATCH 306/505] mm: memory-failure: add memory failure stats to sysfs Patch series "Introduce per NUMA node memory error statistics", v2. Background ========== In the RFC for Kernel Support of Memory Error Detection [1], one advantage of software-based scanning over hardware patrol scrubber is the ability to make statistics visible to system administrators. The statistics include 2 categories: * Memory error statistics, for example, how many memory error are encountered, how many of them are recovered by the kernel. Note these memory errors are non-fatal to kernel: during the machine check exception (MCE) handling kernel already classified MCE's severity to be unnecessary to panic (but either action required or optional). * Scanner statistics, for example how many times the scanner have fully scanned a NUMA node, how many errors are first detected by the scanner. The memory error statistics are useful to userspace and actually not specific to scanner detected memory errors, and are the focus of this patchset. Motivation ========== Memory error stats are important to userspace but insufficient in kernel today. Datacenter administrators can better monitor a machine's memory health with the visible stats. For example, while memory errors are inevitable on servers with 10+ TB memory, starting server maintenance when there are only 1~2 recovered memory errors could be overreacting; in cloud production environment maintenance usually means live migrate all the workload running on the server and this usually causes nontrivial disruption to the customer. Providing insight into the scope of memory errors on a system helps to determine the appropriate follow-up action. In addition, the kernel's existing memory error stats need to be standardized so that userspace can reliably count on their usefulness. Today kernel provides following memory error info to userspace, but they are not sufficient or have disadvantages: * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total, not per NUMA node stats though * ras:memory_failure_event: only available after explicitly enabled * /dev/mcelog provides many useful info about the MCEs, but doesn't capture how memory_failure recovered memory MCEs * kernel logs: userspace needs to process log text Exposing memory error stats is also a good start for the in-kernel memory error detector. Today the data source of memory error stats are either direct memory error consumption, or hardware patrol scrubber detection (either signaled as UCNA or SRAO). Once in-kernel memory scanner is implemented, it will be the main source as it is usually configured to scan memory DIMMs constantly and faster than hardware patrol scrubber. How Implemented =============== As Naoya pointed out [2], exposing memory error statistics to userspace is useful independent of software or hardware scanner. Therefore we implement the memory error statistics independent of the in-kernel memory error detector. 
It exposes the following per NUMA node memory error counters: /sys/devices/system/node/node${X}/memory_failure/total /sys/devices/system/node/node${X}/memory_failure/recovered /sys/devices/system/node/node${X}/memory_failure/ignored /sys/devices/system/node/node${X}/memory_failure/failed /sys/devices/system/node/node${X}/memory_failure/delayed These counters describe how many raw pages are poisoned and after the attempted recoveries by the kernel, their resolutions: how many are recovered, ignored, failed, or delayed respectively. This approach can be easier to extend for future use cases than /proc/meminfo, trace event, and log. The following math holds for the statistics: * total = recovered + ignored + failed + delayed These memory error stats are reset during machine boot. The 1st commit introduces these sysfs entries. The 2nd commit populates memory error stats every time memory_failure attempts memory error recovery. The 3rd commit adds documentations for introduced stats. [1] https://lore.kernel.org/linux-mm/7E670362-C29E-4626-B546-26530D54F937@gmail.com/T/#mc22959244f5388891c523882e61163c6e4d703af [2] https://lore.kernel.org/linux-mm/7E670362-C29E-4626-B546-26530D54F937@gmail.com/T/#m52d8d7a333d8536bd7ce74253298858b1c0c0ac6 This patch (of 3): Today kernel provides following memory error info to userspace, but each has its own disadvantage * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total, not per NUMA node stats though * ras:memory_failure_event: only available after explicitly enabled * /dev/mcelog provides many useful info about the MCEs, but doesn't capture how memory_failure recovered memory MCEs * kernel logs: userspace needs to process log text Exposes per NUMA node memory error stats as sysfs entries: /sys/devices/system/node/node${X}/memory_failure/total /sys/devices/system/node/node${X}/memory_failure/recovered /sys/devices/system/node/node${X}/memory_failure/ignored /sys/devices/system/node/node${X}/memory_failure/failed /sys/devices/system/node/node${X}/memory_failure/delayed These counters describe how many raw pages are poisoned and after the attempted recoveries by the kernel, their resolutions: how many are recovered, ignored, failed, or delayed respectively. The following math holds for the statistics: * total = recovered + ignored + failed + delayed Link: https://lkml.kernel.org/r/20230120034622.2698268-1-jiaqiyan@google.com Link: https://lkml.kernel.org/r/20230120034622.2698268-2-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: David Rientjes Acked-by: Naoya Horiguchi Cc: Kefeng Wang Cc: Tony Luck Cc: Yang Shi Signed-off-by: Andrew Morton --- drivers/base/node.c | 3 +++ include/linux/mm.h | 5 +++++ include/linux/mmzone.h | 28 ++++++++++++++++++++++++++++ mm/memory-failure.c | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index faf3597a96da..b46db17124f3 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = { &node_dev_group, #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP &arch_node_dev_group, +#endif +#ifdef CONFIG_MEMORY_FAILURE + &memory_failure_attr_group, #endif NULL }; diff --git a/include/linux/mm.h b/include/linux/mm.h index 836b96e08a14..c9db257f09b3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3455,6 +3455,11 @@ enum mf_action_page_type { MF_MSG_UNKNOWN, }; +/* + * Sysfs entries for memory failure handling statistics. 
+ */ +extern const struct attribute_group memory_failure_attr_group; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr_hint, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 977be526c939..9fb1b03b83b2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1212,6 +1212,31 @@ struct deferred_split { }; #endif +#ifdef CONFIG_MEMORY_FAILURE +/* + * Per NUMA node memory failure handling statistics. + */ +struct memory_failure_stats { + /* + * Number of raw pages poisoned. + * Cases not accounted: memory outside kernel control, offline page, + * arch-specific memory_failure (SGX), hwpoison_filter() filtered + * error events, and unpoison actions from hwpoison_unpoison. + */ + unsigned long total; + /* + * Recovery results of poisoned raw pages handled by memory_failure, + * in sync with mf_result. + * total = ignored + failed + delayed + recovered. + * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. + */ + unsigned long ignored; + unsigned long failed; + unsigned long delayed; + unsigned long recovered; +}; +#endif + /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which @@ -1357,6 +1382,9 @@ typedef struct pglist_data { #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif +#ifdef CONFIG_MEMORY_FAILURE + struct memory_failure_stats mf_stats; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0a382191737f..44eec2e93a0b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i) memblk_nr_poison_sub(pfn, i); } +/** + * MF_ATTR_RO - Create sysfs entry for each memory failure statistics. + * @_name: name of the file in the per NUMA sysfs directory. + */ +#define MF_ATTR_RO(_name) \ +static ssize_t _name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct memory_failure_stats *mf_stats = \ + &NODE_DATA(dev->id)->mf_stats; \ + return sprintf(buf, "%lu\n", mf_stats->_name); \ +} \ +static DEVICE_ATTR_RO(_name) + +MF_ATTR_RO(total); +MF_ATTR_RO(ignored); +MF_ATTR_RO(failed); +MF_ATTR_RO(delayed); +MF_ATTR_RO(recovered); + +static struct attribute *memory_failure_attr[] = { + &dev_attr_total.attr, + &dev_attr_ignored.attr, + &dev_attr_failed.attr, + &dev_attr_delayed.attr, + &dev_attr_recovered.attr, + NULL, +}; + +const struct attribute_group memory_failure_attr_group = { + .name = "memory_failure", + .attrs = memory_failure_attr, +}; + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, From 18f41fa616ee4d66c67033eb46b951bf6e1b4a12 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Fri, 20 Jan 2023 03:46:21 +0000 Subject: [PATCH 307/505] mm: memory-failure: bump memory failure stats to pglist_data Right before memory_failure finishes its handling, accumulate poisoned page's resolution counters to pglist_data's memory_failure_stats, so as to update the corresponding sysfs entries. 
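As context for the test steps that follow, the counters added by this series can be consumed with plain sysfs reads. The sketch below is a hypothetical userspace consumer and not part of the series; only the directory layout and file names come from the commit messages above, everything else is made up for illustration. On kernels built without CONFIG_MEMORY_FAILURE the memory_failure directory is simply absent and the reads are skipped.

#include <stdio.h>

/* files exposed under /sys/devices/system/node/nodeX/memory_failure/ */
static const char * const mf_files[] = {
	"total", "recovered", "ignored", "failed", "delayed",
};

int main(void)
{
	/* node 0 only, for brevity; a real tool would iterate over all online nodes */
	for (size_t i = 0; i < sizeof(mf_files) / sizeof(mf_files[0]); i++) {
		char path[128];
		unsigned long val;
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/node/node0/memory_failure/%s",
			 mf_files[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* older kernel or CONFIG_MEMORY_FAILURE=n */
		if (fscanf(f, "%lu", &val) == 1)
			printf("%-10s %lu\n", mf_files[i], val);
		fclose(f);
	}
	return 0;
}

Per the math stated earlier, the value read from total should equal the sum of the other four counters, and all of them are reset to zero at boot.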
Tested: 1) Start an application to allocate memory buffer chunks 2) Convert random memory buffer addresses to physical addresses 3) Inject memory errors using EINJ at chosen physical addresses 4) Access poisoned memory buffer and recover from SIGBUS 5) Check counter values under /sys/devices/system/node/node*/memory_failure/* Link: https://lkml.kernel.org/r/20230120034622.2698268-3-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: David Rientjes Acked-by: Naoya Horiguchi Cc: Kefeng Wang Cc: Tony Luck Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44eec2e93a0b..b4b30d9b0782 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1227,6 +1227,39 @@ static struct page_state error_states[] = { #undef slab #undef reserved +static void update_per_node_mf_stats(unsigned long pfn, + enum mf_result result) +{ + int nid = MAX_NUMNODES; + struct memory_failure_stats *mf_stats = NULL; + + nid = pfn_to_nid(pfn); + if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) { + WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid); + return; + } + + mf_stats = &NODE_DATA(nid)->mf_stats; + switch (result) { + case MF_IGNORED: + ++mf_stats->ignored; + break; + case MF_FAILED: + ++mf_stats->failed; + break; + case MF_DELAYED: + ++mf_stats->delayed; + break; + case MF_RECOVERED: + ++mf_stats->recovered; + break; + default: + WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result); + break; + } + ++mf_stats->total; +} + /* * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). @@ -1237,6 +1270,9 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, trace_memory_failure_event(pfn, type, result); num_poisoned_pages_inc(pfn); + + update_per_node_mf_stats(pfn, result); + pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); From 4180887f0625af739896aaafc44ee98103ab8f71 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Fri, 20 Jan 2023 03:46:22 +0000 Subject: [PATCH 308/505] mm: memory-failure: document memory failure stats Add documentation for memory_failure's per NUMA node sysfs entries Link: https://lkml.kernel.org/r/20230120034622.2698268-4-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: Naoya Horiguchi Cc: David Rientjes Cc: Kefeng Wang Cc: Tony Luck Cc: Yang Shi Signed-off-by: Andrew Morton --- Documentation/ABI/stable/sysfs-devices-node | 39 +++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index 8db67aa472f1..402af4b2b905 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -182,3 +182,42 @@ Date: November 2021 Contact: Jarkko Sakkinen Description: The total amount of SGX physical memory in bytes. + +What: /sys/devices/system/node/nodeX/memory_failure/total +Date: January 2023 +Contact: Jiaqi Yan +Description: + The total number of raw poisoned pages (pages containing + corrupted data due to memory errors) on a NUMA node. 
+ +What: /sys/devices/system/node/nodeX/memory_failure/ignored +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + ignored by memory error recovery attempt, usually because + support for this type of pages is unavailable, and kernel + gives up the recovery. + +What: /sys/devices/system/node/nodeX/memory_failure/failed +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + failed by memory error recovery attempt. This usually means + a key recovery operation failed. + +What: /sys/devices/system/node/nodeX/memory_failure/delayed +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + delayed by memory error recovery attempt. Delayed poisoned + pages usually will be retried by kernel. + +What: /sys/devices/system/node/nodeX/memory_failure/recovered +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + recovered by memory error recovery attempt. From 05a421995503b746095d8ac93fa0ddadfc3c81bc Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Sun, 22 Jan 2023 01:50:54 +0900 Subject: [PATCH 309/505] mm/page_owner: record single timestamp value for high order allocations When allocating a high-order page, separate allocation timestamp is recorded for each sub-page resulting in different timestamp values between them. This behavior is not consistent with the behavior when recording free timestamp and caused confusion when analyzing memory dumps. Record single timestamp for the entire allocation, aligning with the behavior for free timestamps. Link: https://lkml.kernel.org/r/20230121165054.520507-1-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: Joonsoo Kim Cc: Michal Hocko Cc: Mike Rapoport Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_owner.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index f0553bedb39d..80dc8f4050fa 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -163,6 +163,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, { struct page_owner *page_owner; int i; + u64 ts_nsec = local_clock(); for (i = 0; i < (1 << order); i++) { page_owner = get_page_owner(page_ext); @@ -172,7 +173,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; page_owner->tgid = current->tgid; - page_owner->ts_nsec = local_clock(); + page_owner->ts_nsec = ts_nsec; strscpy(page_owner->comm, current->comm, sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); From 2e126aa29007353f069c3bab5c2cf52d231cd3ae Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sat, 21 Jan 2023 12:11:51 +0200 Subject: [PATCH 310/505] mm/sparse: fix "unused function 'pgdat_to_phys'" warning W=1 build with clangs complains: mm/sparse.c:347:27: warning: unused function 'pgdat_to_phys' [-Wunused-function] static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) ^ 1 warning generated. pgdat_to_phys() is only used by functions defined when CONFIG_MEMORY_HOTREMOVE=y. Move pgdat_to_phys() under #ifdef CONFIG_MEMORY_HOTREMOVE to make clang happy. 
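The fix above is an instance of a general pattern: a static helper that is only called from code compiled under a given CONFIG option should be defined under the same #ifdef, otherwise W=1 builds warn about an unused function in configurations that build the helper but none of its callers. Below is a minimal standalone illustration; CONFIG_FOO and the functions in it are made up for the example and are not kernel code.

#include <stdio.h>

/* Build with and without -DCONFIG_FOO, plus -Wall -Wunused-function. */

#ifdef CONFIG_FOO
/*
 * The helper lives inside the same #ifdef as its only caller, so the
 * CONFIG_FOO=n build never sees an unused static function.
 */
static int foo_helper(int x)
{
	return x * 2;
}

static void foo_feature(void)
{
	printf("feature value: %d\n", foo_helper(21));
}
#endif

int main(void)
{
#ifdef CONFIG_FOO
	foo_feature();
#else
	printf("feature disabled\n");
#endif
	return 0;
}

Hoisting foo_helper() above the #ifdef would reintroduce the warning in the CONFIG_FOO=n build, which mirrors why pgdat_to_phys() moved under CONFIG_MEMORY_HOTREMOVE.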
Link: https://lkml.kernel.org/r/20230121101151.1703292-1-rppt@kernel.org Signed-off-by: Mike Rapoport Reported-by: kernel test robot Link: https://lore.kernel.org/all/202301210155.1E5zABb5-lkp@intel.com Cc: Miles Chen Signed-off-by: Andrew Morton --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 2779b419ef2a..fb7aeb1899a4 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -318,6 +318,7 @@ size_t mem_section_usage_size(void) return sizeof(struct mem_section_usage) + usemap_size(); } +#ifdef CONFIG_MEMORY_HOTREMOVE static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) { #ifndef CONFIG_NUMA @@ -328,7 +329,6 @@ static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) #endif } -#ifdef CONFIG_MEMORY_HOTREMOVE static struct mem_section_usage * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) From 420ef683b5217338bc679c33fd9361b52f53a526 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Jan 2023 21:35:26 +0100 Subject: [PATCH 311/505] kasan: reset page tags properly with sampling The implementation of page_alloc poisoning sampling assumed that tag_clear_highpage resets page tags for __GFP_ZEROTAGS allocations. However, this is no longer the case since commit 70c248aca9e7 ("mm: kasan: Skip unpoisoning of user pages"). This leads to kernel crashes when MTE-enabled userspace mappings are used with Hardware Tag-Based KASAN enabled. Reset page tags for __GFP_ZEROTAGS allocations in post_alloc_hook(). Also clarify and fix related comments. [andreyknvl@google.com: update comment] Link: https://lkml.kernel.org/r/5dbd866714b4839069e2d8469ac45b60953db290.1674592780.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/24ea20c1b19c2b4b56cf9f5b354915f8dbccfc77.1674592496.git.andreyknvl@google.com Fixes: 44383cef54c0 ("kasan: allow sampling page_alloc allocations for HW_TAGS") Signed-off-by: Andrey Konovalov Reported-by: Peter Collingbourne Tested-by: Peter Collingbourne Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/page_alloc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 717f12e83b85..5ebce58026f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2476,7 +2476,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); - bool reset_tags = !zero_tags; + bool reset_tags = true; int i; set_page_private(page, 0); @@ -2503,7 +2503,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * (which happens only when memory should be initialized as well). */ if (zero_tags) { - /* Initialize both memory and tags. */ + /* Initialize both memory and memory tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); @@ -2521,14 +2521,15 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } else { /* * KASAN decided to exclude this allocation from being - * poisoned due to sampling. Skip poisoning as well. + * (un)poisoned due to sampling. Make KASAN skip + * poisoning when the allocation is freed. */ SetPageSkipKASanPoison(page); } } /* - * If memory tags have not been set, reset the page tags to ensure - * page_address() dereferencing does not fault. 
+ * If memory tags have not been set by KASAN, reset the page tags to + * ensure page_address() dereferencing does not fault. */ if (reset_tags) { for (i = 0; i != 1 << order; ++i) From c5acf1f6f0a11b8e521ad43f59b1bed27dcf34f6 Mon Sep 17 00:00:00 2001 From: Jongwoo Han Date: Thu, 26 Jan 2023 03:08:47 +0900 Subject: [PATCH 312/505] mm/gup.c: fix typo in comments Link: https://lkml.kernel.org/r/20230125180847.4542-1-jongwooo.han@gmail.com Signed-off-by: Jongwoo Han Signed-off-by: Andrew Morton --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 38ba1697dd61..c4b793385ed2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1055,7 +1055,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page + * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because From 48731c8436c68ce5597dfe72f3836bd6808bedde Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Jan 2023 13:44:31 +0000 Subject: [PATCH 313/505] mm, compaction: rename compact_control->rescan to finish_pageblock Patch series "Fix excessive CPU usage during compaction". Commit 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") fixed a problem where pageblocks found by fast_find_migrateblock() were ignored. Unfortunately there were numerous bug reports complaining about high CPU usage and massive stalls once 6.1 was released. Due to the severity, the patch was reverted by Vlastimil as a short-term fix[1] to -stable. The underlying problem for each of the bugs is suspected to be the repeated scanning of the same pageblocks. This series should guarantee forward progress even with commit 7efc3b726103. More information is in the changelog for patch 4. [1] http://lore.kernel.org/r/20230113173345.9692-1-vbabka@suse.cz This patch (of 4): The rescan field was not well named albeit accurate at the time. Rename the field to finish_pageblock to indicate that the remainder of the pageblock should be scanned regardless of COMPACT_CLUSTER_MAX. The intent is that pageblocks with transient failures get marked for skipping to avoid revisiting the same pageblock. Link: https://lkml.kernel.org/r/20230125134434.18017-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 24 ++++++++++++------------ mm/internal.h | 6 +++++- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b758b00a4885..28a9596609fe 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1101,12 +1101,12 @@ isolate_success_no_list: /* * Avoid isolating too much unless this block is being - * rescanned (e.g. dirty/writeback pages, parallel allocation) + * fully scanned (e.g. dirty/writeback pages, parallel allocation) * or a lock is contended. For contention, isolate quickly to * potentially remove one source of contention. 
*/ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && - !cc->rescan && !cc->contended) { + !cc->finish_pageblock && !cc->contended) { ++low_pfn; break; } @@ -1171,14 +1171,14 @@ isolate_abort: } /* - * Updated the cached scanner pfn once the pageblock has been scanned + * Update the cached scanner pfn once the pageblock has been scanned. * Pages will either be migrated in which case there is no point * scanning in the near future or migration failed in which case the * failure reason may persist. The block is marked for skipping if * there were no pages isolated in the block or if the block is * rescanned twice in a row. */ - if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { + if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { if (valid_page && !skip_updated) set_pageblock_skip(valid_page); update_cached_migrate(cc, low_pfn); @@ -2372,17 +2372,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long iteration_start_pfn = cc->migrate_pfn; /* - * Avoid multiple rescans which can happen if a page cannot be - * isolated (dirty/writeback in async mode) or if the migrated - * pages are being allocated before the pageblock is cleared. - * The first rescan will capture the entire pageblock for - * migration. If it fails, it'll be marked skip and scanning - * will proceed as normal. + * Avoid multiple rescans of the same pageblock which can + * happen if a page cannot be isolated (dirty/writeback in + * async mode) or if the migrated pages are being allocated + * before the pageblock is cleared. The first rescan will + * capture the entire pageblock for migration. If it fails, + * it'll be marked skip and scanning will proceed as normal. */ - cc->rescan = false; + cc->finish_pageblock = false; if (pageblock_start_pfn(last_migrated_pfn) == pageblock_start_pfn(iteration_start_pfn)) { - cc->rescan = true; + cc->finish_pageblock = true; } switch (isolate_migratepages(cc)) { diff --git a/mm/internal.h b/mm/internal.h index ce462bf145b4..2d1b9fa8083e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -448,7 +448,11 @@ struct compact_control { bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ - bool rescan; /* Rescanning the same pageblock */ + bool finish_pageblock; /* Scan the remainder of a pageblock. Used + * when there are potentially transient + * isolation or migration failures to + * ensure forward progress. + */ bool alloc_contig; /* alloc_contig_range allocation */ }; From 16b3be4034316bf56a171478cf1dccdf94dede43 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Jan 2023 13:44:32 +0000 Subject: [PATCH 314/505] mm, compaction: check if a page has been captured before draining PCP pages If a page has been captured then draining is unnecssary so check first for a captured page. 
Link: https://lkml.kernel.org/r/20230125134434.18017-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 28a9596609fe..a86559910fd9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2439,6 +2439,12 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) } } + /* Stop if a page has been captured */ + if (capc && capc->page) { + ret = COMPACT_SUCCESS; + break; + } + check_drain: /* * Has the migration scanner moved away from the previous @@ -2457,12 +2463,6 @@ check_drain: last_migrated_pfn = 0; } } - - /* Stop if a page has been captured */ - if (capc && capc->page) { - ret = COMPACT_SUCCESS; - break; - } } out: From f9d7fc1ae3349759f25903cd867ab72e6ba4a63e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Jan 2023 13:44:33 +0000 Subject: [PATCH 315/505] mm, compaction: finish scanning the current pageblock if requested cc->finish_pageblock is set when the current pageblock should be rescanned but fast_find_migrateblock can select an alternative block. Disable fast_find_migrateblock when the current pageblock scan should be completed. Link: https://lkml.kernel.org/r/20230125134434.18017-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index a86559910fd9..91acde906ae3 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1761,6 +1761,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) if (cc->ignore_skip_hint) return pfn; + /* + * If the pageblock should be finished then do not select a different + * pageblock. + */ + if (cc->finish_pageblock) + return pfn; + /* * If the migrate_pfn is not at the start of a zone or the start * of a pageblock then assume this is a continuation of a previous From cfccd2e63e7e0c84c514676cffa755dd71a3b2bc Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Jan 2023 13:44:34 +0000 Subject: [PATCH 316/505] mm, compaction: finish pageblocks on complete migration failure Commit 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") address an issue where a pageblock selected by fast_find_migrateblock() was ignored. Unfortunately, the same fix resulted in numerous reports of khugepaged or kcompactd stalling for long periods of time or consuming 100% of CPU. Tracing showed that there was a lot of rescanning between a small subset of pageblocks because the conditions for marking the block skip are not met. The scan is not reaching the end of the pageblock because enough pages were isolated but none were migrated successfully. Eventually it circles back to the same block. Pageblock skip tracking tries to minimise both latency and excessive scanning but tracking exactly when a block is fully scanned requires an excessive amount of state. This patch forcibly rescans a pageblock when all isolated pages fail to migrate even though it could be for transient reasons such as page writeback or page dirty. This will sometimes migrate too many pages but pageblocks will be marked skip and forward progress will be made. 
"Usemen" from the mmtests configuration workload-usemem-stress-numa-compact was used to stress compaction. The compaction trace events were recorded using a 6.2-rc5 kernel that includes commit 7efc3b726103 and count of unique ranges were measured. The top 5 ranges were 3076 range=(0x10ca00-0x10cc00) 3076 range=(0x110a00-0x110c00) 3098 range=(0x13b600-0x13b800) 3104 range=(0x141c00-0x141e00) 11424 range=(0x11b600-0x11b800) While this workload is very different than what the bugs reported, the pattern of the same subset of blocks being repeatedly scanned is observed. At one point, *only* the range range=(0x11b600 ~ 0x11b800) was scanned for 2 seconds. 14 seconds passed between the first migration-related event and the last. With the series applied including this patch, the top 5 ranges were 1 range=(0x11607e-0x116200) 1 range=(0x116200-0x116278) 1 range=(0x116278-0x116400) 1 range=(0x116400-0x116424) 1 range=(0x116424-0x116600) Only unique ranges were scanned and the time between the first migration-related event was 0.11 milliseconds. Link: https://lkml.kernel.org/r/20230125134434.18017-5-mgorman@techsingularity.net Fixes: 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") Signed-off-by: Mel Gorman Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 91acde906ae3..d73578af44cc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2392,6 +2392,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->finish_pageblock = true; } +rescan: switch (isolate_migratepages(cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; @@ -2434,15 +2435,28 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto out; } /* - * We failed to migrate at least one page in the current - * order-aligned block, so skip the rest of it. + * If an ASYNC or SYNC_LIGHT fails to migrate a page + * within the current order-aligned block, scan the + * remainder of the pageblock. This will mark the + * pageblock "skip" to avoid rescanning in the near + * future. This will isolate more pages than necessary + * for the request but avoid loops due to + * fast_find_migrateblock revisiting blocks that were + * recently partially scanned. */ - if (cc->direct_compaction && - (cc->mode == MIGRATE_ASYNC)) { - cc->migrate_pfn = block_end_pfn( - cc->migrate_pfn - 1, cc->order); - /* Draining pcplists is useless in this case */ - last_migrated_pfn = 0; + if (cc->direct_compaction && !cc->finish_pageblock && + (cc->mode < MIGRATE_SYNC)) { + cc->finish_pageblock = true; + + /* + * Draining pcplists does not help THP if + * any page failed to migrate. Even after + * drain, the pageblock will not be free. + */ + if (cc->order == COMPACTION_HPAGE_ORDER) + last_migrated_pfn = 0; + + goto rescan; } } From 37f3605e5e7af7de12aeb670c5b94e5a3c8dbf74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:42 +0100 Subject: [PATCH 317/505] mm: reject vmap with VM_FLUSH_RESET_PERMS Patch series "cleanup vfree and vunmap". This little series untangles the vfree and vunmap code path a bit. This patch (of 10): VM_FLUSH_RESET_PERMS is just for use with vmalloc as it is tied to freeing the underlying pages. 
Link: https://lkml.kernel.org/r/20230121071051.1143058-1-hch@lst.de Link: https://lkml.kernel.org/r/20230121071051.1143058-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 428e0bee5c9c..3f3cb4875966 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2868,6 +2868,9 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); + if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) + return NULL; + /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. From f41f036b804d0d920f9b6fd3fca9489dd7afd358 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:43 +0100 Subject: [PATCH 318/505] mm: remove __vfree __vfree is a subset of vfree that just skips a few checks, and which is only used by vfree and an error cleanup path. Fold __vfree into vfree and switch the only other caller to call vfree() instead. Link: https://lkml.kernel.org/r/20230121071051.1143058-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3f3cb4875966..67fc9d7e4024 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2786,14 +2786,6 @@ void vfree_atomic(const void *addr) __vfree_deferred(addr); } -static void __vfree(const void *addr) -{ - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); -} - /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address @@ -2821,8 +2813,10 @@ void vfree(const void *addr) if (!addr) return; - - __vfree(addr); + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else + __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -3089,7 +3083,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* * If not enough pages were obtained to accomplish an - * allocation request, free them via __vfree() if any. + * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { warn_alloc(gfp_mask, NULL, @@ -3129,7 +3123,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - __vfree(area->addr); + vfree(area->addr); return NULL; } From 01e2e8394a527644de5192f92f64e1c883a3e493 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:44 +0100 Subject: [PATCH 319/505] mm: remove __vfree_deferred Fold __vfree_deferred into vfree_atomic, and call vfree_atomic early on from vfree if called from interrupt context so that the extra low-level helper can be avoided. 
Link: https://lkml.kernel.org/r/20230121071051.1143058-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 47 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 67fc9d7e4024..cfd796570e61 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2754,20 +2754,6 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); } -static inline void __vfree_deferred(const void *addr) -{ - /* - * Use raw_cpu_ptr() because this can be called from preemptible - * context. Preemption is absolutely fine here, because the llist_add() - * implementation is lockless, so it works even if we are adding to - * another cpu's list. schedule_work() should be fine with this too. - */ - struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); - - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); -} - /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address @@ -2777,13 +2763,19 @@ static inline void __vfree_deferred(const void *addr) */ void vfree_atomic(const void *addr) { - BUG_ON(in_nmi()); + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + BUG_ON(in_nmi()); kmemleak_free(addr); - if (!addr) - return; - __vfree_deferred(addr); + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * another cpu's list. schedule_work() should be fine with this too. + */ + if (addr && llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); } /** @@ -2805,17 +2797,16 @@ void vfree_atomic(const void *addr) */ void vfree(const void *addr) { - BUG_ON(in_nmi()); - - kmemleak_free(addr); - - might_sleep_if(!in_interrupt()); - - if (!addr) + if (unlikely(in_interrupt())) { + vfree_atomic(addr); return; - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else + } + + BUG_ON(in_nmi()); + kmemleak_free(addr); + might_sleep(); + + if (addr) __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); From 208162f42f958b37147d3c1c5f947c7c1a8b9c41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:45 +0100 Subject: [PATCH 320/505] mm: move vmalloc_init and free_work down in vmalloc.c Move these two functions around a bit to avoid forward declarations. 
Link: https://lkml.kernel.org/r/20230121071051.1143058-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 104 +++++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 53 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cfd796570e61..333228c652d1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -89,17 +89,6 @@ struct vfree_deferred { }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); -static void __vunmap(const void *, int); - -static void free_work(struct work_struct *w) -{ - struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *t, *llnode; - - llist_for_each_safe(llnode, t, llist_del_all(&p->list)) - __vunmap((void *)llnode, 1); -} - /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, @@ -2434,48 +2423,6 @@ static void vmap_init_free_space(void) } } -void __init vmalloc_init(void) -{ - struct vmap_area *va; - struct vm_struct *tmp; - int i; - - /* - * Create the cache for vmap_area objects. - */ - vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); - - for_each_possible_cpu(i) { - struct vmap_block_queue *vbq; - struct vfree_deferred *p; - - vbq = &per_cpu(vmap_block_queue, i); - spin_lock_init(&vbq->lock); - INIT_LIST_HEAD(&vbq->free); - p = &per_cpu(vfree_deferred, i); - init_llist_head(&p->list); - INIT_WORK(&p->wq, free_work); - } - - /* Import existing vmlist entries. */ - for (tmp = vmlist; tmp; tmp = tmp->next) { - va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (WARN_ON_ONCE(!va)) - continue; - - va->va_start = (unsigned long)tmp->addr; - va->va_end = va->va_start + tmp->size; - va->vm = tmp; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); - } - - /* - * Now we can initialize a free vmap space. - */ - vmap_init_free_space(); - vmap_initialized = true; -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2754,6 +2701,15 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); } +static void delayed_vfree_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + __vunmap((void *)llnode, 1); +} + /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address @@ -4209,3 +4165,45 @@ static int __init proc_vmalloc_init(void) module_init(proc_vmalloc_init); #endif + +void __init vmalloc_init(void) +{ + struct vmap_area *va; + struct vm_struct *tmp; + int i; + + /* + * Create the cache for vmap_area objects. + */ + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + struct vfree_deferred *p; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, delayed_vfree_work); + } + + /* Import existing vmlist entries. 
*/ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + continue; + + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + va->vm = tmp; + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + } + + /* + * Now we can initialize a free vmap space. + */ + vmap_init_free_space(); + vmap_initialized = true; +} From 5d3d31d6fb17a8eb83af50ea8a0616a3cfde3e58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:46 +0100 Subject: [PATCH 321/505] mm: call vfree instead of __vunmap from delayed_vfree_work This adds an extra, never taken, in_interrupt() branch, but will allow to cut down the maze of vfree helpers. Link: https://lkml.kernel.org/r/20230121071051.1143058-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 333228c652d1..0c0267b34afa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2707,7 +2707,7 @@ static void delayed_vfree_work(struct work_struct *w) struct llist_node *t, *llnode; llist_for_each_safe(llnode, t, llist_del_all(&p->list)) - __vunmap((void *)llnode, 1); + vfree(llnode); } /** From 39e65b7f63392d70f2f6aff5f4c5c3262f49637e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:47 +0100 Subject: [PATCH 322/505] mm: move __remove_vm_area out of va_remove_mappings __remove_vm_area is the only part of va_remove_mappings that requires a vmap_area. Move the call out to the caller and only pass the vm_struct to va_remove_mappings. Link: https://lkml.kernel.org/r/20230121071051.1143058-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0c0267b34afa..003e49d0e628 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2599,18 +2599,15 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the VA's vm_struct. */ -static void va_remove_mappings(struct vmap_area *va, int deallocate_pages) +/* Handle removing and resetting vm mappings related to the vm_struct. */ +static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { - struct vm_struct *area = va->vm; unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - __remove_vm_area(va); - /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. 
*/ if (!flush_reset) return; @@ -2676,7 +2673,8 @@ static void __vunmap(const void *addr, int deallocate_pages) kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - va_remove_mappings(va, deallocate_pages); + __remove_vm_area(va); + vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { int i; From 75c59ce74e47d3e11aa7666f1877aa64495f7b03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:48 +0100 Subject: [PATCH 323/505] mm: use remove_vm_area in __vunmap Use the common helper to find and remove a vmap_area instead of open coding it. Link: https://lkml.kernel.org/r/20230121071051.1143058-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 003e49d0e628..bc6791a0adbe 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2556,20 +2556,6 @@ struct vm_struct *find_vm_area(const void *addr) return va->vm; } -static struct vm_struct *__remove_vm_area(struct vmap_area *va) -{ - struct vm_struct *vm; - - if (!va || !va->vm) - return NULL; - - vm = va->vm; - kasan_free_module_shadow(vm); - free_unmap_vmap_area(va); - - return vm; -} - /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address @@ -2582,10 +2568,18 @@ static struct vm_struct *__remove_vm_area(struct vmap_area *va) */ struct vm_struct *remove_vm_area(const void *addr) { + struct vmap_area *va; + struct vm_struct *vm; + might_sleep(); - return __remove_vm_area( - find_unlink_vmap_area((unsigned long) addr)); + va = find_unlink_vmap_area((unsigned long)addr); + if (!va || !va->vm) + return NULL; + vm = va->vm; + kasan_free_module_shadow(vm); + free_unmap_vmap_area(va); + return vm; } static inline void set_area_direct_map(const struct vm_struct *area, @@ -2651,7 +2645,6 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; - struct vmap_area *va; if (!addr) return; @@ -2660,20 +2653,18 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - va = find_unlink_vmap_area((unsigned long)addr); - if (unlikely(!va)) { + area = remove_vm_area(addr); + if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } - area = va->vm; debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - __remove_vm_area(va); vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { From 17d3ef432dcbe80c134f1f79e2ed1ebd1076eab1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:49 +0100 Subject: [PATCH 324/505] mm: move debug checks from __vunmap to remove_vm_area All these checks apply to the free_vm_area interface as well, so move them to the common routine. 
Link: https://lkml.kernel.org/r/20230121071051.1143058-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bc6791a0adbe..cb8c8cd161c8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2573,11 +2573,20 @@ struct vm_struct *remove_vm_area(const void *addr) might_sleep(); + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)) + return NULL; + va = find_unlink_vmap_area((unsigned long)addr); if (!va || !va->vm) return NULL; vm = va->vm; + + debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); + debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); kasan_free_module_shadow(vm); + kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); + free_unmap_vmap_area(va); return vm; } @@ -2649,10 +2658,6 @@ static void __vunmap(const void *addr, int deallocate_pages) if (!addr) return; - if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", - addr)) - return; - area = remove_vm_area(addr); if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", @@ -2660,11 +2665,6 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); - debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - - kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { From 79311c1fe0175941298fb362ba072514e2fe5c54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:50 +0100 Subject: [PATCH 325/505] mm: split __vunmap vunmap only needs to find and free the vmap_area and vm_strut, so open code that there and merge the rest of the code into vfree. 
Link: https://lkml.kernel.org/r/20230121071051.1143058-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 84 +++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cb8c8cd161c8..11144a29665a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2651,45 +2651,6 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) set_area_direct_map(area, set_direct_map_default_noflush); } -static void __vunmap(const void *addr, int deallocate_pages) -{ - struct vm_struct *area; - - if (!addr) - return; - - area = remove_vm_area(addr); - if (unlikely(!area)) { - WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", - addr); - return; - } - - vm_remove_mappings(area, deallocate_pages); - - if (deallocate_pages) { - int i; - - for (i = 0; i < area->nr_pages; i++) { - struct page *page = area->pages[i]; - - BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - __free_pages(page, 0); - cond_resched(); - } - atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); - - kvfree(area->pages); - } - - kfree(area); -} - static void delayed_vfree_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); @@ -2742,6 +2703,9 @@ void vfree_atomic(const void *addr) */ void vfree(const void *addr) { + struct vm_struct *vm; + int i; + if (unlikely(in_interrupt())) { vfree_atomic(addr); return; @@ -2751,8 +2715,32 @@ void vfree(const void *addr) kmemleak_free(addr); might_sleep(); - if (addr) - __vunmap(addr, 1); + if (!addr) + return; + + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + addr); + return; + } + + vm_remove_mappings(vm, true); + for (i = 0; i < vm->nr_pages; i++) { + struct page *page = vm->pages[i]; + + BUG_ON(!page); + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + /* + * High-order allocs for huge vmallocs are split, so + * can be freed as an array of order-0 allocations + */ + __free_pages(page, 0); + cond_resched(); + } + atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); + kvfree(vm->pages); + kfree(vm); } EXPORT_SYMBOL(vfree); @@ -2767,10 +2755,20 @@ EXPORT_SYMBOL(vfree); */ void vunmap(const void *addr) { + struct vm_struct *vm; + BUG_ON(in_interrupt()); might_sleep(); - if (addr) - __vunmap(addr, 0); + + if (!addr) + return; + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", + addr); + return; + } + kfree(vm); } EXPORT_SYMBOL(vunmap); From 9e5fa0ae52fc67dea86f95ea4e3909b3e10a160f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:51 +0100 Subject: [PATCH 326/505] mm: refactor va_remove_mappings Move the VM_FLUSH_RESET_PERMS to the caller and rename the function to better describe what it is doing. 
Link: https://lkml.kernel.org/r/20230121071051.1143058-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 11144a29665a..9b71ec3213cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2602,35 +2602,23 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the vm_struct. */ -static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +/* + * Flush the vm mapping and reset the direct map. + */ +static void vm_reset_perms(struct vm_struct *area) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); - int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ - if (!flush_reset) - return; - /* - * If not deallocating pages, just do the flush of the VM area and - * return. - */ - if (!deallocate_pages) { - vm_unmap_aliases(); - return; - } - - /* - * If execution gets here, flush the vm mapping and reset the direct - * map. Find the start and end range of the direct mappings to make sure + * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); + if (addr) { unsigned long page_size; @@ -2725,7 +2713,8 @@ void vfree(const void *addr) return; } - vm_remove_mappings(vm, true); + if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) + vm_reset_perms(vm); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; From 7d28631786b2333c5d48ad25172eb159aaa2945f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:30 +0100 Subject: [PATCH 327/505] mpage: stop using bdev_{read,write}_page Patch series "remove ->rw_page". This series removes the ->rw_page block_device_operation, which is an old and clumsy attempt at a simple read/write fast path for the block layer. It isn't actually used by the fastest block layer operations that we support (polled I/O through io_uring), but only used by the mpage buffered I/O helpers which are some of the slowest I/O we have and do not make any difference there at all, and zram which is a block device abused to duplicate the zram functionality. Given that zram is heavily used we need to make sure there is a good replacement for synchronous I/O, so this series adds a new flag for drivers that complete I/O synchronously and uses that flag to use on-stack bios and synchronous submission for them in the swap code. This patch (of 7): These are micro-optimizations for synchronous I/O, which do not matter compared to all the other inefficiencies in the legacy buffer_head based mpage code. 
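For reference, the synchronous replacement that the later patches in this series switch the swap code to is plain bio submission with an on-stack, single-segment bio; roughly as below (a sketch only, error handling omitted, with 'bdev', 'sector' and 'page' assumed to be supplied by the caller):

    #include <linux/bio.h>

    static void read_one_page_sync(struct block_device *bdev, sector_t sector,
                                   struct page *page)
    {
            struct bio_vec bv;
            struct bio bio;

            bio_init(&bio, bdev, &bv, 1, REQ_OP_READ);
            bio.bi_iter.bi_sector = sector;
            bio_add_page(&bio, page, PAGE_SIZE, 0);
            /* Waits for completion, so no end_io callback is needed. */
            submit_bio_wait(&bio);
    }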
Link: https://lkml.kernel.org/r/20230125133436.447864-1-hch@lst.de Link: https://lkml.kernel.org/r/20230125133436.447864-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Cc: Keith Busch Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- fs/mpage.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index b8e7975159bc..55988ea994ee 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -269,11 +269,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) alloc_new: if (args->bio == NULL) { - if (first_hole == blocks_per_page) { - if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), - &folio->page)) - goto out; - } args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, gfp); if (args->bio == NULL) @@ -585,11 +580,6 @@ page_is_mapped: alloc_new: if (bio == NULL) { - if (first_unmapped == blocks_per_page) { - if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), - page, wbc)) - goto out; - } bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS); From a8c1408f870ef5308088b02c76082136b2c514ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:31 +0100 Subject: [PATCH 328/505] mm: remove the swap_readpage return value swap_readpage always returns 0, and no caller checks the return value. [akpm@linux-foundation.org: fix void-returning swap_readpage() stub, per Keith] Link: https://lkml.kernel.org/r/20230125133436.447864-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 16 +++++----------- mm/swap.h | 8 +++----- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 905d9fcc0c96..84b348fe4c7c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -444,11 +444,9 @@ static void swap_readpage_fs(struct page *page, *plug = sio; } -int swap_readpage(struct page *page, bool synchronous, - struct swap_iocb **plug) +void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) { struct bio *bio; - int ret = 0; struct swap_info_struct *sis = page_swap_info(page); bool workingset = PageWorkingset(page); unsigned long pflags; @@ -480,15 +478,12 @@ int swap_readpage(struct page *page, bool synchronous, goto out; } - if (sis->flags & SWP_SYNCHRONOUS_IO) { - ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); - if (!ret) { - count_vm_event(PSWPIN); - goto out; - } + if ((sis->flags & SWP_SYNCHRONOUS_IO) && + !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { + count_vm_event(PSWPIN); + goto out; } - ret = 0; bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; @@ -520,7 +515,6 @@ out: psi_memstall_leave(&pflags); } delayacct_swapin_end(); - return ret; } void __swap_read_unplug(struct swap_iocb *sio) diff --git a/mm/swap.h b/mm/swap.h index f78065c8ef52..c8fdda601751 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -8,8 +8,7 @@ /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; -int swap_readpage(struct page *page, bool do_poll, - struct swap_iocb **plug); +void swap_readpage(struct page *page, bool do_poll, struct swap_iocb **plug); void __swap_read_unplug(struct swap_iocb *plug); static inline void swap_read_unplug(struct swap_iocb *plug) { @@ 
-64,10 +63,9 @@ static inline unsigned int folio_swap_flags(struct folio *folio) } #else /* CONFIG_SWAP */ struct swap_iocb; -static inline int swap_readpage(struct page *page, bool do_poll, - struct swap_iocb **plug) +static inline void swap_readpage(struct page *page, bool do_poll, + struct swap_iocb **plug) { - return 0; } static inline void swap_write_unplug(struct swap_iocb *sio) { From 14bd75f57400dba0e75eaee4dcb44ac52a46253f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:32 +0100 Subject: [PATCH 329/505] mm: factor out a swap_readpage_bdev helper Split the block device case from swap_readpage into a separate helper, following the abstraction for file based swap and frontswap. Link: https://lkml.kernel.org/r/20230125133436.447864-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 68 +++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 84b348fe4c7c..872a226d15dc 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -444,44 +444,15 @@ static void swap_readpage_fs(struct page *page, *plug = sio; } -void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) +static void swap_readpage_bdev(struct page *page, bool synchronous, + struct swap_info_struct *sis) { struct bio *bio; - struct swap_info_struct *sis = page_swap_info(page); - bool workingset = PageWorkingset(page); - unsigned long pflags; - bool in_thrashing; - - VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageUptodate(page), page); - - /* - * Count submission time as memory stall and delay. When the device - * is congested, or the submitting cgroup IO-throttled, submission - * can be a significant part of overall IO time. - */ - if (workingset) { - delayacct_thrashing_start(&in_thrashing); - psi_memstall_enter(&pflags); - } - delayacct_swapin_start(); - - if (frontswap_load(page) == 0) { - SetPageUptodate(page); - unlock_page(page); - goto out; - } - - if (data_race(sis->flags & SWP_FS_OPS)) { - swap_readpage_fs(page, plug); - goto out; - } if ((sis->flags & SWP_SYNCHRONOUS_IO) && !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { count_vm_event(PSWPIN); - goto out; + return; } bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); @@ -508,8 +479,39 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) } __set_current_state(TASK_RUNNING); bio_put(bio); +} + +void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) +{ + struct swap_info_struct *sis = page_swap_info(page); + bool workingset = PageWorkingset(page); + unsigned long pflags; + bool in_thrashing; + + VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageUptodate(page), page); + + /* + * Count submission time as memory stall and delay. When the device + * is congested, or the submitting cgroup IO-throttled, submission + * can be a significant part of overall IO time. 
+ */ + if (workingset) { + delayacct_thrashing_start(&in_thrashing); + psi_memstall_enter(&pflags); + } + delayacct_swapin_start(); + + if (frontswap_load(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + } else if (data_race(sis->flags & SWP_FS_OPS)) { + swap_readpage_fs(page, plug); + } else { + swap_readpage_bdev(page, synchronous, sis); + } -out: if (workingset) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); From 9b4e30bd7309222f74a5198f44bd45feea024b00 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:33 +0100 Subject: [PATCH 330/505] mm: use an on-stack bio for synchronous swapin Optimize the synchronous swap in case by using an on-stack bio instead of allocating one using bio_alloc. Link: https://lkml.kernel.org/r/20230125133436.447864-5-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 69 +++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 872a226d15dc..d47def70e81f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -51,10 +51,9 @@ static void end_swap_bio_write(struct bio *bio) bio_put(bio); } -static void end_swap_bio_read(struct bio *bio) +static void __end_swap_bio_read(struct bio *bio) { struct page *page = bio_first_page_all(bio); - struct task_struct *waiter = bio->bi_private; if (bio->bi_status) { SetPageError(page); @@ -62,18 +61,16 @@ static void end_swap_bio_read(struct bio *bio) pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); - goto out; + } else { + SetPageUptodate(page); } - - SetPageUptodate(page); -out: unlock_page(page); - WRITE_ONCE(bio->bi_private, NULL); +} + +static void end_swap_bio_read(struct bio *bio) +{ + __end_swap_bio_read(bio); bio_put(bio); - if (waiter) { - blk_wake_io_task(waiter); - put_task_struct(waiter); - } } int generic_swapfile_activate(struct swap_info_struct *sis, @@ -444,7 +441,33 @@ static void swap_readpage_fs(struct page *page, *plug = sio; } -static void swap_readpage_bdev(struct page *page, bool synchronous, +static void swap_readpage_bdev_sync(struct page *page, + struct swap_info_struct *sis) +{ + struct bio_vec bv; + struct bio bio; + + if ((sis->flags & SWP_SYNCHRONOUS_IO) && + !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { + count_vm_event(PSWPIN); + return; + } + + bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = swap_page_sector(page); + bio_add_page(&bio, page, thp_size(page), 0); + /* + * Keep this task valid during swap readpage because the oom killer may + * attempt to access it in the page fault retry time check. + */ + get_task_struct(current); + count_vm_event(PSWPIN); + submit_bio_wait(&bio); + __end_swap_bio_read(&bio); + put_task_struct(current); +} + +static void swap_readpage_bdev_async(struct page *page, struct swap_info_struct *sis) { struct bio *bio; @@ -459,26 +482,8 @@ static void swap_readpage_bdev(struct page *page, bool synchronous, bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; bio_add_page(bio, page, thp_size(page), 0); - /* - * Keep this task valid during swap readpage because the oom killer may - * attempt to access it in the page fault retry time check. 
- */ - if (synchronous) { - get_task_struct(current); - bio->bi_private = current; - } count_vm_event(PSWPIN); - bio_get(bio); submit_bio(bio); - while (synchronous) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio->bi_private)) - break; - - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); - bio_put(bio); } void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) @@ -508,8 +513,10 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) unlock_page(page); } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_readpage_fs(page, plug); + } else if (synchronous) { + swap_readpage_bdev_sync(page, sis); } else { - swap_readpage_bdev(page, synchronous, sis); + swap_readpage_bdev_async(page, sis); } if (workingset) { From e3e2762bd3c5e02780618fc42f5b0049a3bedb30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:34 +0100 Subject: [PATCH 331/505] mm: remove the __swap_writepage return value __swap_writepage always returns 0. Link: https://lkml.kernel.org/r/20230125133436.447864-6-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 23 +++++++++-------------- mm/swap.h | 2 +- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index d47def70e81f..3ba5a6e99030 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -177,11 +177,11 @@ bad_bmap: int swap_writepage(struct page *page, struct writeback_control *wbc) { struct folio *folio = page_folio(page); - int ret = 0; + int ret; if (folio_free_swap(folio)) { folio_unlock(folio); - goto out; + return 0; } /* * Arch code may have to preserve more data than just the page @@ -191,17 +191,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) if (ret) { folio_mark_dirty(folio); folio_unlock(folio); - goto out; + return ret; } if (frontswap_store(&folio->page) == 0) { folio_start_writeback(folio); folio_unlock(folio); folio_end_writeback(folio); - goto out; + return 0; } - ret = __swap_writepage(&folio->page, wbc); -out: - return ret; + __swap_writepage(&folio->page, wbc); + return 0; } static inline void count_swpout_vm_event(struct page *page) @@ -288,7 +287,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) +static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) { struct swap_iocb *sio = NULL; struct swap_info_struct *sis = page_swap_info(page); @@ -325,11 +324,9 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) } if (wbc->swap_plug) *wbc->swap_plug = sio; - - return 0; } -int __swap_writepage(struct page *page, struct writeback_control *wbc) +void __swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret; @@ -347,7 +344,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc) ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); if (!ret) { count_swpout_vm_event(page); - return 0; + return; } bio = bio_alloc(sis->bdev, 1, @@ -362,8 +359,6 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc) set_page_writeback(page); unlock_page(page); submit_bio(bio); - - return 0; } void swap_write_unplug(struct swap_iocb *sio) diff --git a/mm/swap.h b/mm/swap.h index 
c8fdda601751..7c033d793f15 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -17,7 +17,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) } void swap_write_unplug(struct swap_iocb *sio); int swap_writepage(struct page *page, struct writeback_control *wbc); -int __swap_writepage(struct page *page, struct writeback_control *wbc); +void __swap_writepage(struct page *page, struct writeback_control *wbc); /* linux/mm/swap_state.c */ /* One swap address space for each 64M swap space */ From 05cda97ecb7046f4192a921741aae33b300dd628 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:35 +0100 Subject: [PATCH 332/505] mm: factor out a swap_writepage_bdev helper Split the block device case from swap_readpage into a separate helper, following the abstraction for file based swap. Link: https://lkml.kernel.org/r/20230125133436.447864-7-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/page_io.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 3ba5a6e99030..0a1a3b831344 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -326,23 +326,12 @@ static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) *wbc->swap_plug = sio; } -void __swap_writepage(struct page *page, struct writeback_control *wbc) +static void swap_writepage_bdev(struct page *page, + struct writeback_control *wbc, struct swap_info_struct *sis) { struct bio *bio; - int ret; - struct swap_info_struct *sis = page_swap_info(page); - VM_BUG_ON_PAGE(!PageSwapCache(page), page); - /* - * ->flags can be updated non-atomicially (scan_swap_map_slots), - * but that will never affect SWP_FS_OPS, so the data_race - * is safe. - */ - if (data_race(sis->flags & SWP_FS_OPS)) - return swap_writepage_fs(page, wbc); - - ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); - if (!ret) { + if (!bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc)) { count_swpout_vm_event(page); return; } @@ -361,6 +350,22 @@ void __swap_writepage(struct page *page, struct writeback_control *wbc) submit_bio(bio); } +void __swap_writepage(struct page *page, struct writeback_control *wbc) +{ + struct swap_info_struct *sis = page_swap_info(page); + + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + /* + * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. + */ + if (data_race(sis->flags & SWP_FS_OPS)) + swap_writepage_fs(page, wbc); + else + swap_writepage_bdev(page, wbc, sis); +} + void swap_write_unplug(struct swap_iocb *sio) { struct iov_iter from; From 3222d8c2a7f888bf38b845b125e9470b12108a4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Jan 2023 14:34:36 +0100 Subject: [PATCH 333/505] block: remove ->rw_page The ->rw_page method is a special purpose bypass of the usual bio handling path that is limited to single-page reads and writes and synchronous which causes a lot of extra code in the drivers, callers and the block layer. The only remaining user is the MM swap code. Switch that swap code to simply submit a single-vec on-stack bio an synchronously wait on it based on a newly added QUEUE_FLAG_SYNCHRONOUS flag set by the drivers that currently implement ->rw_page instead. 
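For a driver, opting in to the new flag is a single line at disk setup time, mirroring the brd hunk further down (hypothetical driver, shown only to illustrate the interface):

    static void mydrv_setup_disk(struct gendisk *disk)      /* hypothetical */
    {
            /* Completes bios synchronously, so swap can use on-stack bios. */
            blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
    }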
While this touches one extra cache line and executes extra code, it simplifies the block layer and drivers and ensures that all feastures are properly supported by all drivers, e.g. right now ->rw_page bypassed cgroup writeback entirely. [akpm@linux-foundation.org: fix comment typo, per Dan] Link: https://lkml.kernel.org/r/20230125133436.447864-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Cc: Dave Jiang Cc: Ira Weiny Cc: Jens Axboe Cc: Keith Busch Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vishal Verma Signed-off-by: Andrew Morton --- block/bdev.c | 78 ----------------------------------- drivers/block/brd.c | 15 +------ drivers/block/zram/zram_drv.c | 61 +-------------------------- drivers/nvdimm/btt.c | 16 +------ drivers/nvdimm/pmem.c | 24 +---------- include/linux/blkdev.h | 12 +++--- mm/page_io.c | 53 ++++++++++++++---------- mm/swapfile.c | 2 +- 8 files changed, 44 insertions(+), 217 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index edc110d90df4..1795c7d4b99e 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -304,84 +304,6 @@ out: } EXPORT_SYMBOL(thaw_bdev); -/** - * bdev_read_page() - Start reading a page from a block device - * @bdev: The device to read the page from - * @sector: The offset on the device to read the page to (need not be aligned) - * @page: The page to read - * - * On entry, the page should be locked. It will be unlocked when the page - * has been read. If the block driver implements rw_page synchronously, - * that will be true on exit from this function, but it need not be. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to read this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. - */ -int bdev_read_page(struct block_device *bdev, sector_t sector, - struct page *page) -{ - const struct block_device_operations *ops = bdev->bd_disk->fops; - int result = -EOPNOTSUPP; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return result; - - result = blk_queue_enter(bdev_get_queue(bdev), 0); - if (result) - return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_READ); - blk_queue_exit(bdev_get_queue(bdev)); - return result; -} - -/** - * bdev_write_page() - Start writing a page to a block device - * @bdev: The device to write the page to - * @sector: The offset on the device to write the page to (need not be aligned) - * @page: The page to write - * @wbc: The writeback_control for the write - * - * On entry, the page should be locked and not currently under writeback. - * On exit, if the write started successfully, the page will be unlocked and - * under writeback. If the write failed already (eg the driver failed to - * queue the page to the device), the page will still be locked. If the - * caller is a ->writepage implementation, it will need to unlock the page. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to write this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. 
- */ -int bdev_write_page(struct block_device *bdev, sector_t sector, - struct page *page, struct writeback_control *wbc) -{ - int result; - const struct block_device_operations *ops = bdev->bd_disk->fops; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return -EOPNOTSUPP; - result = blk_queue_enter(bdev_get_queue(bdev), 0); - if (result) - return result; - - set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_WRITE); - if (result) { - end_page_writeback(page); - } else { - clean_page_buffers(page); - unlock_page(page); - } - blk_queue_exit(bdev_get_queue(bdev)); - return result; -} - /* * pseudo-fs */ diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 20acc4a1fd6d..37dce184eb56 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -309,23 +309,9 @@ static void brd_submit_bio(struct bio *bio) bio_endio(bio); } -static int brd_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - struct brd_device *brd = bdev->bd_disk->private_data; - int err; - - if (PageTransHuge(page)) - return -ENOTSUPP; - err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector); - page_endio(page, op_is_write(op), err); - return err; -} - static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .submit_bio = brd_submit_bio, - .rw_page = brd_rw_page, }; /* @@ -411,6 +397,7 @@ static int brd_alloc(int i) /* Tell the block layer that this is not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); err = add_disk(disk); if (err) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 5d1088a645e3..25526707f607 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1453,10 +1453,6 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, /* Slot should be unlocked before the function call */ zram_slot_unlock(zram, index); - /* A null bio means rw_page was used, we must fallback to bio */ - if (!bio) - return -EOPNOTSUPP; - ret = zram_bvec_read_from_bdev(zram, page, index, bio, partial_io); } @@ -2081,61 +2077,6 @@ static void zram_slot_free_notify(struct block_device *bdev, zram_slot_unlock(zram, index); } -static int zram_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - int offset, ret; - u32 index; - struct zram *zram; - struct bio_vec bv; - unsigned long start_time; - - if (PageTransHuge(page)) - return -ENOTSUPP; - zram = bdev->bd_disk->private_data; - - if (!valid_io_request(zram, sector, PAGE_SIZE)) { - atomic64_inc(&zram->stats.invalid_io); - ret = -EINVAL; - goto out; - } - - index = sector >> SECTORS_PER_PAGE_SHIFT; - offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - - bv.bv_page = page; - bv.bv_len = PAGE_SIZE; - bv.bv_offset = 0; - - start_time = bdev_start_io_acct(bdev->bd_disk->part0, - SECTORS_PER_PAGE, op, jiffies); - ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL); - bdev_end_io_acct(bdev->bd_disk->part0, op, start_time); -out: - /* - * If I/O fails, just return error(ie, non-zero) without - * calling page_endio. - * It causes resubmit the I/O with bio request by upper functions - * of rw_page(e.g., swap_readpage, __swap_writepage) and - * bio->bi_end_io does things to handle the error - * (e.g., SetPageError, set_page_dirty and extra works). 
- */ - if (unlikely(ret < 0)) - return ret; - - switch (ret) { - case 0: - page_endio(page, op_is_write(op), 0); - break; - case 1: - ret = 0; - break; - default: - WARN_ON(1); - } - return ret; -} - static void zram_destroy_comps(struct zram *zram) { u32 prio; @@ -2290,7 +2231,6 @@ static const struct block_device_operations zram_devops = { .open = zram_open, .submit_bio = zram_submit_bio, .swap_slot_free_notify = zram_slot_free_notify, - .rw_page = zram_rw_page, .owner = THIS_MODULE }; @@ -2389,6 +2329,7 @@ static int zram_add(void) set_capacity(zram->disk, 0); /* zram devices sort of resembles non-rotational disks */ blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); /* diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 0297b7882e33..d5593b0dc700 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1482,20 +1482,6 @@ static void btt_submit_bio(struct bio *bio) bio_endio(bio); } -static int btt_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - struct btt *btt = bdev->bd_disk->private_data; - int rc; - - rc = btt_do_bvec(btt, NULL, page, thp_size(page), 0, op, sector); - if (rc == 0) - page_endio(page, op_is_write(op), 0); - - return rc; -} - - static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) { /* some standard values */ @@ -1508,7 +1494,6 @@ static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) static const struct block_device_operations btt_fops = { .owner = THIS_MODULE, .submit_bio = btt_submit_bio, - .rw_page = btt_rw_page, .getgeo = btt_getgeo, }; @@ -1530,6 +1515,7 @@ static int btt_blk_init(struct btt *btt) blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); if (btt_meta_size(btt)) { rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 96e6e9a5f235..ceea55f621cc 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -238,28 +238,6 @@ static void pmem_submit_bio(struct bio *bio) bio_endio(bio); } -static int pmem_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - struct pmem_device *pmem = bdev->bd_disk->private_data; - blk_status_t rc; - - if (op_is_write(op)) - rc = pmem_do_write(pmem, page, 0, sector, thp_size(page)); - else - rc = pmem_do_read(pmem, page, 0, sector, thp_size(page)); - /* - * The ->rw_page interface is subtle and tricky. The core - * retries on any error, so we can only invoke page_endio() in - * the successful completion case. Otherwise, we'll see crashes - * caused by double completion. 
- */ - if (rc == 0) - page_endio(page, op_is_write(op), 0); - - return blk_status_to_errno(rc); -} - /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, @@ -310,7 +288,6 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, .submit_bio = pmem_submit_bio, - .rw_page = pmem_rw_page, }; static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, @@ -565,6 +542,7 @@ static int pmem_attach_disk(struct device *dev, blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); if (pmem->pfn_flags & PFN_MAP) blk_queue_flag_set(QUEUE_FLAG_DAX, q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 43d4e073b111..c5e59965b145 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -554,6 +554,7 @@ struct request_queue { #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ +#define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ @@ -1250,6 +1251,12 @@ static inline bool bdev_nonrot(struct block_device *bdev) return blk_queue_nonrot(bdev_get_queue(bdev)); } +static inline bool bdev_synchronous(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_SYNCHRONOUS, + &bdev_get_queue(bdev)->queue_flags); +} + static inline bool bdev_stable_writes(struct block_device *bdev) { return test_bit(QUEUE_FLAG_STABLE_WRITES, @@ -1382,7 +1389,6 @@ struct block_device_operations { unsigned int flags); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, enum req_op); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, @@ -1417,10 +1423,6 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, #define blkdev_compat_ptr_ioctl NULL #endif -extern int bdev_read_page(struct block_device *, sector_t, struct page *); -extern int bdev_write_page(struct block_device *, sector_t, struct page *, - struct writeback_control *); - static inline void blk_wake_io_task(struct task_struct *waiter) { /* diff --git a/mm/page_io.c b/mm/page_io.c index 0a1a3b831344..a805117f7fd7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -27,7 +27,7 @@ #include #include "swap.h" -static void end_swap_bio_write(struct bio *bio) +static void __end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -48,6 +48,11 @@ static void end_swap_bio_write(struct bio *bio) ClearPageReclaim(page); } end_page_writeback(page); +} + +static void end_swap_bio_write(struct bio *bio) +{ + __end_swap_bio_write(bio); bio_put(bio); } @@ -326,16 +331,32 @@ static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) *wbc->swap_plug = sio; } -static void swap_writepage_bdev(struct page *page, +static 
void swap_writepage_bdev_sync(struct page *page, + struct writeback_control *wbc, struct swap_info_struct *sis) +{ + struct bio_vec bv; + struct bio bio; + + bio_init(&bio, sis->bdev, &bv, 1, + REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); + bio.bi_iter.bi_sector = swap_page_sector(page); + bio_add_page(&bio, page, thp_size(page), 0); + + bio_associate_blkg_from_page(&bio, page); + count_swpout_vm_event(page); + + set_page_writeback(page); + unlock_page(page); + + submit_bio_wait(&bio); + __end_swap_bio_write(&bio); +} + +static void swap_writepage_bdev_async(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis) { struct bio *bio; - if (!bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc)) { - count_swpout_vm_event(page); - return; - } - bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), GFP_NOIO); @@ -362,8 +383,10 @@ void __swap_writepage(struct page *page, struct writeback_control *wbc) */ if (data_race(sis->flags & SWP_FS_OPS)) swap_writepage_fs(page, wbc); + else if (sis->flags & SWP_SYNCHRONOUS_IO) + swap_writepage_bdev_sync(page, wbc, sis); else - swap_writepage_bdev(page, wbc, sis); + swap_writepage_bdev_async(page, wbc, sis); } void swap_write_unplug(struct swap_iocb *sio) @@ -447,12 +470,6 @@ static void swap_readpage_bdev_sync(struct page *page, struct bio_vec bv; struct bio bio; - if ((sis->flags & SWP_SYNCHRONOUS_IO) && - !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { - count_vm_event(PSWPIN); - return; - } - bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); bio.bi_iter.bi_sector = swap_page_sector(page); bio_add_page(&bio, page, thp_size(page), 0); @@ -472,12 +489,6 @@ static void swap_readpage_bdev_async(struct page *page, { struct bio *bio; - if ((sis->flags & SWP_SYNCHRONOUS_IO) && - !bdev_read_page(sis->bdev, swap_page_sector(page), page)) { - count_vm_event(PSWPIN); - return; - } - bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; @@ -513,7 +524,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) unlock_page(page); } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_readpage_fs(page, plug); - } else if (synchronous) { + } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { swap_readpage_bdev_sync(page, sis); } else { swap_readpage_bdev_async(page, sis); diff --git a/mm/swapfile.c b/mm/swapfile.c index af151679d13a..888aed774fb6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3071,7 +3071,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (p->bdev && bdev_stable_writes(p->bdev)) p->flags |= SWP_STABLE_WRITES; - if (p->bdev && p->bdev->bd_disk->fops->rw_page) + if (p->bdev && bdev_synchronous(p->bdev)) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && bdev_nonrot(p->bdev)) { From 00cdf76012ab78b225345e8cf77d5391b4680b45 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 26 Jan 2023 20:15:52 +0000 Subject: [PATCH 334/505] mm: add memcpy_from_file_folio() This is the equivalent of memcpy_from_page(). It differs in that it takes the position in a file instead of offset in a folio, it accepts the total number of bytes to be copied (instead of the number of bytes to be copied from this folio) and it returns how many bytes were copied from the folio, rather than making the caller calculate that and then checking if the caller got it right. 
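As an illustration of those return-value semantics, a caller wanting @len bytes starting at file position @pos only has to advance by whatever the function returns. A sketch only: copy_folios_to_buf() is a hypothetical helper, and folio lookup failures and locking are ignored.

    /*
     * Sketch: copy @len bytes at @pos from @mapping into @buf.  The byte count
     * returned by memcpy_from_file_folio() drives the loop, so the caller never
     * computes per-folio offsets or lengths itself.
     */
    static void copy_folios_to_buf(struct address_space *mapping, char *buf,
                                   loff_t pos, size_t len)
    {
            while (len) {
                    struct folio *folio = filemap_get_folio(mapping,
                                                            pos >> PAGE_SHIFT);
                    size_t copied = memcpy_from_file_folio(buf, folio, pos, len);

                    folio_put(folio);
                    buf += copied;
                    pos += copied;
                    len -= copied;
            }
    }
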
[akpm@linux-foundation.org: fix typo in comment] Link: https://lkml.kernel.org/r/20230126201552.1681588-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: "Fabio M. De Francesco" Cc: Ira Weiny Signed-off-by: Andrew Morton --- include/linux/highmem.h | 29 +++++++++++++++++++++++++++++ include/linux/page-flags.h | 1 + 2 files changed, 30 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index e22509420ac6..348701dae77f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -413,6 +413,35 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * memcpy_from_file_folio - Copy some bytes from a file folio. + * @to: The destination buffer. + * @folio: The folio to copy from. + * @pos: The position in the file. + * @len: The maximum number of bytes to copy. + * + * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE + * if the folio comes from HIGHMEM, and by the size of the folio. + * + * Return: The number of bytes copied from the folio. + */ +static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, + loff_t pos, size_t len) +{ + size_t offset = offset_in_folio(folio, pos); + char *from = kmap_local_folio(folio, offset); + + if (folio_test_highmem(folio)) + len = min_t(size_t, len, PAGE_SIZE - offset); + else + len = min(len, folio_size(folio) - offset); + + memcpy(to, from, len); + kunmap_local(from); + + return len; +} + /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 69e93a0c1277..a7e3a3405520 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -531,6 +531,7 @@ PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) +#define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif From d585bdbeb79aa13b8a9bbe952d90f5252f7fe909 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 26 Jan 2023 20:12:54 +0000 Subject: [PATCH 335/505] fs: convert writepage_t callback to pass a folio Patch series "Convert writepage_t to use a folio". More folioisation. I split out the mpage work from everything else because it completely dominated the patch, but some implementations I just converted outright. This patch (of 2): We always write back an entire folio, but that's currently passed as the head page. Convert all filesystems that use write_cache_pages() to expect a folio instead of a page. 
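The shape of each conversion is the same: the callback now takes a struct folio, per-page helpers become their folio counterparts, and &folio->page bridges to code that has not been converted yet. A minimal sketch of a converted callback, modeled on the cifs change below (fs_writepage_locked() is a hypothetical filesystem helper):

    /* Sketch of a write_cache_pages() callback after the conversion. */
    static int fs_write_one_page(struct folio *folio,
                                 struct writeback_control *wbc, void *data)
    {
            struct address_space *mapping = data;
            int ret;

            /* The legacy helper still wants a page, so hand it the head page. */
            ret = fs_writepage_locked(&folio->page, wbc);
            folio_unlock(folio);
            mapping_set_error(mapping, ret);
            return ret;
    }
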
Link: https://lkml.kernel.org/r/20230126201255.1681189-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230126201255.1681189-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- fs/cifs/file.c | 8 ++++---- fs/ext4/inode.c | 4 ++-- fs/ext4/super.c | 6 +++--- fs/fuse/file.c | 18 +++++++++--------- fs/iomap/buffered-io.c | 5 ++--- fs/mpage.c | 3 ++- fs/nfs/write.c | 7 ++++--- fs/ntfs3/inode.c | 6 +++--- fs/orangefs/inode.c | 23 +++++++++++------------ include/linux/writeback.h | 2 +- mm/page-writeback.c | 6 +++--- 11 files changed, 44 insertions(+), 44 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8cdd2f67af24..162fab5a4583 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2675,14 +2675,14 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, static int cifs_writepage_locked(struct page *page, struct writeback_control *wbc); -static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, - void *data) +static int cifs_write_one_page(struct folio *folio, + struct writeback_control *wbc, void *data) { struct address_space *mapping = data; int ret; - ret = cifs_writepage_locked(page, wbc); - unlock_page(page); + ret = cifs_writepage_locked(&folio->page, wbc); + folio_unlock(folio); mapping_set_error(mapping, ret); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fb6cd994e59a..98c018dcd3fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2711,10 +2711,10 @@ out: return err; } -static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, +static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, void *data) { - return ext4_writepage(page, wbc); + return ext4_writepage(&folio->page, wbc); } static int ext4_do_writepages(struct mpage_da_data *mpd) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 260c1b3e3ef2..49a8942b1e51 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -482,7 +482,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) * * However, we may have to redirty a page (see below.) 
*/ -static int ext4_journalled_writepage_callback(struct page *page, +static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { @@ -490,7 +490,7 @@ static int ext4_journalled_writepage_callback(struct page *page, struct buffer_head *bh, *head; struct journal_head *jh; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: @@ -509,7 +509,7 @@ static int ext4_journalled_writepage_callback(struct page *page, if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 875314ee6f59..3648747fb64d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2184,7 +2184,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, return false; } -static int fuse_writepages_fill(struct page *page, +static int fuse_writepages_fill(struct folio *folio, struct writeback_control *wbc, void *_data) { struct fuse_fill_wb_data *data = _data; @@ -2203,7 +2203,7 @@ static int fuse_writepages_fill(struct page *page, goto out_unlock; } - if (wpa && fuse_writepage_need_send(fc, page, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; } @@ -2238,7 +2238,7 @@ static int fuse_writepages_fill(struct page *page, data->max_pages = 1; ap = &wpa->ia.ap; - fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->next = NULL; ap->args.in_pages = true; @@ -2246,13 +2246,13 @@ static int fuse_writepages_fill(struct page *page, ap->num_pages = 0; wpa->inode = inode; } - set_page_writeback(page); + folio_start_writeback(folio); - copy_highpage(tmp_page, page); + copy_highpage(tmp_page, &folio->page); ap->pages[ap->num_pages] = tmp_page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; - data->orig_pages[ap->num_pages] = page; + data->orig_pages[ap->num_pages] = &folio->page; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); @@ -2266,13 +2266,13 @@ static int fuse_writepages_fill(struct page *page, spin_lock(&fi->lock); ap->num_pages++; spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, page)) { + } else if (fuse_writepage_add(wpa, &folio->page)) { data->wpa = wpa; } else { - end_page_writeback(page); + folio_end_writeback(folio); } out_unlock: - unlock_page(page); + folio_unlock(folio); return err; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 356193e44cf0..292d273a2c80 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1685,10 +1685,9 @@ done: * For unwritten space on the page, we need to start the conversion to * regular allocated space. 
*/ -static int -iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) +static int iomap_do_writepage(struct folio *folio, + struct writeback_control *wbc, void *data) { - struct folio *folio = page_folio(page); struct iomap_writepage_ctx *wpc = data; struct inode *inode = folio->mapping->host; u64 end_pos, isize; diff --git a/fs/mpage.c b/fs/mpage.c index 55988ea994ee..4890369bb10a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -440,9 +440,10 @@ void clean_page_buffers(struct page *page) clean_buffers(page, ~0U); } -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, +static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { + struct page *page = &folio->page; struct mpage_data *mpd = data; struct bio *bio = mpd->bio; struct address_space *mapping = page->mapping; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 80c240e50952..9d6432cb3f44 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -689,13 +689,14 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct folio *folio, + struct writeback_control *wbc, void *data) { int ret; - ret = nfs_do_writepage(page, wbc, data); + ret = nfs_do_writepage(&folio->page, wbc, data); if (ret != AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); + folio_unlock(folio); return ret; } diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 6b50b6e32378..9c646615f714 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -832,7 +832,7 @@ out: return err; } -static int ntfs_resident_writepage(struct page *page, +static int ntfs_resident_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; @@ -840,11 +840,11 @@ static int ntfs_resident_writepage(struct page *page, int ret; ni_lock(ni); - ret = attr_data_write_resident(ni, page); + ret = attr_data_write_resident(ni, &folio->page); ni_unlock(ni); if (ret != E_NTFS_NONRESIDENT) - unlock_page(page); + folio_unlock(folio); mapping_set_error(mapping, ret); return ret; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 4df560894386..c25468974c8a 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -154,21 +154,20 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, return ret; } -static int orangefs_writepages_callback(struct page *page, - struct writeback_control *wbc, void *data) +static int orangefs_writepages_callback(struct folio *folio, + struct writeback_control *wbc, void *data) { struct orangefs_writepages *ow = data; - struct orangefs_write_range *wr; + struct orangefs_write_range *wr = folio->private; int ret; - if (!PagePrivate(page)) { - unlock_page(page); + if (!wr) { + folio_unlock(folio); /* It's not private so there's nothing to write, right? 
*/ printk("writepages_callback not private!\n"); BUG(); return 0; } - wr = (struct orangefs_write_range *)page_private(page); ret = -1; if (ow->npages == 0) { @@ -176,7 +175,7 @@ static int orangefs_writepages_callback(struct page *page, ow->len = wr->len; ow->uid = wr->uid; ow->gid = wr->gid; - ow->pages[ow->npages++] = page; + ow->pages[ow->npages++] = &folio->page; ret = 0; goto done; } @@ -188,7 +187,7 @@ static int orangefs_writepages_callback(struct page *page, } if (ow->off + ow->len == wr->pos) { ow->len += wr->len; - ow->pages[ow->npages++] = page; + ow->pages[ow->npages++] = &folio->page; ret = 0; goto done; } @@ -198,10 +197,10 @@ done: orangefs_writepages_work(ow, wbc); ow->npages = 0; } - ret = orangefs_writepage_locked(page, wbc); - mapping_set_error(page->mapping, ret); - unlock_page(page); - end_page_writeback(page); + ret = orangefs_writepage_locked(&folio->page, wbc); + mapping_set_error(folio->mapping, ret); + folio_unlock(folio); + folio_end_writeback(folio); } else { if (ow->npages == ow->maxpages) { orangefs_writepages_work(ow, wbc); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f1491b07474..46020373e155 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -366,7 +366,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, bool wb_over_bg_thresh(struct bdi_writeback *wb); -typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, +typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); void tag_pages_for_writeback(struct address_space *mapping, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 92b90d2ab513..516b1aa247e8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2470,7 +2470,7 @@ continue_unlock: goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - error = writepage(&folio->page, wbc, data); + error = writepage(folio, wbc, data); if (unlikely(error)) { /* * Handle errors according to the type of @@ -2528,11 +2528,11 @@ continue_unlock: } EXPORT_SYMBOL(write_cache_pages); -static int writepage_cb(struct page *page, struct writeback_control *wbc, +static int writepage_cb(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); + int ret = mapping->a_ops->writepage(&folio->page, wbc); mapping_set_error(mapping, ret); return ret; } From 9160cffd45ee93bc20de134e4f127dac9af0cc18 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 26 Jan 2023 20:12:55 +0000 Subject: [PATCH 336/505] mpage: convert __mpage_writepage() to use a folio more fully This is just a conversion to the folio API. While there are some nods towards supporting multi-page folios in here, the blocks array is still sized for one page's worth of blocks, and there are other assumptions such as the blocks_per_page variable. 
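One place the conversion changes the shape of the logic is the i_size handling: page-index arithmetic is replaced by byte-based folio_pos()/folio_size() checks. Pulled out as a standalone helper purely for illustration (the function name is hypothetical; __mpage_writepage() open-codes this), it is roughly:

    /*
     * Illustrative only: return how many bytes of @folio lie within @i_size,
     * zeroing any tail beyond EOF (the folio may be mmapped, and reads of that
     * region must see zeroes), or 0 if the folio is entirely past EOF and
     * should not be written at all.
     */
    static size_t mpage_valid_length(struct folio *folio, loff_t i_size)
    {
            size_t length = folio_size(folio);

            if (folio_pos(folio) >= i_size)
                    return 0;

            if (folio_pos(folio) + length > i_size) {
                    length = i_size - folio_pos(folio);
                    folio_zero_segment(folio, length, folio_size(folio));
            }
            return length;
    }
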
[willy@infradead.org: fix accidentally-triggering WARN_ON_ONCE] Link: https://lkml.kernel.org/r/Y9kuaBgXf9lKJ8b0@casper.infradead.org Link: https://lkml.kernel.org/r/20230126201255.1681189-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/mpage.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index 4890369bb10a..b9b7f6dc9c37 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -443,13 +443,11 @@ void clean_page_buffers(struct page *page) static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct page *page = &folio->page; struct mpage_data *mpd = data; struct bio *bio = mpd->bio; - struct address_space *mapping = page->mapping; - struct inode *inode = page->mapping->host; + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; - unsigned long end_index; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; sector_t last_block; sector_t block_in_file; @@ -460,13 +458,13 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, int boundary = 0; sector_t boundary_block = 0; struct block_device *boundary_bdev = NULL; - int length; + size_t length; struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + struct buffer_head *head = folio_buffers(folio); - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); + if (head) { struct buffer_head *bh = head; /* If they're all mapped and dirty, do it */ @@ -518,8 +516,8 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, /* * The page has no buffers: map it to disk */ - BUG_ON(!PageUptodate(page)); - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); + BUG_ON(!folio_test_uptodate(folio)); + block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); /* * Whole page beyond EOF? Skip allocating blocks to avoid leaking * space. @@ -527,7 +525,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) goto page_is_mapped; last_block = (i_size - 1) >> blkbits; - map_bh.b_page = page; + map_bh.b_folio = folio; for (page_block = 0; page_block < blocks_per_page; ) { map_bh.b_state = 0; @@ -556,8 +554,11 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, first_unmapped = page_block; page_is_mapped: - end_index = i_size >> PAGE_SHIFT; - if (page->index >= end_index) { + /* Don't bother writing beyond EOF, truncate will discard the folio */ + if (folio_pos(folio) >= i_size) + goto confused; + length = folio_size(folio); + if (folio_pos(folio) + length > i_size) { /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. @@ -566,11 +567,8 @@ page_is_mapped: * is zeroed when mapped, and writes to that region are not * written out to the file." */ - unsigned offset = i_size & (PAGE_SIZE - 1); - - if (page->index > end_index || !offset) - goto confused; - zero_user_segment(page, offset, PAGE_SIZE); + length = i_size - folio_pos(folio); + folio_zero_segment(folio, length, folio_size(folio)); } /* @@ -593,18 +591,18 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. 
it will not write anything) */ - wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); length = first_unmapped << blkbits; - if (bio_add_page(bio, page, length, 0) < length) { + if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit(bio); goto alloc_new; } - clean_buffers(page, first_unmapped); + clean_buffers(&folio->page, first_unmapped); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_page)) { bio = mpage_bio_submit(bio); if (boundary_block) { @@ -623,7 +621,7 @@ confused: /* * The caller has a ref on the inode, so *mapping is stable */ - ret = block_write_full_page(page, mpd->get_block, wbc); + ret = block_write_full_page(&folio->page, mpd->get_block, wbc); mapping_set_error(mapping, ret); out: mpd->bio = bio; From 6f74c0ec2095335158015ce29b708e775b9cea3a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 8 Feb 2023 15:08:01 +0100 Subject: [PATCH 337/505] arm/mm: fix swp type masking in __swp_entry() We're masking with the number of type bits instead of the type mask, which is obviously wrong. Link: https://lkml.kernel.org/r/39fd91e3-c93b-23c6-afc6-cbe473bb0ca9@redhat.com Fixes: 20aae9eff5ac ("arm/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE") Signed-off-by: David Hildenbrand Reported-by: Mark Brown Tested-by: Mark Brown Cc: Russell King (Oracle) Signed-off-by: Andrew Morton --- arch/arm/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 2e626e6da9a3..a58ccbb406ad 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -292,7 +292,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) -#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & __SWP_TYPE_BITS) << __SWP_TYPE_SHIFT) | \ +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT) | \ ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) From c643e6ebedb435bcf863001f5e69a578f2658055 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 3 Feb 2023 16:28:40 -0500 Subject: [PATCH 338/505] mm: fix memcpy_from_file_folio() integer underflow If we have a HIGHMEM system with a large folio, 'offset' may be larger than PAGE_SIZE, and so min_t will cap at 'len' instead of the intended end-of-page. That can overflow into the next page which is likely to be unmapped and fault, but could theoretically copy the wrong data. Link: https://lkml.kernel.org/r/Y919vmSrtAgsf6K3@casper.infradead.org Fixes: 00cdf76012ab ("mm: add memcpy_from_file_folio()") Signed-off-by: Matthew Wilcox (Oracle) Cc: "Fabio M. 
De Francesco" Cc: Ira Weiny Signed-off-by: Andrew Morton --- include/linux/highmem.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 348701dae77f..b06254e76d99 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -431,9 +431,10 @@ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); - if (folio_test_highmem(folio)) + if (folio_test_highmem(folio)) { + offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); - else + } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); From e7f43ca99fc8bff2333547bb08dae20a35a23450 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:02 -0500 Subject: [PATCH 339/505] maple_tree: add mas_init() function Patch series "VMA tree type safety and remove __vma_adjust()", v4. This patchset does two things: 1. Clean up, including removal of __vma_adjust() and 2. Extends the VMA iterator API to provide type safety to the VMA operations using the maple tree, as requested by Linus [1]. It also addresses another issue of usability brought up by Linus about needing to modify the maple state within the loops. The maple state has been replaced by the VMA iterator and the iterator is now modified within the MM code so the caller should not need to worry about doing the work themselves when tree modifications occur. This brought up a potential inconsistency of the iterator state and what the user expects, so the inconsistency is addressed to keep the VMA iterator safe for use after the looping over a VMA range. This is addressed in patch 3 ("maple_tree: Reduce user error potential") and 4 ("test_maple_tree: Test modifications while iterating"). While cleaning up the state, the duplicate locking code in mm/mmap.c introduced by the maple tree has been address by abstracting it to two functions: vma_prepare() and vma_complete(). These abstractions allowed for a much simpler __vma_adjust(), which eventually leads to the removal of the __vma_adjust() function by placing the logic into the vma_merge() function itself. 1. https://lore.kernel.org/linux-mm/CAHk-=wg9WQXBGkNdKD2bqocnN73rDswuWsavBB7T-tekykEn_A@mail.gmail.com/ This patch (of 49): Add a function that will zero out the maple state struct and set some basic defaults. Link: https://lkml.kernel.org/r/20230120162650.984577-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230120162650.984577-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a7bf58fd7cc6..1fadb5f5978b 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -432,6 +432,7 @@ struct ma_wr_state { .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ + .mas_flags = 0, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ @@ -470,6 +471,16 @@ void *mas_next(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); +static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, + unsigned long addr) +{ + memset(mas, 0, sizeof(struct ma_state)); + mas->tree = tree; + mas->index = mas->last = addr; + mas->max = ULONG_MAX; + mas->node = MAS_START; +} + /* Checks if a mas has not found anything */ static inline bool mas_is_none(struct ma_state *mas) { From 65be6f058b0eba98dc6c6f197ea9f62c9b6a519f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:03 -0500 Subject: [PATCH 340/505] maple_tree: fix potential rcu issue Ensure the node isn't dead after reading the node end. Link: https://lkml.kernel.org/r/20230120162650.984577-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 1c5d3b640a24..7e3cf5b7e68b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4655,13 +4655,13 @@ static inline void *mas_next_nentry(struct ma_state *mas, pivots = ma_pivots(node, type); slots = ma_slots(node, type); mas->index = mas_safe_min(mas, pivots, mas->offset); + count = ma_data_end(node, type, pivots, mas->max); if (ma_dead_node(node)) return NULL; if (mas->index > max) return NULL; - count = ma_data_end(node, type, pivots, mas->max); if (mas->offset > count) return NULL; From 50e81c82ad947045c7ed26ddc9acb17276b653b6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:04 -0500 Subject: [PATCH 341/505] maple_tree: reduce user error potential When iterating, a user may operate on the tree and cause the maple state to be altered and left in an unintuitive state. Detect this scenario and correct it by setting to the limit and invalidating the state. Link: https://lkml.kernel.org/r/20230120162650.984577-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7e3cf5b7e68b..5804c5997598 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4732,6 +4732,11 @@ static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) unsigned long last; enum maple_type mt; + if (mas->index > limit) { + mas->index = mas->last = limit; + mas_pause(mas); + return NULL; + } last = mas->last; retry: offset = mas->offset; @@ -4838,6 +4843,11 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) { void *entry; + if (mas->index < min) { + mas->index = mas->last = min; + mas_pause(mas); + return NULL; + } retry: while (likely(!mas_is_none(mas))) { entry = mas_prev_nentry(mas, min, mas->index); From 5159d64b335401fa83f18c27e2267f1eafc41bd3 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 20 Jan 2023 11:26:05 -0500 Subject: [PATCH 342/505] test_maple_tree: test modifications while iterating Add a testcase to ensure the iterator detects bad states on modifications and does what the user expects Link: https://lkml.kernel.org/r/20230120162650.984577-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 72 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index ec847bf4dcb4..3d19b1f78d71 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1709,6 +1709,74 @@ static noinline void check_forking(struct maple_tree *mt) mtree_destroy(&newmt); } +static noinline void check_iteration(struct maple_tree *mt) +{ + int i, nr_entries = 125; + void *val; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i * 10, i * 10 + 9, + xa_mk_value(i), GFP_KERNEL); + + mt_set_non_kernel(99999); + + i = 0; + mas_lock(&mas); + mas_for_each(&mas, val, 925) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite end of entry 92 */ + if (i == 92) { + mas.index = 925; + mas.last = 929; + mas_store(&mas, val); + } + i++; + } + /* Ensure mas_find() gets the next value */ + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(i)); + + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 785) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite start of entry 78 */ + if (i == 78) { + mas.index = 780; + mas.last = 785; + mas_store(&mas, val); + } else { + i++; + } + } + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(i)); + + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 765) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite end of entry 76 and advance to the end */ + if (i == 76) { + mas.index = 760; + mas.last = 765; + mas_store(&mas, val); + mas_next(&mas, ULONG_MAX); + } + i++; + } + /* Make sure the next find returns the one after 765, 766-769 */ + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(76)); + mas_unlock(&mas); + mas_destroy(&mas); + mt_set_non_kernel(0); +} + static noinline void check_mas_store_gfp(struct maple_tree *mt) { @@ -2659,6 +2727,10 @@ static int maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_iteration(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_forking(&tree); mtree_destroy(&tree); From 1202700c3f8cc5f7e4646c3cf05ee6f7c8bc6ccf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:06 -0500 Subject: [PATCH 343/505] maple_tree: fix handle of invalidated state in mas_wr_store_setup() If an invalidated maple state is encountered during write, reset the maple state to MAS_START. This will result in a re-walk of the tree to the correct location for the write. Link: https://lore.kernel.org/all/20230107020126.1627-1-sj@kernel.org/ Link: https://lkml.kernel.org/r/20230120162650.984577-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reported-by: SeongJae Park Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5804c5997598..7c786bd5e575 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5609,6 +5609,9 @@ static inline void mte_destroy_walk(struct maple_enode *enode, static void mas_wr_store_setup(struct ma_wr_state *wr_mas) { + if (unlikely(mas_is_paused(wr_mas->mas))) + mas_reset(wr_mas->mas); + if (!mas_is_start(wr_mas->mas)) { if (mas_is_none(wr_mas->mas)) { mas_reset(wr_mas->mas); From 17dc622c7b0f94e49bed030726df4db12ecaa6b5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:07 -0500 Subject: [PATCH 344/505] maple_tree: fix mas_prev() and mas_find() state handling When mas_prev() does not find anything, set the state to MAS_NONE. Handle the MAS_NONE in mas_find() like a MAS_START. Link: https://lkml.kernel.org/r/20230120162650.984577-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reported-by: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7c786bd5e575..5e9703189259 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4845,7 +4845,7 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) if (mas->index < min) { mas->index = mas->last = min; - mas_pause(mas); + mas->node = MAS_NONE; return NULL; } retry: @@ -5919,6 +5919,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min) if (!mas->index) { /* Nothing comes before 0 */ mas->last = 0; + mas->node = MAS_NONE; return NULL; } @@ -6009,6 +6010,9 @@ void *mas_find(struct ma_state *mas, unsigned long max) mas->index = ++mas->last; } + if (unlikely(mas_is_none(mas))) + mas->node = MAS_START; + if (unlikely(mas_is_start(mas))) { /* First run or continue */ void *entry; From b62b633e048bbddef90b2e55d2e33823187b425f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:08 -0500 Subject: [PATCH 345/505] mm: expand vma iterator interface Add wrappers for the maple tree to the vma iterator. This will provide type safety at compile time. Link: https://lkml.kernel.org/r/20230120162650.984577-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 46 ++++++++++++++++++++++++++--- include/linux/mm_types.h | 4 +-- mm/internal.h | 64 ++++++++++++++++++++++++++++++++++++++++ mm/mmap.c | 18 +++++++++++ 4 files changed, 125 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c9db257f09b3..b977a90d9829 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -670,16 +670,16 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { - return mas_find(&vmi->mas, max); + return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* - * Uses vma_find() to get the first VMA when the iterator starts. + * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. 
*/ - return vma_find(vmi, ULONG_MAX); + return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) @@ -692,12 +692,50 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) return vmi->mas.index; } +static inline unsigned long vma_iter_end(struct vma_iterator *vmi) +{ + return vmi->mas.last + 1; +} +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, + unsigned long count) +{ + return mas_expected_entries(&vmi->mas, count); +} + +/* Free any unused preallocations */ +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline int vma_iter_bulk_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store(&vmi->mas, vma); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ - while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 452920467223..5ca11c6c46e8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -849,9 +849,7 @@ struct vma_iterator { static inline void vma_iter_init(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long addr) { - vmi->mas.tree = &mm->mm_mt; - vmi->mas.index = addr; - vmi->mas.node = MAS_START; + mas_init(&vmi->mas, &mm->mm_mt, addr); } struct mmu_gather; diff --git a/mm/internal.h b/mm/internal.h index 2d1b9fa8083e..ffd65248f266 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -877,4 +877,68 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) return !(vma->vm_flags & VM_SOFTDIRTY); } +/* + * VMA Iterator functions shared between nommu and mmap + */ +static inline int vma_iter_prealloc(struct vma_iterator *vmi) +{ + return mas_preallocate(&vmi->mas, GFP_KERNEL); +} + +static inline void vma_iter_clear(struct vma_iterator *vmi, + unsigned long start, unsigned long end) +{ + mas_set_range(&vmi->mas, start, end - 1); + mas_store_prealloc(&vmi->mas, NULL); +} + +static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) +{ + return mas_walk(&vmi->mas); +} + +/* Store a VMA with preallocated memory */ +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) { + printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { + printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } +#endif + + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < 
vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_prealloc(&vmi->mas, vma); +} + +static inline int vma_iter_store_gfp(struct vma_iterator *vmi, + struct vm_area_struct *vma, gfp_t gfp) +{ + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_gfp(&vmi->mas, vma, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index ffc0815cd7fb..db70f3e2181e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -144,6 +144,24 @@ static void remove_vma(struct vm_area_struct *vma) vm_area_free(vma); } +static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, + unsigned long min) +{ + return mas_prev(&vmi->mas, min); +} + +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + vmi->mas.index = start; + vmi->mas.last = end - 1; + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. From 92fed82047d7febc83614a9579c37f1ce80442be Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:09 -0500 Subject: [PATCH 346/505] mm/mmap: convert brk to use vma iterator Use the vma iterator API for the brk() system call. This will provide type safety at compile time. Link: https://lkml.kernel.org/r/20230120162650.984577-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index db70f3e2181e..94a477a55109 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -180,10 +180,10 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_check(current->mm, current->mm->def_flags, len); } -static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long newbrk, unsigned long oldbrk, struct list_head *uf); -static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, +static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -194,7 +194,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool populate; bool downgraded = false; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; @@ -242,8 +242,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) int ret; /* Search one past newbrk */ - mas_set(&mas, newbrk); - brkvma = mas_find(&mas, oldbrk); + vma_iter_init(&vmi, mm, newbrk); + brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* @@ -252,7 +252,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * before calling do_brk_munmap(). 
*/ mm->brk = brk; - ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + ret = do_brk_munmap(&vmi, brkvma, newbrk, oldbrk, &uf); if (ret == 1) { downgraded = true; goto success; @@ -270,14 +270,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ - mas_set(&mas, oldbrk); - next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); + vma_iter_init(&vmi, mm, oldbrk); + next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; - brkvma = mas_prev(&mas, mm->start_brk); + brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) + if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; @@ -2917,8 +2917,8 @@ out: } /* - * brk_munmap() - Unmap a partial vma. - * @mas: The maple tree state. + * brk_munmap() - Unmap a full or partial vma. + * @vmi: The vma iterator * @vma: The vma to be modified * @newbrk: the start of the address to unmap * @oldbrk: The end of the address to unmap @@ -2928,7 +2928,7 @@ out: * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if * possible. */ -static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long newbrk, unsigned long oldbrk, struct list_head *uf) { @@ -2936,14 +2936,14 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true); + ret = do_mas_align_munmap(&vmi->mas, vma, mm, newbrk, oldbrk, uf, true); validate_mm_mt(mm); return ret; } /* * do_brk_flags() - Increase the brk vma if the flags match. - * @mas: The maple tree state. + * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, @@ -2953,7 +2953,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. 
*/ -static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; @@ -2980,8 +2980,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (vma && vma->vm_end == addr && !vma_policy(vma) && can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_preallocate(mas, GFP_KERNEL)) + if (vma_iter_prealloc(vmi)) goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); @@ -2991,7 +2990,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, } vma->vm_end = addr + len; vma->vm_flags |= VM_SOFTDIRTY; - mas_store_prealloc(mas, vma); + vma_iter_store(vmi, vma); if (vma->anon_vma) { anon_vma_interval_tree_post_update_vma(vma); @@ -3012,8 +3011,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, vma->vm_pgoff = addr >> PAGE_SHIFT; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_store_gfp(mas, vma, GFP_KERNEL)) + if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; @@ -3042,7 +3040,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) int ret; bool populate; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); len = PAGE_ALIGN(request); if (len < request) @@ -3061,12 +3059,12 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); + ret = do_mas_munmap(&vmi.mas, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; - vma = mas_prev(&mas, 0); - ret = do_brk_flags(&mas, vma, addr, len, flags); + vma = vma_prev(&vmi); + ret = do_brk_flags(&vmi, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); From 3b9dbd5e91b11911d21effbb80d1976fb21660df Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:10 -0500 Subject: [PATCH 347/505] kernel/fork: convert forking to using the vmi iterator Avoid using the maple tree interface directly. This gains type safety. Link: https://lkml.kernel.org/r/20230120162650.984577-10-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- kernel/fork.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 9f7fe3541897..441dcec60aae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,8 +585,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(old_vmi, oldmm, 0); + VMA_ITERATOR(vmi, mm, 0); uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { @@ -613,11 +613,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); + retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count); if (retval) goto out; - mas_for_each(&old_mas, mpnt, ULONG_MAX) { + for_each_vma(old_vmi, mpnt) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -683,11 +683,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, hugetlb_dup_vma_private(tmp); /* Link the vma into the MT */ - mas.index = tmp->vm_start; - mas.last = tmp->vm_end - 1; - mas_store(&mas, tmp); - if (mas_is_err(&mas)) - goto fail_nomem_mas_store; + if (vma_iter_bulk_store(&vmi, tmp)) + goto fail_nomem_vmi_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -702,7 +699,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: - mas_destroy(&mas); + vma_iter_free(&vmi); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -712,7 +709,7 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_mas_store: +fail_nomem_vmi_store: unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); From 79e4f2caa4401e56f8df34f658c43bacddc0ae03 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:11 -0500 Subject: [PATCH 348/505] mmap: convert vma_link() vma iterator Avoid using the maple tree interface directly. Link: https://lkml.kernel.org/r/20230120162650.984577-11-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 94a477a55109..17de99f31ff5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -487,10 +487,10 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct address_space *mapping = NULL; - if (mas_preallocate(&mas, GFP_KERNEL)) + if (vma_iter_prealloc(&vmi)) return -ENOMEM; if (vma->vm_file) { @@ -498,7 +498,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) i_mmap_lock_write(mapping); } - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); if (mapping) { __vma_link_file(vma, mapping); From 0378c0a0e9e463b9e31b94fbbbc10f94b34225b6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:12 -0500 Subject: [PATCH 349/505] mm/mmap: remove preallocation from do_mas_align_munmap() In preparation of passing the vma state through split, the pre-allocation that occurs before the split has to be moved to after. Since the preallocation would then live right next to the store, just call store instead of preallocating. This effectively restores the potential error path of splitting and not munmap'ing which pre-dates the maple tree. 
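As a rough illustration of the shape of this change (a sketch only, simplified from the hunks below, not additional patch content): the node allocation moves from a separate preallocation step to the store itself, so the store becomes the call that can fail.

	/* before: preallocate early, then a store that cannot fail */
	if (mas_preallocate(mas, GFP_KERNEL))
		return -ENOMEM;
	/* ... split and detach VMAs ... */
	mas_set_range(mas, start, end - 1);
	mas_store_prealloc(mas, NULL);

	/* after: allocate at the store itself; a failure here returns
	 * -ENOMEM with any splits left in place but the range still
	 * mapped, matching the pre-maple-tree behaviour
	 */
	mas_set_range(mas, start, end - 1);
	if (mas_store_gfp(mas, NULL, GFP_KERNEL))
		return -ENOMEM;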
Link: https://lkml.kernel.org/r/20230120162650.984577-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 17de99f31ff5..d46a798c38ab 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2329,9 +2329,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - if (mas_preallocate(mas, GFP_KERNEL)) - return -ENOMEM; - mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. @@ -2422,8 +2419,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, goto userfaultfd_error; } - /* Point of no return */ - mas_set_range(mas, start, end - 1); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { @@ -2431,6 +2426,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; + mas_set_range(mas, start, end - 1); rcu_read_lock(); vma_test = mas_find(&test, end - 1); mas_for_each(mas, vma_mas, end - 1) { @@ -2440,10 +2436,13 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } rcu_read_unlock(); BUG_ON(count != test_count); - mas_set_range(mas, start, end - 1); } #endif - mas_store_prealloc(mas, NULL); + /* Point of no return */ + mas_set_range(mas, start, end - 1); + if (mas_store_gfp(mas, NULL, GFP_KERNEL)) + return -ENOMEM; + mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or @@ -2475,7 +2474,6 @@ end_split_failed: __mt_destroy(&mt_detach); start_split_failed: map_count_exceeded: - mas_destroy(mas); return error; } From 183654ce26a5d5bd7bc11bcb02e8086f02f66d7d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:13 -0500 Subject: [PATCH 350/505] mmap: change do_mas_munmap and do_mas_aligned_munmap() to use vma iterator Start passing the vma iterator through the mm code. This will allow for reuse of the state and cleaner invalidation if necessary. Link: https://lkml.kernel.org/r/20230120162650.984577-13-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/mmap.c | 77 +++++++++++++++++++++------------------------- mm/mremap.c | 6 ++-- 3 files changed, 39 insertions(+), 46 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b977a90d9829..152a1362b800 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2905,7 +2905,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, diff --git a/mm/mmap.c b/mm/mmap.c index d46a798c38ab..5b83023ba6a6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2305,8 +2305,8 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, } /* - * do_mas_align_munmap() - munmap the aligned region from @start to @end. - * @mas: The maple_state, ideally set up to alter the correct tree location. 
+ * do_vmi_align_munmap() - munmap the aligned region from @start to @end. + * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. @@ -2317,7 +2317,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, * If @downgrade is true, check return code for potential release of the lock. */ static int -do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool downgrade) { @@ -2329,7 +2329,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. * @@ -2349,27 +2348,23 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - /* - * mas_pause() is not needed since mas->index needs to be set - * differently than vma->vm_end anyways. - */ error = __split_vma(mm, vma, start, 0); if (error) goto start_split_failed; - mas_set(mas, start); - vma = mas_walk(mas); + vma_iter_set(vmi, start); + vma = vma_find(vmi, end); } - prev = mas_prev(mas, 0); + prev = vma_prev(vmi); if (unlikely((!prev))) - mas_set(mas, start); + vma_iter_set(vmi, start); /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ - mas_for_each(mas, next, end - 1) { + for_each_vma_range(*vmi, next, end) { /* Does it split the end? */ if (next->vm_end > end) { struct vm_area_struct *split; @@ -2378,8 +2373,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (error) goto end_split_failed; - mas_set(mas, end); - split = mas_prev(mas, 0); + vma_iter_set(vmi, end); + split = vma_prev(vmi); error = munmap_sidetree(split, &mas_detach); if (error) goto munmap_sidetree_failed; @@ -2401,7 +2396,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } if (!next) - next = mas_next(mas, ULONG_MAX); + next = vma_next(vmi); if (unlikely(uf)) { /* @@ -2426,10 +2421,10 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; - mas_set_range(mas, start, end - 1); + vma_iter_set(vmi, start); rcu_read_lock(); vma_test = mas_find(&test, end - 1); - mas_for_each(mas, vma_mas, end - 1) { + for_each_vma_range(*vmi, vma_mas, end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, end - 1); @@ -2439,8 +2434,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, } #endif /* Point of no return */ - mas_set_range(mas, start, end - 1); - if (mas_store_gfp(mas, NULL, GFP_KERNEL)) + vma_iter_set(vmi, start); + if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL)) return -ENOMEM; mm->map_count -= count; @@ -2478,8 +2473,8 @@ map_count_exceeded: } /* - * do_mas_munmap() - munmap a given range. - * @mas: The maple state + * do_vmi_munmap() - munmap a given range. + * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap @@ -2493,7 +2488,7 @@ map_count_exceeded: * * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. 
*/ -int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade) { @@ -2511,11 +2506,11 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, arch_unmap(mm, start, end); /* Find the first overlapping VMA */ - vma = mas_find(mas, end - 1); + vma = vma_find(vmi, end); if (!vma) return 0; - return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade); + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. @@ -2527,9 +2522,9 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); - return do_mas_munmap(&mas, mm, start, len, uf, false); + return do_vmi_munmap(&vmi, mm, start, len, uf, false); } unsigned long mmap_region(struct file *file, unsigned long addr, @@ -2545,7 +2540,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long merge_start = addr, merge_end = end; pgoff_t vm_pgoff; int error; - MA_STATE(mas, &mm->mm_mt, addr, end - 1); + VMA_ITERATOR(vmi, mm, addr); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -2563,7 +2558,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_mas_munmap(&mas, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) return -ENOMEM; /* @@ -2576,8 +2571,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vm_flags |= VM_ACCOUNT; } - next = mas_next(&mas, ULONG_MAX); - prev = mas_prev(&mas, 0); + next = vma_next(&vmi); + prev = vma_prev(&vmi); if (vm_flags & VM_SPECIAL) goto cannot_expand; @@ -2605,13 +2600,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Actually expand, if possible */ if (vma && - !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + !vma_expand(&vmi.mas, vma, merge_start, merge_end, vm_pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } - mas.index = addr; - mas.last = end - 1; cannot_expand: /* * Determine the object being mapped and call the appropriate @@ -2650,7 +2643,7 @@ cannot_expand: error = -EINVAL; goto close_and_free_vma; } - mas_reset(&mas); + vma_iter_set(&vmi, addr); /* * If vm_flags changed after call_mmap(), we should try merge @@ -2706,7 +2699,7 @@ cannot_expand: goto free_vma; } - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { error = -ENOMEM; if (file) goto close_and_free_vma; @@ -2719,7 +2712,7 @@ cannot_expand: if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); mm->map_count++; if (vma->vm_file) { if (vma->vm_flags & VM_SHARED) @@ -2780,7 +2773,7 @@ unmap_and_free_vma: vma->vm_file = NULL; /* Undo any partial mapping done by a device driver. 
*/ - unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); + unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, vma->vm_end); if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); free_vma: @@ -2797,12 +2790,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade); + ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade); /* * Returning 1 indicates mmap_lock is downgraded. * But 1 is not legal return value of vm_munmap() and munmap(), reset @@ -2934,7 +2927,7 @@ static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_align_munmap(&vmi->mas, vma, mm, newbrk, oldbrk, uf, true); + ret = do_vmi_align_munmap(vmi, vma, mm, newbrk, oldbrk, uf, true); validate_mm_mt(mm); return ret; } @@ -3057,7 +3050,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - ret = do_mas_munmap(&vmi.mas, mm, addr, len, &uf, 0); + ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; diff --git a/mm/mremap.c b/mm/mremap.c index 05f90f47e149..3cc64c3f8bdb 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -978,14 +978,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_mas_munmap does all the needed commit accounting, and + * do_vmi_munmap does all the needed commit accounting, and * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; - MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); + VMA_ITERATOR(vmi, mm, addr + new_len); - retval = do_mas_munmap(&mas, mm, addr + new_len, + retval = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len, &uf_unmap, true); /* Returning 1 indicates mmap_lock is downgraded to read. */ if (retval == 1) { From 3c441ab7d059ebfd2535a52c001c50eac5d63886 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:14 -0500 Subject: [PATCH 351/505] mmap: convert vma_expand() to use vma iterator Use the vma iterator instead of the maple state for type safety and for consistency through the mm code. Link: https://lkml.kernel.org/r/20230120162650.984577-14-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5b83023ba6a6..7e406416cf47 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -527,7 +527,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) * * Returns: 0 on success */ -inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, +inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next) { @@ -556,7 +556,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Only handles expanding */ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - if (mas_preallocate(mas, GFP_KERNEL)) + if (vma_iter_prealloc(vmi)) goto nomem; vma_adjust_trans_huge(vma, start, end, 0); @@ -581,8 +581,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; - /* Note: mas must be pointing to the expanding VMA */ - vma_mas_store(vma, mas); + vma_iter_store(vmi, vma); if (file) { vma_interval_tree_insert(vma, root); @@ -2600,7 +2599,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Actually expand, if possible */ if (vma && - !vma_expand(&vmi.mas, vma, merge_start, merge_end, vm_pgoff, next)) { + !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } From f2ebfe43ba6c845e70b6acbabd6c69ab74b3c52e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:15 -0500 Subject: [PATCH 352/505] mm: add temporary vma iterator versions of vma_merge(), split_vma(), and __split_vma() These wrappers are short-lived in this patch set so that each user can be converted on its own. In the end, these functions are renamed in one commit. Link: https://lkml.kernel.org/r/20230120162650.984577-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++++- mm/mmap.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 152a1362b800..956025940053 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2843,11 +2843,20 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); +extern struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, + struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, struct anon_vma *, + struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, + struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); + unsigned long addr, int new_below); +extern int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *, + struct vm_area_struct *, unsigned long addr, int new_below); extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); +extern int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *, + struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, diff --git a/mm/mmap.c b/mm/mmap.c index 7e406416cf47..894017841d5d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1091,6 +1091,25 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, return res; } +struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, + struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) +{ + struct vm_area_struct *tmp; + + tmp = vma_merge(mm, prev, addr, end, vm_flags, anon_vma, file, pgoff, + policy, vm_userfaultfd_ctx, anon_name); + if (tmp) + vma_iter_set(vmi, end); + + return tmp; +} + /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. 
@@ -2276,6 +2295,18 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, validate_mm_mt(mm); return err; } +int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, int new_below) +{ + int ret; + unsigned long end = vma->vm_end; + + ret = __split_vma(mm, vma, addr, new_below); + if (!ret) + vma_iter_set(vmi, end); + + return ret; +} /* * Split a vma into two pieces at address 'addr', a new vma is allocated @@ -2290,6 +2321,19 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } +int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, int new_below) +{ + int ret; + unsigned long end = vma->vm_end; + + ret = split_vma(mm, vma, addr, new_below); + if (!ret) + vma_iter_set(vmi, end); + + return ret; +} + static inline int munmap_sidetree(struct vm_area_struct *vma, struct ma_state *mas_detach) { From d60beb1f698a429825ea2c463ee9e3dc3b1a79b7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:16 -0500 Subject: [PATCH 353/505] ipc/shm: use the vma iterator for munmap calls Pass through the vma iterator to do_vmi_munmap() to handle the iterator state internally Link: https://lkml.kernel.org/r/20230120162650.984577-16-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- ipc/shm.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index bd2fcc4d454e..1c6a6b319a49 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1786,8 +1786,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - mas_pause(&vmi.mas); + do_vmi_munmap(&vmi, mm, vma->vm_start, + vma->vm_end - vma->vm_start, NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1810,10 +1810,9 @@ long ksys_shmdt(char __user *shmaddr) /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) { - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - mas_pause(&vmi.mas); - } + (vma->vm_file == file)) + do_vmi_munmap(&vmi, mm, vma->vm_start, + vma->vm_end - vma->vm_start, NULL, false); vma = vma_next(&vmi); } From 27b267011296e35dd5c983bf6c53b7230c78f383 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 26 Jan 2023 16:20:49 -0500 Subject: [PATCH 354/505] ipc/shm: introduce new do_vma_munmap() to munmap The shm already has the vma iterator in position for a write. do_vmi_munmap() searches for the correct position and aligns the write, so it is not the right function to use in this case. The shm VMA tree modification is similar to the brk munmap situation, the vma iterator is in position and the VMA is already known. This patch generalizes the brk munmap function do_brk_munmap() to be used for any other callers with the vma iterator already in position to munmap a VMA. Link: https://lkml.kernel.org/r/20230126212049.980501-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reported-by: Sven Schnelle Link: https://lore.kernel.org/linux-mm/yt9dh6wec21a.fsf@linux.ibm.com/ Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ ipc/shm.c | 11 ++++++----- mm/mmap.c | 38 ++++++++++++++++++-------------------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 956025940053..5b5f26d6588a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2922,6 +2922,9 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t, extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU +extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf, bool downgrade); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) diff --git a/ipc/shm.c b/ipc/shm.c index 1c6a6b319a49..60e45e7045d4 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1786,8 +1786,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_vmi_munmap(&vmi, mm, vma->vm_start, - vma->vm_end - vma->vm_start, NULL, false); + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1810,9 +1810,10 @@ long ksys_shmdt(char __user *shmaddr) /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) - do_vmi_munmap(&vmi, mm, vma->vm_start, - vma->vm_end - vma->vm_start, NULL, false); + (vma->vm_file == file)) { + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); + } vma = vma_next(&vmi); } diff --git a/mm/mmap.c b/mm/mmap.c index 894017841d5d..408e9cc47333 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -180,9 +180,6 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_check(current->mm, current->mm->def_flags, len); } -static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long newbrk, unsigned long oldbrk, - struct list_head *uf); static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) @@ -236,7 +233,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * do_brk_munmap() may downgrade mmap_lock to read. + * do_vma_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; @@ -248,11 +245,11 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. - * do_brk_munmap() may downgrade the lock, so update it - * before calling do_brk_munmap(). + * do_vma_munmap() may downgrade the lock, so update it + * before calling do_vma_munmap(). */ mm->brk = brk; - ret = do_brk_munmap(&vmi, brkvma, newbrk, oldbrk, &uf); + ret = do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true); if (ret == 1) { downgraded = true; goto success; @@ -2951,26 +2948,27 @@ out: } /* - * brk_munmap() - Unmap a full or partial vma. 
- * @vmi: The vma iterator - * @vma: The vma to be modified - * @newbrk: the start of the address to unmap - * @oldbrk: The end of the address to unmap + * do_vma_munmap() - Unmap a full or partial vma. + * @vmi: The vma iterator pointing at the vma + * @vma: The first vma to be munmapped + * @start: the start of the address to unmap + * @end: The end of the address to unmap * @uf: The userfaultfd list_head + * @downgrade: Attempt to downgrade or not * - * Returns: 1 on success. - * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if - * possible. + * Returns: 0 on success and not downgraded, 1 on success and downgraded. + * unmaps a VMA mapping when the vma iterator is already in position. + * Does not handle alignment. */ -static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long newbrk, unsigned long oldbrk, - struct list_head *uf) +int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf, bool downgrade) { struct mm_struct *mm = vma->vm_mm; int ret; - arch_unmap(mm, newbrk, oldbrk); - ret = do_vmi_align_munmap(vmi, vma, mm, newbrk, oldbrk, uf, true); + arch_unmap(mm, start, end); + ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); validate_mm_mt(mm); return ret; } From 11a9b90274f6a50f7877a61c8e82dd3c845ff1dd Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:17 -0500 Subject: [PATCH 355/505] userfaultfd: use vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-17-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 89 ++++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 55 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 15a5bf765d43..4334bd35984d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -883,7 +883,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); WRITE_ONCE(ctx->released, true); @@ -900,7 +900,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -909,13 +909,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + prev = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { - mas_pause(&mas); vma = prev; } else { prev = vma; @@ -1302,7 +1301,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1344,15 +1343,11 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (!mmget_not_zero(mm)) goto out; - mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); - if 
(!vma) - goto out_unlock; - - /* check that there's at least one vma in the range */ ret = -EINVAL; - if (vma->vm_start >= end) + mmap_write_lock(mm); + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) goto out_unlock; /* @@ -1371,7 +1366,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1428,16 +1424,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, basic_ioctls = true; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vm_flags)); @@ -1458,30 +1452,25 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(&vmi, mm, vma, start, 1); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = vmi_split_vma(&vmi, mm, vma, end, 0); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } next: /* @@ -1498,8 +1487,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1543,7 +1532,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1562,14 +1551,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); - if (!vma) - goto out_unlock; - - /* check that there's at least one vma in the range */ ret = -EINVAL; - if (vma->vm_start >= end) + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) goto out_unlock; /* @@ -1587,8 +1572,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * Search for not compatible vmas. 
*/ found = false; - ret = -EINVAL; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1605,16 +1590,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out_unlock; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); - + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); @@ -1650,26 +1632,23 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(mm, vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; - mas_pause(&mas); goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(&vmi, mm, vma, start, 1); if (ret) break; - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = vmi_split_vma(&vmi, mm, vma, end, 0); if (ret) break; - mas_pause(&mas); } next: /* @@ -1683,8 +1662,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); From 2286a6914c776ec34cd97e4573b1466d055cb9de Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:18 -0500 Subject: [PATCH 356/505] mm: change mprotect_fixup to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 5 ++++- include/linux/mm.h | 6 +++--- mm/mprotect.c | 47 ++++++++++++++++++++++------------------------ 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index ab913243a367..b98647eeae9f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; + struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -812,8 +813,10 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; + vma_iter_init(&vmi, mm, vma->vm_start); + tlb_gather_mmu(&tlb, mm); - ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, + ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b5f26d6588a..144ddfd65992 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2197,9 +2197,9 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); -extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags); +extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. diff --git a/mm/mprotect.c b/mm/mprotect.c index 6a22f3ad9b84..39b6335b8813 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -585,9 +585,9 @@ static const struct mm_walk_ops prot_none_walk_ops = { }; int -mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags) +mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) { struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; @@ -642,7 +642,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, * First try to merge with previous and/or next vma. 
*/ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(mm, *pprev, start, end, newflags, + *pprev = vmi_vma_merge(vmi, mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { @@ -654,13 +654,13 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, *pprev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); + error = vmi_split_vma(vmi, mm, vma, start, 1); if (error) goto fail; } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); + error = vmi_split_vma(vmi, mm, vma, end, 0); if (error) goto fail; } @@ -709,7 +709,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); struct mmu_gather tlb; - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + struct vma_iterator vmi; start = untagged_addr(start); @@ -741,8 +741,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, current->mm, start); + vma = vma_find(&vmi, end); error = -ENOMEM; if (!vma) goto out; @@ -765,18 +765,22 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } } + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - else - prev = mas_prev(&mas, 0); tlb_gather_mmu(&tlb, current->mm); - for (nstart = start ; ; ) { + nstart = start; + tmp = vma->vm_start; + for_each_vma_range(vmi, vma, end) { unsigned long mask_off_old_flags; unsigned long newflags; int new_vma_pkey; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + if (vma->vm_start != tmp) { + error = -ENOMEM; + break; + } /* Does the application expect PROT_READ to imply PROT_EXEC */ if (rier && (vma->vm_flags & VM_MAYEXEC)) @@ -824,25 +828,18 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } - error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags); + error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags); if (error) break; nstart = tmp; - - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - break; - - vma = find_vma(current->mm, prev->vm_end); - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - break; - } prot = reqprot; } tlb_finish_mmu(&tlb); + + if (vma_iter_end(&vmi) < end) + error = -ENOMEM; + out: mmap_write_unlock(current->mm); return error; From 37598f5a9d8b63b91cce0cb6bac5f6374ed1bb80 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:19 -0500 Subject: [PATCH 357/505] mlock: convert mlock to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-19-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mlock.c | 57 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b680f11879c3..0d09b9070071 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -401,8 +401,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, * * For vmas that pass the filters, merge/split as appropriate. 
*/ -static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, vm_flags_t newflags) +static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; @@ -417,22 +418,22 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + *prev = vmi_vma_merge(vmi, mm, *prev, start, end, newflags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { vma = *prev; goto success; } if (start != vma->vm_start) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(vmi, mm, vma, start, 1); if (ret) goto out; } if (end != vma->vm_end) { - ret = split_vma(mm, vma, end, 0); + ret = vmi_split_vma(vmi, mm, vma, end, 0); if (ret) goto out; } @@ -471,7 +472,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; int error; - MA_STATE(mas, ¤t->mm->mm_mt, start, start); + VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -480,39 +481,37 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return -EINVAL; if (end == start) return 0; - vma = mas_walk(&mas); + vma = vma_iter_load(&vmi); if (!vma) return -ENOMEM; + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - else - prev = mas_prev(&mas, 0); - for (nstart = start ; ; ) { - vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + nstart = start; + tmp = vma->vm_start; + for_each_vma_range(vmi, vma, end) { + vm_flags_t newflags; + if (vma->vm_start != tmp) + return -ENOMEM; + + newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; newflags |= flags; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; - error = mlock_fixup(vma, &prev, nstart, tmp, newflags); + error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) break; nstart = tmp; - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - break; - - vma = find_vma(prev->vm_mm, prev->vm_end); - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - break; - } } + + if (vma_iter_end(&vmi) < end) + return -ENOMEM; + return error; } @@ -658,7 +657,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; @@ -679,15 +678,15 @@ static int apply_mlockall_flags(int flags) to_add |= VM_LOCKONFAULT; } - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { vm_flags_t newflags; newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; newflags |= to_add; /* Ignore errors */ - mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); - mas_pause(&mas); + mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, + newflags); cond_resched(); } out: From e552cdb853dab085d30d54815e044aa4836a6dc6 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 20 Jan 2023 11:26:20 -0500 Subject: [PATCH 358/505] coredump: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-20-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/coredump.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index de78bde2991b..f27d734f3102 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1111,14 +1111,14 @@ whole: * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. */ -static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, +static struct vm_area_struct *coredump_next_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { if (gate_vma && (vma == gate_vma)) return NULL; - vma = mas_next(mas, ULONG_MAX); + vma = vma_next(vmi); if (vma) return vma; return gate_vma; @@ -1146,7 +1146,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); int i = 0; /* @@ -1167,7 +1167,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { + while ((vma = coredump_next_vma(&vmi, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; From f10c2abcdac4a44795fae9118eaedfe56204afda Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:21 -0500 Subject: [PATCH 359/505] mempolicy: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/mempolicy.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72142fbe7652..ed68bdf980d3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -787,24 +787,21 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - prev = mas_prev(&mas, 0); - if (unlikely(!prev)) - mas_set(&mas, start); - - vma = mas_find(&mas, end - 1); + prev = vma_prev(&vmi); + vma = vma_find(&vmi, end); if (WARN_ON(!vma)) return 0; if (start > vma->vm_start) prev = vma; - for (; vma; vma = mas_next(&mas, end - 1)) { + do { unsigned long vmstart = max(start, vma->vm_start); unsigned long vmend = min(end, vma->vm_end); @@ -813,29 +810,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + prev = vmi_vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { - /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto replace; } if (vma->vm_start != vmstart) { - err = split_vma(vma->vm_mm, vma, vmstart, 1); + err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmstart, 1); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end != vmend) { - err = split_vma(vma->vm_mm, vma, vmend, 0); + err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmend, 0); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } replace: err = vma_replace_policy(vma, new_pol); @@ -843,7 +834,7 @@ replace: goto out; next: prev = vma; - } + } for_each_vma_range(vmi, vma, end); out: return err; From 250cb40f0afee232d8573d7b1d8bc56d4b92f63e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:22 -0500 Subject: [PATCH 360/505] task_mmu: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Update the comments to how the vma iterator works. The vma iterator will keep track of the last vm_end and start the search from vm_end + 1. Link: https://lkml.kernel.org/r/20230120162650.984577-22-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a44339a77a75..a944e1816364 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -890,7 +890,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) @@ -908,7 +908,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); if (unlikely(!vma)) goto empty_set; @@ -923,7 +923,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. 
*/ if (mmap_lock_is_contended(mm)) { - mas_pause(&mas); + vma_iter_invalidate(&vmi); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -948,31 +948,31 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * * 1) VMA2 is freed, but VMA3 exists: * - * find_vma(mm, 16k - 1) will return VMA3. + * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 2) VMA2 still exists: * - * find_vma(mm, 16k - 1) will return VMA2. - * Iterate the loop like the original one. + * vma_next(vmi) will return VMA3. + * In this case, just continue from VMA3. * * 3) No more VMAs can be found: * - * find_vma(mm, 16k - 1) will return NULL. + * vma_next(vmi) will return NULL. * No more things to do, just break. * * 4) (last_vma_end - 1) is the middle of a vma (VMA'): * - * find_vma(mm, 16k - 1) will return VMA' whose range + * vma_next(vmi) will return VMA' whose range * contains last_vma_end. * Iterate VMA' from last_vma_end. */ - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break; - /* Case 1 above */ + /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) continue; @@ -980,8 +980,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) if (vma->vm_end > last_vma_end) smap_gather_stats(vma, &mss, last_vma_end); } - /* Case 2 above */ - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + } for_each_vma(vmi, vma); empty_set: show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); @@ -1277,7 +1276,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1297,7 +1296,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vma->vm_flags &= ~VM_SOFTDIRTY; From 214dbc4281374cbbd833edd502d0ed1fd1b0e243 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:23 -0500 Subject: [PATCH 361/505] sched: convert to vma iterator Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-23-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c36aa54ae071..9c9950249d7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2938,11 +2938,11 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; - MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; long pages, virtpages; + struct vma_iterator vmi; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -2995,16 +2995,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, mm, start); + vma = vma_next(&vmi); if (!vma) { reset_ptenuma_scan(p); start = 0; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + vma = vma_next(&vmi); } - for (; vma; vma = mas_find(&mas, ULONG_MAX)) { + do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; @@ -3051,7 +3051,7 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); - } + } for_each_vma(vmi, vma); out: /* From 178e22ac2078b1a7d284c7e3b4c3fbdb8e85ae99 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:24 -0500 Subject: [PATCH 362/505] madvise: use vmi iterator for __split_vma() and vma_merge() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-24-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/madvise.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 92a3c6bd84c1..4d4471916465 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -142,6 +142,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; + VMA_ITERATOR(vmi, mm, 0); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; @@ -149,8 +150,8 @@ static int madvise_update_vma(struct vm_area_struct *vma, } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma), + *prev = vmi_vma_merge(&vmi, mm, *prev, start, end, new_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_name); if (*prev) { vma = *prev; @@ -162,7 +163,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (start != vma->vm_start) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = __split_vma(mm, vma, start, 1); + error = vmi__split_vma(&vmi, mm, vma, start, 1); if (error) return error; } @@ -170,7 +171,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (end != vma->vm_end) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = __split_vma(mm, vma, end, 0); + error = vmi__split_vma(&vmi, mm, vma, end, 0); if (error) return error; } From 0c0c5bffd0a24637f1601ce15937ae38e572069c Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 20 Jan 2023 11:26:25 -0500 Subject: [PATCH 363/505] mmap: pass through vmi iterator to __split_vma() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-25-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 408e9cc47333..2b588b831ead 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2388,7 +2388,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - error = __split_vma(mm, vma, start, 0); + error = vmi__split_vma(vmi, mm, vma, start, 0); if (error) goto start_split_failed; @@ -2409,7 +2409,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (next->vm_end > end) { struct vm_area_struct *split; - error = __split_vma(mm, next, end, 1); + error = vmi__split_vma(vmi, mm, next, end, 1); if (error) goto end_split_failed; From 076f16bf7698fae4b27030238998474a21d2233c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:26 -0500 Subject: [PATCH 364/505] mmap: use vmi version of vma_merge() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-26-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 2b588b831ead..8806bfbaa505 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2690,8 +2690,9 @@ cannot_expand: * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + merge = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, + vma->vm_end, vma->vm_flags, NULL, vma->vm_file, + vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3232,6 +3233,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; bool faulted_in_anon_vma = true; + VMA_ITERATOR(vmi, mm, addr); validate_mm_mt(mm); /* @@ -3247,7 +3249,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, + new_vma = vmi_vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { From a27a11f92fe2bb1c02c8bba0a8315f9b7ad3b396 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:27 -0500 Subject: [PATCH 365/505] mm/mremap: use vmi version of vma_merge() Use the vma iterator so that the iterator can be invalidated or updated to avoid each caller doing so. Link: https://lkml.kernel.org/r/20230120162650.984577-27-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/mremap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 3cc64c3f8bdb..f161516ab3c1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1018,6 +1018,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long extension_end = addr + new_len; pgoff_t extension_pgoff = vma->vm_pgoff + ((extension_start - vma->vm_start) >> PAGE_SHIFT); + VMA_ITERATOR(vmi, mm, extension_start); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1042,10 +1043,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * when a vma would be actually removed due to a merge. */ if (!vma->vm_ops || !vma->vm_ops->close) { - vma = vma_merge(mm, vma, extension_start, extension_end, - vma->vm_flags, vma->anon_vma, vma->vm_file, - extension_pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma = vmi_vma_merge(&vmi, mm, vma, + extension_start, extension_end, + vma->vm_flags, vma->anon_vma, + vma->vm_file, extension_pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); } else if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; From 47d9644de92c1aa9dcd791203397b161c67096ca Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:28 -0500 Subject: [PATCH 366/505] nommu: convert nommu to using the vma iterator Gain type safety in nommu by using the vma_iterator and not the maple tree directly. Link: https://lkml.kernel.org/r/20230120162650.984577-28-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/nommu.c | 79 +++++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 0481922fe66e..7a52a7c37009 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -544,19 +544,6 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) -{ - mas_set_range(mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(mas, vma); -} - -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) -{ - mas->index = vma->vm_start; - mas->last = vma->vm_end - 1; - mas_store_prealloc(mas, NULL); -} - static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) { vma->vm_mm = mm; @@ -574,13 +561,13 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) } /* - * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm(). - * @mas: The maple state with preallocations. + * vmi_add_vma_to_mm() - VMA Iterator variant of add_vmi_to_mm(). 
+ * @vmi: The VMA iterator * @mm: The mm_struct * @vma: The vma to add * */ -static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, +static void vmi_add_vma_to_mm(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *vma) { BUG_ON(!vma->vm_region); @@ -589,7 +576,7 @@ static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, mm->map_count++; /* add the VMA to the tree */ - vma_mas_store(vma, mas); + vma_iter_store(vmi, vma); } /* @@ -600,14 +587,14 @@ static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, */ static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + VMA_ITERATOR(vmi, mm, vma->vm_start); - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; } - mas_add_vma_to_mm(&mas, mm, vma); + vmi_add_vma_to_mm(&vmi, mm, vma); return 0; } @@ -626,14 +613,15 @@ static void cleanup_vma_from_mm(struct vm_area_struct *vma) i_mmap_unlock_write(mapping); } } + /* * delete a VMA from its owning mm_struct and address space */ static int delete_vma_from_mm(struct vm_area_struct *vma) { - MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -641,10 +629,9 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) cleanup_vma_from_mm(vma); /* remove from the MM's tree and list */ - vma_mas_remove(vma, &mas); + vma_iter_clear(&vmi, vma->vm_start, vma->vm_end); return 0; } - /* * destroy a VMA record */ @@ -675,9 +662,9 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - return mas_walk(&mas); + return vma_iter_load(&vmi); } EXPORT_SYMBOL(find_vma); @@ -709,9 +696,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - vma = mas_walk(&mas); + vma = vma_iter_load(&vmi); if (!vma) return NULL; if (vma->vm_start != addr) @@ -1062,7 +1049,7 @@ unsigned long do_mmap(struct file *file, vm_flags_t vm_flags; unsigned long capabilities, result; int ret; - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); *populate = 0; @@ -1091,8 +1078,8 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (mas_preallocate(&mas, GFP_KERNEL)) - goto error_maple_preallocate; + if (vma_iter_prealloc(&vmi)) + goto error_vma_iter_prealloc; region->vm_usage = 1; region->vm_flags = vm_flags; @@ -1234,7 +1221,7 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - mas_add_vma_to_mm(&mas, current->mm, vma); + vmi_add_vma_to_mm(&vmi, current->mm, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1250,7 +1237,7 @@ share: error_just_free: up_write(&nommu_region_sem); error: - mas_destroy(&mas); + vma_iter_free(&vmi); if (region->vm_file) fput(region->vm_file); kmem_cache_free(vm_region_jar, region); @@ -1278,7 +1265,7 @@ error_getting_region: show_free_areas(0, NULL); return -ENOMEM; -error_maple_preallocate: +error_vma_iter_prealloc: 
kmem_cache_free(vm_region_jar, region); vm_area_free(vma); pr_warn("Allocation of vma tree for process %d failed\n", current->pid); @@ -1344,20 +1331,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) +int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; unsigned long npages; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) return -ENOMEM; - mm = vma->vm_mm; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1369,10 +1354,10 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) goto err_vma_dup; - if (mas_preallocate(&mas, GFP_KERNEL)) { + if (vma_iter_prealloc(vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); - goto err_mas_preallocate; + goto err_vmi_preallocate; } /* most fields are the same, copy all, and then fixup */ @@ -1406,13 +1391,11 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, setup_vma_to_mm(vma, mm); setup_vma_to_mm(new, mm); - mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); - mas_store(&mas, vma); - vma_mas_store(new, &mas); + vma_iter_store(vmi, new); mm->map_count++; return 0; -err_mas_preallocate: +err_vmi_preallocate: vm_area_free(new); err_vma_dup: kmem_cache_free(vm_region_jar, region); @@ -1466,7 +1449,7 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *vma; unsigned long end; int ret = 0; @@ -1478,7 +1461,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - vma = mas_find(&mas, end - 1); + vma = vma_find(&vmi, end); if (!vma) { static int limit; if (limit < 5) { @@ -1497,7 +1480,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = mas_next(&mas, end - 1); + vma = vma_find(&vmi, end); } while (vma); return -EINVAL; } else { @@ -1511,7 +1494,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = split_vma(mm, vma, start, 1); + ret = vmi_split_vma(&vmi, mm, vma, start, 1); if (ret < 0) return ret; } From 07f1bc5ad7983356ca79c65b22148dc5700a24e5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:29 -0500 Subject: [PATCH 367/505] nommu: pass through vma iterator to shrink_vma() Rename the function to vmi_shrink_vma() indicate it takes the vma iterator. Use the iterator to preallocate and drop the delete function. The maple tree is able to do the modification easier than the linked list and rbtree, so just clear the necessary area in the tree. add_vma_to_mm() is no longer used, so drop this function. vmi_add_vma_to_mm() is now only used once, so inline this function into do_mmap(). 
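As a rough illustration of the preallocate-then-clear pattern adopted here, a minimal sketch (the helper name is made up for the example; error handling, locking, and the nommu vm_region bookkeeping are simplified relative to the hunks below):

/*
 * Sketch only: trim the tail of a VMA with the vma iterator.
 * Assumes mmap_lock is held for writing and @from lies inside the VMA;
 * a real caller must also shrink the backing vm_region on nommu.
 */
static int sketch_shrink_tail(struct vma_iterator *vmi,
			      struct vm_area_struct *vma, unsigned long from)
{
	/* Reserve maple tree nodes up front so the clear below cannot fail. */
	if (vma_iter_prealloc(vmi))
		return -ENOMEM;

	/* Wipe [from, vm_end) out of the tree, then fix up the VMA itself. */
	vma_iter_clear(vmi, from, vma->vm_end);
	vma->vm_end = from;
	return 0;
}
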
Link: https://lkml.kernel.org/r/20230120162650.984577-29-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/nommu.c | 63 +++++++++++++++--------------------------------------- 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 7a52a7c37009..9ddeb92600d6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -560,44 +560,6 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) } } -/* - * vmi_add_vma_to_mm() - VMA Iterator variant of add_vmi_to_mm(). - * @vmi: The VMA iterator - * @mm: The mm_struct - * @vma: The vma to add - * - */ -static void vmi_add_vma_to_mm(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma) -{ - BUG_ON(!vma->vm_region); - - setup_vma_to_mm(vma, mm); - mm->map_count++; - - /* add the VMA to the tree */ - vma_iter_store(vmi, vma); -} - -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_lock held writelocked - */ -static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) -{ - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (vma_iter_prealloc(&vmi)) { - pr_warn("Allocation of vma tree for process %d failed\n", - current->pid); - return -ENOMEM; - } - vmi_add_vma_to_mm(&vmi, mm, vma); - return 0; -} - static void cleanup_vma_from_mm(struct vm_area_struct *vma) { vma->vm_mm->map_count--; @@ -1221,7 +1183,11 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - vmi_add_vma_to_mm(&vmi, current->mm, vma); + BUG_ON(!vma->vm_region); + setup_vma_to_mm(vma, current->mm); + current->mm->map_count++; + /* add the VMA to the tree */ + vma_iter_store(&vmi, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1406,7 +1372,7 @@ err_vma_dup: * shrink a VMA by removing the specified chunk from either the beginning or * the end */ -static int shrink_vma(struct mm_struct *mm, +static int vmi_shrink_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long from, unsigned long to) { @@ -1414,14 +1380,19 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - if (delete_vma_from_mm(vma)) + if (vma_iter_prealloc(vmi)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); return -ENOMEM; - if (from > vma->vm_start) + } + + if (from > vma->vm_start) { + vma_iter_clear(vmi, from, vma->vm_end); vma->vm_end = from; - else + } else { + vma_iter_clear(vmi, vma->vm_start, to); vma->vm_start = to; - if (add_vma_to_mm(mm, vma)) - return -ENOMEM; + } /* cut the backing region down to size */ region = vma->vm_region; @@ -1498,7 +1469,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (ret < 0) return ret; } - return shrink_vma(mm, vma, start, end); + return vmi_shrink_vma(&vmi, vma, start, end); } erase_whole_vma: From 9760ebffbf5507320e0de41f5b80089bdef996a0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:30 -0500 Subject: [PATCH 368/505] mm: switch vma_merge(), split_vma(), and __split_vma to vma iterator Drop the vmi_* functions and transition all users to use the vma iterator directly. Link: https://lkml.kernel.org/r/20230120162650.984577-30-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 14 ++++---- include/linux/mm.h | 18 +++-------- mm/madvise.c | 6 ++-- mm/mempolicy.c | 6 ++-- mm/mlock.c | 6 ++-- mm/mmap.c | 79 +++++++++++++--------------------------------- mm/mprotect.c | 6 ++-- mm/mremap.c | 10 +++--- mm/nommu.c | 8 +++-- 9 files changed, 55 insertions(+), 98 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4334bd35984d..f3c75c6222de 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -909,7 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, + prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1452,7 +1452,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), @@ -1463,12 +1463,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto next; } if (vma->vm_start < start) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; } if (vma->vm_end > end) { - ret = vmi_split_vma(&vmi, mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; } @@ -1632,7 +1632,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(mm, vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vmi_vma_merge(&vmi, mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); @@ -1641,12 +1641,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto next; } if (vma->vm_start < start) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; } if (vma->vm_end > end) { - ret = vmi_split_vma(&vmi, mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 144ddfd65992..f3b49feb5c35 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2839,24 +2839,16 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, { return __vma_adjust(vma, start, end, pgoff, insert, NULL); } -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); -extern struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, +extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int vmi__split_vma(struct vma_iterator 
*vmi, struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); -extern int split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); +extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, diff --git a/mm/madvise.c b/mm/madvise.c index 4d4471916465..02b317726c9a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -150,7 +150,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vmi_vma_merge(&vmi, mm, *prev, start, end, new_flags, + *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_name); if (*prev) { @@ -163,7 +163,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (start != vma->vm_start) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = vmi__split_vma(&vmi, mm, vma, start, 1); + error = __split_vma(&vmi, vma, start, 1); if (error) return error; } @@ -171,7 +171,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, if (end != vma->vm_end) { if (unlikely(mm->map_count >= sysctl_max_map_count)) return -ENOMEM; - error = vmi__split_vma(&vmi, mm, vma, end, 0); + error = __split_vma(&vmi, vma, end, 0); if (error) return error; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ed68bdf980d3..dd5ca942256f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -810,7 +810,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vmi_vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, + prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -819,12 +819,12 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, goto replace; } if (vma->vm_start != vmstart) { - err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmstart, 1); + err = split_vma(&vmi, vma, vmstart, 1); if (err) goto out; } if (vma->vm_end != vmend) { - err = vmi_split_vma(&vmi, vma->vm_mm, vma, vmend, 0); + err = split_vma(&vmi, vma, vmend, 0); if (err) goto out; } diff --git a/mm/mlock.c b/mm/mlock.c index 0d09b9070071..0336f52e03d7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -418,7 +418,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vmi_vma_merge(vmi, mm, *prev, start, end, newflags, + *prev = vma_merge(vmi, mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { @@ -427,13 +427,13 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, } if (start != vma->vm_start) { - ret = vmi_split_vma(vmi, mm, vma, start, 1); + ret = split_vma(vmi, vma, start, 1); if (ret) goto out; } if (end != vma->vm_end) { - ret = vmi_split_vma(vmi, mm, vma, 
end, 0); + ret = split_vma(vmi, vma, end, 0); if (ret) goto out; } diff --git a/mm/mmap.c b/mm/mmap.c index 8806bfbaa505..afc65f122f7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1010,7 +1010,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * parameter) may establish ptes with the wrong permissions of NNNN * instead of the right permissions of XXXX. */ -struct vm_area_struct *vma_merge(struct mm_struct *mm, +struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, @@ -1019,7 +1019,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *mid, *next, *res; + struct vm_area_struct *mid, *next, *res = NULL; int err = -1; bool merge_prev = false; bool merge_next = false; @@ -1085,26 +1085,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (err) return NULL; khugepaged_enter_vma(res, vm_flags); - return res; -} -struct vm_area_struct *vmi_vma_merge(struct vma_iterator *vmi, - struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - struct vm_area_struct *tmp; - - tmp = vma_merge(mm, prev, addr, end, vm_flags, anon_vma, file, pgoff, - policy, vm_userfaultfd_ctx, anon_name); - if (tmp) + if (res) vma_iter_set(vmi, end); - return tmp; + return res; } /* @@ -2228,12 +2213,14 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. */ -int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; int err; - validate_mm_mt(mm); + unsigned long end = vma->vm_end; + + validate_mm_mt(vma->vm_mm); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); @@ -2273,8 +2260,10 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); /* Success. */ - if (!err) + if (!err) { + vma_iter_set(vmi, end); return 0; + } /* Avoid vm accounting in close() operation */ new->vm_start = new->vm_end; @@ -2289,46 +2278,21 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: vm_area_free(new); - validate_mm_mt(mm); + validate_mm_mt(vma->vm_mm); return err; } -int vmi__split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) -{ - int ret; - unsigned long end = vma->vm_end; - - ret = __split_vma(mm, vma, addr, new_below); - if (!ret) - vma_iter_set(vmi, end); - - return ret; -} /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. 
*/ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (mm->map_count >= sysctl_max_map_count) + if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; - return __split_vma(mm, vma, addr, new_below); -} - -int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) -{ - int ret; - unsigned long end = vma->vm_end; - - ret = split_vma(mm, vma, addr, new_below); - if (!ret) - vma_iter_set(vmi, end); - - return ret; + return __split_vma(vmi, vma, addr, new_below); } static inline int munmap_sidetree(struct vm_area_struct *vma, @@ -2388,7 +2352,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - error = vmi__split_vma(vmi, mm, vma, start, 0); + error = __split_vma(vmi, vma, start, 0); if (error) goto start_split_failed; @@ -2409,7 +2373,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (next->vm_end > end) { struct vm_area_struct *split; - error = vmi__split_vma(vmi, mm, next, end, 1); + error = __split_vma(vmi, next, end, 1); if (error) goto end_split_failed; @@ -2690,9 +2654,10 @@ cannot_expand: * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vmi_vma_merge(&vmi, mm, prev, vma->vm_start, - vma->vm_end, vma->vm_flags, NULL, vma->vm_file, - vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + merge = vma_merge(&vmi, mm, prev, vma->vm_start, + vma->vm_end, vma->vm_flags, NULL, + vma->vm_file, vma->vm_pgoff, NULL, + NULL_VM_UFFD_CTX, NULL); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3249,7 +3214,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vmi_vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, + new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 39b6335b8813..cce6a0e58fb5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -642,7 +642,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * First try to merge with previous and/or next vma. */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vmi_vma_merge(vmi, mm, *pprev, start, end, newflags, + *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { @@ -654,13 +654,13 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, *pprev = vma; if (start != vma->vm_start) { - error = vmi_split_vma(vmi, mm, vma, start, 1); + error = split_vma(vmi, vma, start, 1); if (error) goto fail; } if (end != vma->vm_end) { - error = vmi_split_vma(vmi, mm, vma, end, 0); + error = split_vma(vmi, vma, end, 0); if (error) goto fail; } diff --git a/mm/mremap.c b/mm/mremap.c index f161516ab3c1..71ba8eddd836 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1043,12 +1043,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * when a vma would be actually removed due to a merge. 
*/ if (!vma->vm_ops || !vma->vm_ops->close) { - vma = vmi_vma_merge(&vmi, mm, vma, - extension_start, extension_end, - vma->vm_flags, vma->anon_vma, - vma->vm_file, extension_pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); + vma = vma_merge(&vmi, mm, vma, extension_start, + extension_end, vma->vm_flags, vma->anon_vma, + vma->vm_file, extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } else if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; diff --git a/mm/nommu.c b/mm/nommu.c index 9ddeb92600d6..9a166738909e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1297,18 +1297,20 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int vmi_split_vma(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, int new_below) +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + struct mm_struct *mm; /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) return -ENOMEM; + mm = vma->vm_mm; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1465,7 +1467,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = vmi_split_vma(&vmi, mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret < 0) return ret; } From 34403fa579514a6de378f06f79239821c92305bf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:31 -0500 Subject: [PATCH 369/505] mm/damon/vaddr-test.h: stop using vma_mas_store() for maple tree store Prepare for the removal of the vma_mas_store() function by open coding the maple tree store in this test code. Set the range of the maple state and call the store function directly. Link: https://lkml.kernel.org/r/20230120162650.984577-31-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reported-by: kernel test robot Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr-test.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index bce37c487540..c4b455b5ee30 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -14,19 +14,26 @@ #include -static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, +static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, ssize_t nr_vmas) { - int i; + int i, ret = -ENOMEM; MA_STATE(mas, mt, 0, 0); if (!nr_vmas) - return; + return 0; mas_lock(&mas); - for (i = 0; i < nr_vmas; i++) - vma_mas_store(&vmas[i], &mas); + for (i = 0; i < nr_vmas; i++) { + mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1); + if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) + goto failed; + } + + ret = 0; +failed: mas_unlock(&mas); + return ret; } /* @@ -71,7 +78,8 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) }; mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); - __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)); + if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) + kunit_skip(test, "Failed to create VMA tree"); __damon_va_three_regions(&mm, regions); From fbcc3104b8437cc1babf04421e8bb8181561343e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:32 -0500 Subject: [PATCH 370/505] mmap: convert __vma_adjust() to use vma iterator Use the vma iterator internally for __vma_adjust(). Avoid using the maple tree interface directly for type safety. Link: https://lkml.kernel.org/r/20230120162650.984577-32-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 -- mm/mmap.c | 75 ++++++++-------------------------------------- 2 files changed, 13 insertions(+), 65 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f3b49feb5c35..2f62d687e9bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2856,9 +2856,6 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); - static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, diff --git a/mm/mmap.c b/mm/mmap.c index afc65f122f7d..07ba54c34bd0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -432,56 +432,6 @@ static void __vma_link_file(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } -/* - * vma_mas_store() - Store a VMA in the maple tree. - * @vma: The vm_area_struct - * @mas: The maple state - * - * Efficient way to store a VMA in the maple tree when the @mas has already - * walked to the correct location. - * - * Note: the end address is inclusive in the maple tree. - */ -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) -{ - trace_vma_store(mas->tree, vma); - mas_set_range(mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(mas, vma); -} - -/* - * vma_mas_remove() - Remove a VMA from the maple tree. - * @vma: The vm_area_struct - * @mas: The maple state - * - * Efficient way to remove a VMA from the maple tree when the @mas has already - * been established and points to the correct location. - * Note: the end address is inclusive in the maple tree. 
- */ -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) -{ - trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1); - mas->index = vma->vm_start; - mas->last = vma->vm_end - 1; - mas_store_prealloc(mas, NULL); -} - -/* - * vma_mas_szero() - Set a given range to zero. Used when modifying a - * vm_area_struct start or end. - * - * @mas: The maple tree ma_state - * @start: The start address to zero - * @end: The end address to zero. - */ -static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, - unsigned long end) -{ - trace_vma_mas_szero(mas->tree, start, end - 1); - mas_set_range(mas, start, end - 1); - mas_store_prealloc(mas, NULL); -} - static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); @@ -641,7 +591,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, bool vma_changed = false; long adjust_next = 0; int remove_next = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; if (next && !insert) { @@ -726,7 +676,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (mas_preallocate(&mas, GFP_KERNEL)) + if (vma_iter_prealloc(&vmi)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -772,7 +722,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (start != vma->vm_start) { if ((vma->vm_start < start) && (!insert || (insert->vm_end != start))) { - vma_mas_szero(&mas, vma->vm_start, start); + vma_iter_clear(&vmi, vma->vm_start, start); VM_WARN_ON(insert && insert->vm_start > vma->vm_start); } else { vma_changed = true; @@ -782,8 +732,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (end != vma->vm_end) { if (vma->vm_end > end) { if (!insert || (insert->vm_start != end)) { - vma_mas_szero(&mas, end, vma->vm_end); - mas_reset(&mas); + vma_iter_clear(&vmi, end, vma->vm_end); + vma_iter_set(&vmi, vma->vm_end); VM_WARN_ON(insert && insert->vm_end < vma->vm_end); } @@ -794,13 +744,13 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (vma_changed) - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_mas_store(next, &mas); + vma_iter_store(&vmi, next); } if (file) { @@ -820,8 +770,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - mas_reset(&mas); - vma_mas_store(insert, &mas); + vma_iter_store(&vmi, insert); mm->map_count++; } @@ -867,7 +816,7 @@ again: if (insert && file) uprobe_mmap(insert); - mas_destroy(&mas); + vma_iter_free(&vmi); validate_mm(mm); return 0; @@ -1999,7 +1948,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ - vma_mas_store(vma, &mas); + mas_set_range(&mas, vma->vm_start, address - 1); + mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2081,7 +2031,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. 
*/ - vma_mas_store(vma, &mas); + mas_set_range(&mas, address, vma->vm_end - 1); + mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); From 9e56044625a1f472edc278105f41a60726991d89 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:33 -0500 Subject: [PATCH 371/505] mm: pass through vma iterator to __vma_adjust() Pass the vma iterator through to __vma_adjust() so the state can be updated. Link: https://lkml.kernel.org/r/20230120162650.984577-33-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- mm/mmap.c | 31 +++++++++++++++---------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2f62d687e9bd..9c15f401f295 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2831,13 +2831,15 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, +extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, struct vm_area_struct *expand); static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) { - return __vma_adjust(vma, start, end, pgoff, insert, NULL); + VMA_ITERATOR(vmi, vma->vm_mm, start); + + return __vma_adjust(&vmi, vma, start, end, pgoff, insert, NULL); } extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index 07ba54c34bd0..330de1ab6a8d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -576,9 +576,9 @@ nomem: * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. 
*/ -int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand) +int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *insert, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next_next = NULL; /* uninit var warning */ @@ -591,7 +591,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, bool vma_changed = false; long adjust_next = 0; int remove_next = 0; - VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; if (next && !insert) { @@ -676,7 +675,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } } - if (vma_iter_prealloc(&vmi)) + if (vma_iter_prealloc(vmi)) return -ENOMEM; vma_adjust_trans_huge(orig_vma, start, end, adjust_next); @@ -722,7 +721,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (start != vma->vm_start) { if ((vma->vm_start < start) && (!insert || (insert->vm_end != start))) { - vma_iter_clear(&vmi, vma->vm_start, start); + vma_iter_clear(vmi, vma->vm_start, start); VM_WARN_ON(insert && insert->vm_start > vma->vm_start); } else { vma_changed = true; @@ -732,8 +731,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (end != vma->vm_end) { if (vma->vm_end > end) { if (!insert || (insert->vm_start != end)) { - vma_iter_clear(&vmi, end, vma->vm_end); - vma_iter_set(&vmi, vma->vm_end); + vma_iter_clear(vmi, end, vma->vm_end); + vma_iter_set(vmi, vma->vm_end); VM_WARN_ON(insert && insert->vm_end < vma->vm_end); } @@ -744,13 +743,13 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (vma_changed) - vma_iter_store(&vmi, vma); + vma_iter_store(vmi, vma); vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_iter_store(&vmi, next); + vma_iter_store(vmi, next); } if (file) { @@ -770,7 +769,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - vma_iter_store(&vmi, insert); + vma_iter_store(vmi, insert); mm->map_count++; } @@ -816,7 +815,7 @@ again: if (insert && file) uprobe_mmap(insert); - vma_iter_free(&vmi); + vma_iter_free(vmi); validate_mm(mm); return 0; @@ -1010,20 +1009,20 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (merge_prev && merge_next && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ - err = __vma_adjust(prev, prev->vm_start, + err = __vma_adjust(vmi, prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL, prev); res = prev; } else if (merge_prev) { /* cases 2, 5, 7 */ - err = __vma_adjust(prev, prev->vm_start, + err = __vma_adjust(vmi, prev, prev->vm_start, end, prev->vm_pgoff, NULL, prev); res = prev; } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ - err = __vma_adjust(prev, prev->vm_start, + err = __vma_adjust(vmi, prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); else /* cases 3, 8 */ - err = __vma_adjust(mid, addr, next->vm_end, + err = __vma_adjust(vmi, mid, addr, next->vm_end, next->vm_pgoff - pglen, NULL, next); res = next; } From 85ab779e3426dc47cc27418821e6577680b509b9 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 20 Jan 2023 11:26:34 -0500 Subject: [PATCH 372/505] madvise: use split_vma() instead of __split_vma() The split_vma() wrapper is specifically for this use case, so use it. [Liam.Howlett@oracle.com: fix VMA_ITERATOR start position] Link: https://lkml.kernel.org/r/20230125135809.85262-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230120162650.984577-34-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/madvise.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 02b317726c9a..ca672e37b38c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -142,7 +142,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - VMA_ITERATOR(vmi, mm, 0); + VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; @@ -161,17 +161,13 @@ static int madvise_update_vma(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) - return -ENOMEM; - error = __split_vma(&vmi, vma, start, 1); + error = split_vma(&vmi, vma, start, 1); if (error) return error; } if (end != vma->vm_end) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) - return -ENOMEM; - error = __split_vma(&vmi, vma, end, 0); + error = split_vma(&vmi, vma, end, 0); if (error) return error; } From c465be97a4bc0022688d99f77a75b9be91843b31 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:35 -0500 Subject: [PATCH 373/505] mm: remove unnecessary write to vma iterator in __vma_adjust() If the vma start address is going to change due to an insert, then it is safe to not write the vma to the tree. The write of the insert vma will alter the tree as necessary. Link: https://lkml.kernel.org/r/20230120162650.984577-35-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 330de1ab6a8d..bd3ccfb4d9e0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -719,10 +719,12 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, } if (start != vma->vm_start) { - if ((vma->vm_start < start) && - (!insert || (insert->vm_end != start))) { - vma_iter_clear(vmi, vma->vm_start, start); - VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + if (vma->vm_start < start) { + if (!insert || (insert->vm_end != start)) { + vma_iter_clear(vmi, vma->vm_start, start); + vma_iter_set(vmi, start); + VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + } } else { vma_changed = true; } From 0fd5a9e2b09ff589370f2c536df77654ed2d341f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:36 -0500 Subject: [PATCH 374/505] mm: pass vma iterator through to __vma_adjust() Pass the iterator through to be used in __vma_adjust(). The state of the iterator needs to be correct for the operation that will occur so make the adjustments. Link: https://lkml.kernel.org/r/20230120162650.984577-36-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index bd3ccfb4d9e0..1bdf66b3b96e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -525,6 +525,10 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_interval_tree_remove(vma, root); } + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); + vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; @@ -2164,13 +2168,13 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. + * VMA Iterator will point to the end VMA. */ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; int err; - unsigned long end = vma->vm_end; validate_mm_mt(vma->vm_mm); @@ -2206,14 +2210,17 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, new->vm_ops->open(new); if (new_below) - err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + - ((addr - new->vm_start) >> PAGE_SHIFT), new); + err = __vma_adjust(vmi, vma, addr, vma->vm_end, + vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), + new, NULL); else - err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + err = __vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, + new, NULL); /* Success. */ if (!err) { - vma_iter_set(vmi, end); + if (new_below) + vma_next(vmi); return 0; } @@ -2308,8 +2315,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (error) goto start_split_failed; - vma_iter_set(vmi, start); - vma = vma_find(vmi, end); + vma = vma_iter_load(vmi); } prev = vma_prev(vmi); @@ -2329,7 +2335,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (error) goto end_split_failed; - vma_iter_set(vmi, end); split = vma_prev(vmi); error = munmap_sidetree(split, &mas_detach); if (error) @@ -2573,6 +2578,7 @@ cannot_expand: goto unacct_error; } + vma_iter_set(&vmi, addr); vma->vm_start = addr; vma->vm_end = end; vma->vm_flags = vm_flags; From b373037fa9bb374f26bbabc0779fe990d02d33b7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:37 -0500 Subject: [PATCH 375/505] mm: add vma iterator to vma_adjust() arguments Change the vma_adjust() function definition to accept the vma iterator and pass it through to __vma_adjust(). Update fs/exec to use the new vma_adjust() function parameters. Update mm/mremap to use the new vma_adjust() function parameters. Revert the __split_vma() calls back from __vma_adjust() to vma_adjust() and pass through the vma iterator. Link: https://lkml.kernel.org/r/20230120162650.984577-37-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 11 ++++------- include/linux/mm.h | 9 ++++----- mm/mmap.c | 10 +++++----- mm/mremap.c | 4 ++-- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b98647eeae9f..76ee62e1d3f1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* @@ -731,12 +731,9 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) } tlb_finish_mmu(&tlb); - /* - * Shrink the vma to just the new range. Always succeeds. - */ - vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); - - return 0; + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff, NULL); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c15f401f295..2e95287a9f74 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2834,12 +2834,11 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, struct vm_area_struct *expand); -static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +static inline int vma_adjust(struct vma_iterator *vmi, + struct vm_area_struct *vma, unsigned long start, unsigned long end, + pgoff_t pgoff, struct vm_area_struct *insert) { - VMA_ITERATOR(vmi, vma->vm_mm, start); - - return __vma_adjust(&vmi, vma, start, end, pgoff, insert, NULL); + return __vma_adjust(vmi, vma, start, end, pgoff, insert, NULL); } extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index 1bdf66b3b96e..f61e45caa32c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2210,12 +2210,12 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, new->vm_ops->open(new); if (new_below) - err = __vma_adjust(vmi, vma, addr, vma->vm_end, - vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), - new, NULL); + err = vma_adjust(vmi, vma, addr, vma->vm_end, + vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), + new); else - err = __vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, - new, NULL); + err = vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, + new); /* Success. */ if (!err) { diff --git a/mm/mremap.c b/mm/mremap.c index 71ba8eddd836..2176f0cc7f9a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1047,8 +1047,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - } else if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + } else if (vma_adjust(&vmi, vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; } if (!vma) { From cc8d1b097de78bd25b7b1ed32018b21cecbf3f6c Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 20 Jan 2023 11:26:38 -0500 Subject: [PATCH 376/505] mmap: clean up mmap_region() unrolling Move logic of unrolling to the error path as apposed to duplicating it within the function body. This reduces the potential of missing an update to one path when making changes. Link: https://lkml.kernel.org/r/20230120162650.984577-38-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Li Zetao Signed-off-by: Andrew Morton --- mm/mmap.c | 47 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f61e45caa32c..95ea613a4378 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2601,12 +2601,11 @@ cannot_expand: * Expansion is handled above, merging is handled below. * Drivers should not alter the address of the VMA. */ - if (WARN_ON((addr != vma->vm_start))) { - error = -EINVAL; + error = -EINVAL; + if (WARN_ON((addr != vma->vm_start))) goto close_and_free_vma; - } - vma_iter_set(&vmi, addr); + vma_iter_set(&vmi, addr); /* * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. @@ -2653,25 +2652,13 @@ cannot_expand: } /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; - } + error = -EINVAL; + if (!arch_validate_flags(vma->vm_flags)) + goto close_and_free_vma; - if (vma_iter_prealloc(&vmi)) { - error = -ENOMEM; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; - } + error = -ENOMEM; + if (vma_iter_prealloc(&vmi)) + goto close_and_free_vma; if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); @@ -2730,14 +2717,18 @@ expanded: return addr; close_and_free_vma: - if (vma->vm_ops && vma->vm_ops->close) + if (file && vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); -unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; - /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, vma->vm_end); + if (file || vma->vm_file) { +unmap_and_free_vma: + fput(vma->vm_file); + vma->vm_file = NULL; + + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, + vma->vm_end); + } if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); free_vma: From 6b73cff239e52fd43949c40eaabb369298c11aae Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:39 -0500 Subject: [PATCH 377/505] mm: change munmap splitting order and move_vma() Splitting can be more efficient when the order is not of concern. Change do_vmi_align_munmap() to reduce walking of the tree during split operations. move_vma() must also be altered to remove the dependency of keeping the original VMA as the active part of the split. Transition to using vma iterator to look up the prev and/or next vma after munmap. [Liam.Howlett@oracle.com: fix vma iterator initialization] Link: https://lkml.kernel.org/r/20230126212011.980350-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230120162650.984577-39-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 18 ++---------------- mm/mremap.c | 28 +++++++++++++++++----------- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 95ea613a4378..29ffd58d4091 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2329,21 +2329,9 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, for_each_vma_range(*vmi, next, end) { /* Does it split the end? */ if (next->vm_end > end) { - struct vm_area_struct *split; - - error = __split_vma(vmi, next, end, 1); + error = __split_vma(vmi, next, end, 0); if (error) goto end_split_failed; - - split = vma_prev(vmi); - error = munmap_sidetree(split, &mas_detach); - if (error) - goto munmap_sidetree_failed; - - count++; - if (vma == next) - vma = split; - break; } error = munmap_sidetree(next, &mas_detach); if (error) @@ -2356,9 +2344,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, #endif } - if (!next) - next = vma_next(vmi); - + next = vma_next(vmi); if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas diff --git a/mm/mremap.c b/mm/mremap.c index 2176f0cc7f9a..6c7f49ab7d19 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -580,11 +580,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long vm_flags = vma->vm_flags; unsigned long new_pgoff; unsigned long moved_len; - unsigned long excess = 0; + unsigned long account_start = 0; + unsigned long account_end = 0; unsigned long hiwater_vm; - int split = 0; int err = 0; bool need_rmap_locks; + struct vma_iterator vmi; /* * We'd prefer to avoid failure later on in do_munmap: @@ -662,10 +663,10 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; - excess = vma->vm_end - vma->vm_start - old_len; - if (old_addr > vma->vm_start && - old_addr + old_len < vma->vm_end) - split = 1; + if (vma->vm_start < old_addr) + account_start = vma->vm_start; + if (vma->vm_end > old_addr + old_len) + account_end = vma->vm_end; } /* @@ -700,11 +701,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } - if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { + vma_iter_init(&vmi, mm, old_addr); + if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) vm_acct_memory(old_len >> PAGE_SHIFT); - excess = 0; + account_start = account_end = 0; } if (vm_flags & VM_LOCKED) { @@ -715,10 +717,14 @@ static unsigned long move_vma(struct vm_area_struct *vma, mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ - if (excess) { + if (account_start) { + vma = vma_prev(&vmi); + vma->vm_flags |= VM_ACCOUNT; + } + + if (account_end) { + vma = vma_next(&vmi); vma->vm_flags |= VM_ACCOUNT; - if (split) - find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; } return new_addr; From e3d73f848e5f2e9da46646c97fb127dfc6868767 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:40 -0500 Subject: [PATCH 378/505] mm/mmap: move anon_vma setting in __vma_adjust() Move the anon_vma setting & warn_no up the function. This is done to clear up the locking later. Link: https://lkml.kernel.org/r/20230120162650.984577-40-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 29ffd58d4091..12545ec9cdeb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -682,6 +682,14 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi)) return -ENOMEM; + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + + if (anon_vma) + VM_WARN_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; @@ -703,12 +711,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, } } - anon_vma = vma->anon_vma; - if (!anon_vma && adjust_next) - anon_vma = next->anon_vma; if (anon_vma) { - VM_WARN_ON(adjust_next && next->anon_vma && - anon_vma != next->anon_vma); anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) From 440703e082b9c79c3d4fffcca8c2dffd621e6dc5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:41 -0500 Subject: [PATCH 379/505] mm/mmap: refactor locking out of __vma_adjust() Move the locking into vma_prepare() and vma_complete() for use elsewhere Link: https://lkml.kernel.org/r/20230120162650.984577-41-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 14 +++ mm/mmap.c | 231 +++++++++++++++++++++++++++++--------------------- 2 files changed, 150 insertions(+), 95 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index ffd65248f266..90bb2078444c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -941,4 +941,18 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } + +/* + * VMA lock generalization + */ +struct vma_prepare { + struct vm_area_struct *vma; + struct vm_area_struct *adj_next; + struct file *file; + struct address_space *mapping; + struct anon_vma *anon_vma; + struct vm_area_struct *insert; + struct vm_area_struct *remove; + struct vm_area_struct *remove2; +}; #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 12545ec9cdeb..1ef284b7e6fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -573,6 +573,127 @@ nomem: return -ENOMEM; } +/* + * vma_prepare() - Helper function for handling locking VMAs prior to altering + * @vp: The initialized vma_prepare struct + */ +static inline void vma_prepare(struct vma_prepare *vp) +{ + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + + if (vp->adj_next) + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, + vp->adj_next->vm_end); + + i_mmap_lock_write(vp->mapping); + if (vp->insert && vp->insert->vm_file) { + /* + * Put into interval tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(vp->insert, + vp->insert->vm_file->f_mapping); + } + } + + if (vp->anon_vma) { + anon_vma_lock_write(vp->anon_vma); + anon_vma_interval_tree_pre_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_pre_update_vma(vp->adj_next); + } + + if (vp->file) { + flush_dcache_mmap_lock(vp->mapping); + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); + if (vp->adj_next) + vma_interval_tree_remove(vp->adj_next, + &vp->mapping->i_mmap); + } + +} + +/* + * vma_complete- Helper function for handling the unlocking after altering VMAs, + * or for inserting a VMA. 
+ * + * @vp: The vma_prepare struct + * @vmi: The vma iterator + * @mm: The mm_struct + */ +static inline void vma_complete(struct vma_prepare *vp, + struct vma_iterator *vmi, struct mm_struct *mm) +{ + if (vp->file) { + if (vp->adj_next) + vma_interval_tree_insert(vp->adj_next, + &vp->mapping->i_mmap); + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); + flush_dcache_mmap_unlock(vp->mapping); + } + + if (vp->remove && vp->file) { + __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping); + if (vp->remove2) + __remove_shared_vm_struct(vp->remove2, vp->file, + vp->mapping); + } else if (vp->insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + vma_iter_store(vmi, vp->insert); + mm->map_count++; + } + + if (vp->anon_vma) { + anon_vma_interval_tree_post_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_post_update_vma(vp->adj_next); + anon_vma_unlock_write(vp->anon_vma); + } + + if (vp->file) { + i_mmap_unlock_write(vp->mapping); + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); + } + + if (vp->remove) { +again: + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); + fput(vp->file); + } + if (vp->remove->anon_vma) + anon_vma_merge(vp->vma, vp->remove); + mm->map_count--; + mpol_put(vma_policy(vp->remove)); + if (!vp->remove2) + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); + vm_area_free(vp->remove); + + /* + * In mprotect's case 6 (see comments on vma_merge), + * we must remove next_next too. + */ + if (vp->remove2) { + vp->remove = vp->remove2; + vp->remove2 = NULL; + goto again; + } + } + if (vp->insert && vp->file) + uprobe_mmap(vp->insert); +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -588,14 +709,13 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *next_next = NULL; /* uninit var warning */ struct vm_area_struct *next = find_vma(mm, vma->vm_end); struct vm_area_struct *orig_vma = vma; - struct address_space *mapping = NULL; - struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool vma_changed = false; long adjust_next = 0; int remove_next = 0; struct vm_area_struct *exporter = NULL, *importer = NULL; + struct vma_prepare vma_prep; if (next && !insert) { if (end >= next->vm_end) { @@ -691,39 +811,22 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, anon_vma != next->anon_vma); vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - if (file) { - mapping = file->f_mapping; - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); - if (adjust_next) - uprobe_munmap(next, next->vm_start, next->vm_end); - - i_mmap_lock_write(mapping); - if (insert && insert->vm_file) { - /* - * Put into interval tree now, so instantiated pages - * are visible to arm/parisc __flush_dcache_page - * throughout; but we cannot insert into address - * space until vma start or end is updated. 
- */ - __vma_link_file(insert, insert->vm_file->f_mapping); - } + memset(&vma_prep, 0, sizeof(vma_prep)); + vma_prep.vma = vma; + vma_prep.anon_vma = anon_vma; + vma_prep.file = file; + if (adjust_next) + vma_prep.adj_next = next; + if (file) + vma_prep.mapping = file->f_mapping; + vma_prep.insert = insert; + if (remove_next) { + vma_prep.remove = next; + vma_prep.remove2 = next_next; } - if (anon_vma) { - anon_vma_lock_write(anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - if (adjust_next) - anon_vma_interval_tree_pre_update_vma(next); - } - - if (file) { - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, root); - if (adjust_next) - vma_interval_tree_remove(next, root); - } + vma_prepare(&vma_prep); if (start != vma->vm_start) { if (vma->vm_start < start) { @@ -761,69 +864,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_store(vmi, next); } - if (file) { - if (adjust_next) - vma_interval_tree_insert(next, root); - vma_interval_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } - - if (remove_next && file) { - __remove_shared_vm_struct(next, file, mapping); - if (remove_next == 2) - __remove_shared_vm_struct(next_next, file, mapping); - } else if (insert) { - /* - * split_vma has split insert from vma, and needs - * us to insert it before dropping the locks - * (it may either follow vma or precede it). - */ - vma_iter_store(vmi, insert); - mm->map_count++; - } - - if (anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - if (adjust_next) - anon_vma_interval_tree_post_update_vma(next); - anon_vma_unlock_write(anon_vma); - } - - if (file) { - i_mmap_unlock_write(mapping); - uprobe_mmap(vma); - - if (adjust_next) - uprobe_mmap(next); - } - - if (remove_next) { -again: - if (file) { - uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } - if (next->anon_vma) - anon_vma_merge(vma, next); - mm->map_count--; - mpol_put(vma_policy(next)); - if (remove_next != 2) - BUG_ON(vma->vm_end < next->vm_end); - vm_area_free(next); - - /* - * In mprotect's case 6 (see comments on vma_merge), - * we must remove next_next too. - */ - if (remove_next == 2) { - remove_next = 1; - next = next_next; - goto again; - } - } - if (insert && file) - uprobe_mmap(insert); - + vma_complete(&vma_prep, vmi, mm); vma_iter_free(vmi); validate_mm(mm); From 9303d3e1c3f8d8863d561a070489cf44ce5e4103 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:42 -0500 Subject: [PATCH 380/505] mm/mmap: use vma_prepare() and vma_complete() in vma_expand() Use the new locking functions for vma_expand(). This reduces code duplication. At the same time change VM_BUG_ON() to VM_WARN_ON() Link: https://lkml.kernel.org/r/20230120162650.984577-42-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 188 +++++++++++++++++++++--------------------------------- 1 file changed, 72 insertions(+), 116 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1ef284b7e6fb..b5bf70db3777 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -457,122 +457,6 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } -/* - * vma_expand - Expand an existing VMA - * - * @mas: The maple state - * @vma: The vma to expand - * @start: The start of the vma - * @end: The exclusive end of the vma - * @pgoff: The page offset of vma - * @next: The current of next vma. - * - * Expand @vma to @start and @end. Can expand off the start and end. 
Will - * expand over @next if it's different from @vma and @end == @next->vm_end. - * Checking if the @vma can expand and merge with @next needs to be handled by - * the caller. - * - * Returns: 0 on success - */ -inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) -{ - struct mm_struct *mm = vma->vm_mm; - struct address_space *mapping = NULL; - struct rb_root_cached *root = NULL; - struct anon_vma *anon_vma = vma->anon_vma; - struct file *file = vma->vm_file; - bool remove_next = false; - - if (next && (vma != next) && (end == next->vm_end)) { - remove_next = true; - if (next->anon_vma && !vma->anon_vma) { - int error; - - anon_vma = next->anon_vma; - vma->anon_vma = anon_vma; - error = anon_vma_clone(vma, next); - if (error) - return error; - } - } - - /* Not merging but overwriting any part of next is not handled. */ - VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); - /* Only handles expanding */ - VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); - - if (vma_iter_prealloc(vmi)) - goto nomem; - - vma_adjust_trans_huge(vma, start, end, 0); - - if (file) { - mapping = file->f_mapping; - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); - i_mmap_lock_write(mapping); - } - - if (anon_vma) { - anon_vma_lock_write(anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - } - - if (file) { - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, root); - } - - /* VMA iterator points to previous, so set to start if necessary */ - if (vma_iter_addr(vmi) != start) - vma_iter_set(vmi, start); - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - vma_iter_store(vmi, vma); - - if (file) { - vma_interval_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } - - /* Expanding over the next vma */ - if (remove_next && file) { - __remove_shared_vm_struct(next, file, mapping); - } - - if (anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(anon_vma); - } - - if (file) { - i_mmap_unlock_write(mapping); - uprobe_mmap(vma); - } - - if (remove_next) { - if (file) { - uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } - if (next->anon_vma) - anon_vma_merge(vma, next); - mm->map_count--; - mpol_put(vma_policy(next)); - vm_area_free(next); - } - - validate_mm(mm); - return 0; - -nomem: - return -ENOMEM; -} - /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct @@ -694,6 +578,78 @@ again: uprobe_mmap(vp->insert); } +/* + * vma_expand - Expand an existing VMA + * + * @vmi: The vma iterator + * @vma: The vma to expand + * @start: The start of the vma + * @end: The exclusive end of the vma + * @pgoff: The page offset of vma + * @next: The current of next vma. + * + * Expand @vma to @start and @end. Can expand off the start and end. Will + * expand over @next if it's different from @vma and @end == @next->vm_end. + * Checking if the @vma can expand and merge with @next needs to be handled by + * the caller. 
+ * + * Returns: 0 on success + */ +inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) +{ + struct vma_prepare vp; + + memset(&vp, 0, sizeof(vp)); + vp.vma = vma; + vp.anon_vma = vma->anon_vma; + if (next && (vma != next) && (end == next->vm_end)) { + vp.remove = next; + if (next->anon_vma && !vma->anon_vma) { + int error; + + vp.anon_vma = next->anon_vma; + vma->anon_vma = next->anon_vma; + error = anon_vma_clone(vma, next); + if (error) + return error; + } + } + + /* Not merging but overwriting any part of next is not handled. */ + VM_WARN_ON(next && !vp.remove && + next != vma && end > next->vm_start); + /* Only handles expanding */ + VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); + + if (vma_iter_prealloc(vmi)) + goto nomem; + + vma_adjust_trans_huge(vma, start, end, 0); + + vp.file = vma->vm_file; + if (vp.file) + vp.mapping = vp.file->f_mapping; + + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); + + vma_prepare(&vp); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + /* Note: mas must be pointing to the expanding VMA */ + vma_iter_store(vmi, vma); + + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + return 0; + +nomem: + return -ENOMEM; +} /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. From 68cefec539206ea691409f0d1e944ed343f23f04 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:43 -0500 Subject: [PATCH 381/505] mm/mmap: introduce init_vma_prep() and init_multi_vma_prep() Add init_vma_prep() and init_multi_vma_prep() to set up the struct vma_prepare. This is to abstract the locking when adjusting the VMAs. Also change __vma_adjust() variable remove_next int in favour of a pointer to the VMA to remove. Rename next_next to remove2 since this better reflects its use. Link: https://lkml.kernel.org/r/20230120162650.984577-43-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 108 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 61 insertions(+), 47 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index b5bf70db3777..e259b33fcb52 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -457,6 +457,45 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } +/* + * init_multi_vma_prep() - Initializer for struct vma_prepare + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + * @next: The next vma if it is to be adjusted + * @remove: The first vma to be removed + * @remove2: The second vma to be removed + */ +static inline void init_multi_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma, struct vm_area_struct *next, + struct vm_area_struct *remove, struct vm_area_struct *remove2) +{ + memset(vp, 0, sizeof(struct vma_prepare)); + vp->vma = vma; + vp->anon_vma = vma->anon_vma; + vp->remove = remove; + vp->remove2 = remove2; + vp->adj_next = next; + if (!vp->anon_vma && next) + vp->anon_vma = next->anon_vma; + + vp->file = vma->vm_file; + if (vp->file) + vp->mapping = vma->vm_file->f_mapping; + +} + +/* + * init_vma_prep() - Initializer wrapper for vma_prepare struct + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + */ +static inline void init_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma) +{ + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); +} + + /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct @@ -566,7 +605,7 @@ again: /* * In mprotect's case 6 (see comments on vma_merge), - * we must remove next_next too. + * we must remove the one after next as well. */ if (vp->remove2) { vp->remove = vp->remove2; @@ -599,17 +638,14 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next) { + bool remove_next = false; struct vma_prepare vp; - memset(&vp, 0, sizeof(vp)); - vp.vma = vma; - vp.anon_vma = vma->anon_vma; if (next && (vma != next) && (end == next->vm_end)) { - vp.remove = next; + remove_next = true; if (next->anon_vma && !vma->anon_vma) { int error; - vp.anon_vma = next->anon_vma; vma->anon_vma = next->anon_vma; error = anon_vma_clone(vma, next); if (error) @@ -617,6 +653,7 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, } } + init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); /* Not merging but overwriting any part of next is not handled. 
*/ VM_WARN_ON(next && !vp.remove && next != vma && end > next->vm_start); @@ -627,11 +664,6 @@ inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, goto nomem; vma_adjust_trans_huge(vma, start, end, 0); - - vp.file = vma->vm_file; - if (vp.file) - vp.mapping = vp.file->f_mapping; - /* VMA iterator points to previous, so set to start if necessary */ if (vma_iter_addr(vmi) != start) vma_iter_set(vmi, start); @@ -662,14 +694,13 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *insert, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next_next = NULL; /* uninit var warning */ + struct vm_area_struct *remove2 = NULL; + struct vm_area_struct *remove = NULL; struct vm_area_struct *next = find_vma(mm, vma->vm_end); struct vm_area_struct *orig_vma = vma; - struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool vma_changed = false; long adjust_next = 0; - int remove_next = 0; struct vm_area_struct *exporter = NULL, *importer = NULL; struct vma_prepare vma_prep; @@ -688,25 +719,24 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, */ VM_WARN_ON(end != next->vm_end); /* - * remove_next == 3 means we're - * removing "vma" and that to do so we + * we're removing "vma" and that to do so we * swapped "vma" and "next". */ - remove_next = 3; VM_WARN_ON(file != next->vm_file); swap(vma, next); + remove = next; } else { VM_WARN_ON(expand != vma); /* - * case 1, 6, 7, remove_next == 2 is case 6, - * remove_next == 1 is case 1 or 7. + * case 1, 6, 7, remove next. + * case 6 also removes the one beyond next */ - remove_next = 1 + (end > next->vm_end); - if (remove_next == 2) - next_next = find_vma(mm, next->vm_end); + remove = next; + if (end > next->vm_end) + remove2 = find_vma(mm, next->vm_end); - VM_WARN_ON(remove_next == 2 && - end != next_next->vm_end); + VM_WARN_ON(remove2 != NULL && + end != remove2->vm_end); } exporter = next; @@ -716,8 +746,8 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * If next doesn't have anon_vma, import from vma after * next, if the vma overlaps with it. */ - if (remove_next == 2 && !next->anon_vma) - exporter = next_next; + if (remove2 != NULL && !next->anon_vma) + exporter = remove2; } else if (end > next->vm_start) { /* @@ -758,30 +788,14 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi)) return -ENOMEM; - anon_vma = vma->anon_vma; - if (!anon_vma && adjust_next) - anon_vma = next->anon_vma; - - if (anon_vma) - VM_WARN_ON(adjust_next && next->anon_vma && - anon_vma != next->anon_vma); - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - memset(&vma_prep, 0, sizeof(vma_prep)); - vma_prep.vma = vma; - vma_prep.anon_vma = anon_vma; - vma_prep.file = file; - if (adjust_next) - vma_prep.adj_next = next; - if (file) - vma_prep.mapping = file->f_mapping; - vma_prep.insert = insert; - if (remove_next) { - vma_prep.remove = next; - vma_prep.remove2 = next_next; - } + init_multi_vma_prep(&vma_prep, vma, adjust_next ? next : NULL, remove, + remove2); + VM_WARN_ON(vma_prep.anon_vma && adjust_next && next->anon_vma && + vma_prep.anon_vma != next->anon_vma); + vma_prep.insert = insert; vma_prepare(&vma_prep); if (start != vma->vm_start) { From b2b3b886738fec5e89ca9ebc720eba1a8f615753 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 20 Jan 2023 11:26:44 -0500 Subject: [PATCH 382/505] mm: don't use __vma_adjust() in __split_vma() Use the abstracted locking and maple tree operations. Since __split_vma() is the only user of the __vma_adjust() function to use the insert argument, drop that argument. Remove the NULL passed through from fs/exec's shift_arg_pages() and mremap() at the same time. Link: https://lkml.kernel.org/r/20230120162650.984577-44-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 4 +- include/linux/mm.h | 7 ++- mm/mmap.c | 118 +++++++++++++++++++++------------------------ mm/mremap.c | 2 +- 4 files changed, 61 insertions(+), 70 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 76ee62e1d3f1..d52fca2dd30b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) + if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff)) return -ENOMEM; /* @@ -733,7 +733,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) vma_prev(&vmi); /* Shrink the vma to just the new range */ - return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff, NULL); + return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e95287a9f74..3845de5d2581 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2832,13 +2832,12 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand); + unsigned long end, pgoff_t pgoff, struct vm_area_struct *expand); static inline int vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, - pgoff_t pgoff, struct vm_area_struct *insert) + pgoff_t pgoff) { - return __vma_adjust(vmi, vma, start, end, pgoff, insert, NULL); + return __vma_adjust(vmi, vma, start, end, pgoff, NULL); } extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index e259b33fcb52..3d08d7d243f0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -691,7 +691,7 @@ nomem: */ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *insert, struct vm_area_struct *expand) + struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *remove2 = NULL; @@ -704,7 +704,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *exporter = NULL, *importer = NULL; struct vma_prepare vma_prep; - if (next && !insert) { + if (next) { if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -795,39 +795,25 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, VM_WARN_ON(vma_prep.anon_vma && adjust_next && next->anon_vma && vma_prep.anon_vma != next->anon_vma); - vma_prep.insert = insert; vma_prepare(&vma_prep); - if (start != vma->vm_start) { - if (vma->vm_start < start) { - if (!insert || (insert->vm_end != start)) { - 
vma_iter_clear(vmi, vma->vm_start, start); - vma_iter_set(vmi, start); - VM_WARN_ON(insert && insert->vm_start > vma->vm_start); - } - } else { - vma_changed = true; - } - vma->vm_start = start; - } - if (end != vma->vm_end) { - if (vma->vm_end > end) { - if (!insert || (insert->vm_start != end)) { - vma_iter_clear(vmi, end, vma->vm_end); - vma_iter_set(vmi, vma->vm_end); - VM_WARN_ON(insert && - insert->vm_end < vma->vm_end); - } - } else { - vma_changed = true; - } - vma->vm_end = end; - } + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, start); + else if (start != vma->vm_start) + vma_changed = true; + + if (vma->vm_end > end) + vma_iter_clear(vmi, end, vma->vm_end); + else if (end != vma->vm_end) + vma_changed = true; + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; if (vma_changed) vma_iter_store(vmi, vma); - vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; @@ -846,9 +832,9 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * per-vma resources, so we don't attempt to merge those. */ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1030,20 +1016,19 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ err = __vma_adjust(vmi, prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL, - prev); + next->vm_end, prev->vm_pgoff, prev); res = prev; } else if (merge_prev) { /* cases 2, 5, 7 */ err = __vma_adjust(vmi, prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); + end, prev->vm_pgoff, prev); res = prev; } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(vmi, prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); + addr, prev->vm_pgoff, next); else /* cases 3, 8 */ err = __vma_adjust(vmi, mid, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); + next->vm_pgoff - pglen, next); res = next; } @@ -2187,11 +2172,15 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { + struct vma_prepare vp; struct vm_area_struct *new; int err; validate_mm_mt(vma->vm_mm); + WARN_ON(vma->vm_start >= addr); + WARN_ON(vma->vm_end <= addr); + if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); if (err) @@ -2202,16 +2191,20 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!new) return -ENOMEM; - if (new_below) + err = -ENOMEM; + if (vma_iter_prealloc(vmi)) + goto out_free_vma; + + if (new_below) { new->vm_end = addr; - else { + } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = vma_dup_policy(vma, new); if (err) - goto out_free_vma; + goto out_free_vmi; err = anon_vma_clone(new, vma); if (err) @@ -2223,33 +2216,32 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - if (new_below) - err = vma_adjust(vmi, vma, addr, vma->vm_end, - vma->vm_pgoff + ((addr - new->vm_start) >> 
PAGE_SHIFT), - new); - else - err = vma_adjust(vmi, vma, vma->vm_start, addr, vma->vm_pgoff, - new); + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + init_vma_prep(&vp, vma); + vp.insert = new; + vma_prepare(&vp); - /* Success. */ - if (!err) { - if (new_below) - vma_next(vmi); - return 0; + if (new_below) { + vma->vm_start = addr; + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; + } else { + vma->vm_end = addr; } - /* Avoid vm accounting in close() operation */ - new->vm_start = new->vm_end; - new->vm_pgoff = 0; - /* Clean everything up if vma_adjust failed. */ - if (new->vm_ops && new->vm_ops->close) - new->vm_ops->close(new); - if (new->vm_file) - fput(new->vm_file); - unlink_anon_vmas(new); - out_free_mpol: + /* vma_complete stores the new vma */ + vma_complete(&vp, vmi, vma->vm_mm); + + /* Success. */ + if (new_below) + vma_next(vmi); + validate_mm_mt(vma->vm_mm); + return 0; + +out_free_mpol: mpol_put(vma_policy(new)); - out_free_vma: +out_free_vmi: + vma_iter_free(vmi); +out_free_vma: vm_area_free(new); validate_mm_mt(vma->vm_mm); return err; diff --git a/mm/mremap.c b/mm/mremap.c index 6c7f49ab7d19..b98185f48ba5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1054,7 +1054,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } else if (vma_adjust(&vmi, vma, vma->vm_start, - addr + new_len, vma->vm_pgoff, NULL)) { + addr + new_len, vma->vm_pgoff)) { vma = NULL; } if (!vma) { From 7c9813e886bb52495ff5b97d4b0f1320d36d869b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:45 -0500 Subject: [PATCH 383/505] mm/mremap: convert vma_adjust() to vma_expand() Stop using vma_adjust() in preparation for removing the function. Export vma_expand() to use instead. Link: https://lkml.kernel.org/r/20230120162650.984577-45-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ mm/mmap.c | 6 +++--- mm/mremap.c | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3845de5d2581..245fb30858c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2839,6 +2839,9 @@ static inline int vma_adjust(struct vma_iterator *vmi, { return __vma_adjust(vmi, vma, start, end, pgoff, NULL); } +extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next); extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, diff --git a/mm/mmap.c b/mm/mmap.c index 3d08d7d243f0..9599db011b18 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -634,9 +634,9 @@ again: * * Returns: 0 on success */ -inline int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) +int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) { bool remove_next = false; struct vma_prepare vp; diff --git a/mm/mremap.c b/mm/mremap.c index b98185f48ba5..5c9a57909862 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1053,8 +1053,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - } else if (vma_adjust(&vmi, vma, vma->vm_start, - addr + new_len, vma->vm_pgoff)) { + } else if (vma_expand(&vmi, vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; } if (!vma) { From cf51e86dfbe39b7cae3a9de650d035af22dd5fb4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:46 -0500 Subject: [PATCH 384/505] mm/mmap: don't use __vma_adjust() in shift_arg_pages() Introduce shrink_vma() which uses the vma_prepare() and vma_complete() functions to reduce the vma coverage. Convert shift_arg_pages() to use expand_vma() and the new shrink_vma() function. Remove support from __vma_adjust() to reduce a vma size since shift_arg_pages() is the only user that shrinks a VMA in this way. Link: https://lkml.kernel.org/r/20230120162650.984577-46-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- fs/exec.c | 4 ++-- include/linux/mm.h | 10 ++------- mm/mmap.c | 52 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index d52fca2dd30b..c0df813d2b45 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(&vmi, vma, new_start, old_end, vma->vm_pgoff)) + if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* @@ -733,7 +733,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) vma_prev(&vmi); /* Shrink the vma to just the new range */ - return vma_adjust(&vmi, vma, new_start, new_end, vma->vm_pgoff); + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 245fb30858c9..dcc34533d2f6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2831,17 +2831,11 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *expand); -static inline int vma_adjust(struct vma_iterator *vmi, - struct vm_area_struct *vma, unsigned long start, unsigned long end, - pgoff_t pgoff) -{ - return __vma_adjust(vmi, vma, start, end, pgoff, NULL); -} extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next); +extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, diff --git a/mm/mmap.c b/mm/mmap.c index 9599db011b18..07b52acfd565 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -682,6 +682,44 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, nomem: return -ENOMEM; } + +/* + * vma_shrink() - Reduce an existing VMAs memory area + * @vmi: The vma iterator + * @vma: The VMA to modify + * @start: The new start + * @end: The new end + * + * Returns: 0 on success, -ENOMEM otherwise + */ +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff) +{ + struct vma_prepare vp; + + WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); + + if (vma_iter_prealloc(vmi)) + return -ENOMEM; + + init_vma_prep(&vp, vma); + vma_adjust_trans_huge(vma, start, end, 0); + vma_prepare(&vp); + + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, start); + + if (vma->vm_end > end) + vma_iter_clear(vmi, end, vma->vm_end); + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + return 0; +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. 
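For illustration only (this sketch is not part of the patch, and trim_vma_tail() is a hypothetical helper name): a caller in the spirit of the converted shift_arg_pages() could use the new vma_shrink() roughly as follows, assuming mmap_lock is already held for writing and any page-table teardown for the dropped range has been done beforehand, as shift_arg_pages() does.

	/*
	 * Illustrative sketch only -- not part of this patch.  It assumes the
	 * vma_shrink() and vma iterator APIs shown above; trim_vma_tail() is a
	 * hypothetical helper.  Caller holds mmap_lock for writing and has
	 * already torn down page tables for [new_end, vma->vm_end).
	 */
	static int trim_vma_tail(struct vm_area_struct *vma, unsigned long new_end)
	{
		VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);

		/* Only shrinking from the tail is handled in this sketch. */
		if (WARN_ON(new_end <= vma->vm_start || new_end >= vma->vm_end))
			return -EINVAL;

		/*
		 * vma_shrink() preallocates the maple tree node, takes the
		 * rmap/i_mmap locks via vma_prepare(), clears the dropped
		 * range from the tree, updates vm_end, and unlocks through
		 * vma_complete().
		 */
		return vma_shrink(&vmi, vma, vma->vm_start, new_end, vma->vm_pgoff);
	}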
@@ -797,14 +835,7 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vma_prep); - if (vma->vm_start < start) - vma_iter_clear(vmi, vma->vm_start, start); - else if (start != vma->vm_start) - vma_changed = true; - - if (vma->vm_end > end) - vma_iter_clear(vmi, end, vma->vm_end); - else if (end != vma->vm_end) + if (start < vma->vm_start || end > vma->vm_end) vma_changed = true; vma->vm_start = start; @@ -817,7 +848,10 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_iter_store(vmi, next); + if (adjust_next < 0) { + WARN_ON_ONCE(vma_changed); + vma_iter_store(vmi, next); + } } vma_complete(&vma_prep, vmi, mm); From 04241ffe3f0458d54c61cf6c9d58d703efda4dd5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:47 -0500 Subject: [PATCH 385/505] mm/mmap: introduce dup_vma_anon() helper Create a helper for duplicating the anon vma when adjusting the vma. This simplifies the logic of __vma_adjust(). Link: https://lkml.kernel.org/r/20230120162650.984577-47-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 74 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 07b52acfd565..265c4605dad2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -617,6 +617,29 @@ again: uprobe_mmap(vp->insert); } +/* + * dup_anon_vma() - Helper function to duplicate anon_vma + * @dst: The destination VMA + * @src: The source VMA + * + * Returns: 0 on success. + */ +static inline int dup_anon_vma(struct vm_area_struct *dst, + struct vm_area_struct *src) +{ + /* + * Easily overlooked: when mprotect shifts the boundary, make sure the + * expanding vma has anon_vma set if the shrinking vma had, to cover any + * anon pages imported. + */ + if (src->anon_vma && !dst->anon_vma) { + dst->anon_vma = src->anon_vma; + return anon_vma_clone(dst, src); + } + + return 0; +} + /* * vma_expand - Expand an existing VMA * @@ -642,15 +665,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vma_prepare vp; if (next && (vma != next) && (end == next->vm_end)) { - remove_next = true; - if (next->anon_vma && !vma->anon_vma) { - int error; + int ret; - vma->anon_vma = next->anon_vma; - error = anon_vma_clone(vma, next); - if (error) - return error; - } + remove_next = true; + ret = dup_anon_vma(vma, next); + if (ret) + return ret; } init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); @@ -739,10 +759,11 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, struct file *file = vma->vm_file; bool vma_changed = false; long adjust_next = 0; - struct vm_area_struct *exporter = NULL, *importer = NULL; struct vma_prepare vma_prep; if (next) { + int error = 0; + if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -777,15 +798,14 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, end != remove2->vm_end); } - exporter = next; - importer = vma; - /* * If next doesn't have anon_vma, import from vma after * next, if the vma overlaps with it. 
*/ - if (remove2 != NULL && !next->anon_vma) - exporter = remove2; + if (remove != NULL && !next->anon_vma) + error = dup_anon_vma(vma, remove2); + else + error = dup_anon_vma(vma, remove); } else if (end > next->vm_start) { /* @@ -793,9 +813,8 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * mprotect case 5 shifting the boundary up. */ adjust_next = (end - next->vm_start); - exporter = next; - importer = vma; - VM_WARN_ON(expand != importer); + VM_WARN_ON(expand != vma); + error = dup_anon_vma(vma, next); } else if (end < vma->vm_end) { /* * vma shrinks, and !insert tells it's not @@ -803,24 +822,11 @@ int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, * mprotect case 4 shifting the boundary down. */ adjust_next = -(vma->vm_end - end); - exporter = vma; - importer = next; - VM_WARN_ON(expand != importer); - } - - /* - * Easily overlooked: when mprotect shifts the boundary, - * make sure the expanding vma has anon_vma set if the - * shrinking vma had, to cover any anon pages imported. - */ - if (exporter && exporter->anon_vma && !importer->anon_vma) { - int error; - - importer->anon_vma = exporter->anon_vma; - error = anon_vma_clone(importer, exporter); - if (error) - return error; + VM_WARN_ON(expand != next); + error = dup_anon_vma(next, vma); } + if (error) + return error; } if (vma_iter_prealloc(vmi)) From 287051b185048c4aabe666439e64232ac59135a6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:48 -0500 Subject: [PATCH 386/505] mm/mmap: convert do_brk_flags() to use vma_prepare() and vma_complete() Use the abstracted vma locking for do_brk_flags() Link: https://lkml.kernel.org/r/20230120162650.984577-48-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 265c4605dad2..604ba8293a95 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2936,6 +2936,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vma_prepare vp; validate_mm_mt(mm); /* @@ -2963,18 +2964,13 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - if (vma->anon_vma) { - anon_vma_lock_write(vma->anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - } + init_vma_prep(&vp, vma); + vma_prepare(&vp); vma->vm_end = addr + len; vma->vm_flags |= VM_SOFTDIRTY; vma_iter_store(vmi, vma); - if (vma->anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(vma->anon_vma); - } + vma_complete(&vp, vmi, mm); khugepaged_enter_vma(vma, flags); goto out; } From 0503ea8f5ba73eb3ab13a81c1eefbaf51405385a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:49 -0500 Subject: [PATCH 387/505] mm/mmap: remove __vma_adjust() Inline the work of __vma_adjust() into vma_merge(). This reduces code size and has the added benefits of the comments for the cases being located with the code. Change the comments referencing vma_adjust() accordingly. [Liam.Howlett@oracle.com: fix vma_merge() offset when expanding the next vma] Link: https://lkml.kernel.org/r/20230130195713.2881766-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230120162650.984577-49-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 2 +- mm/filemap.c | 2 +- mm/mmap.c | 250 ++++++++++++++++------------------------ mm/rmap.c | 15 +-- 4 files changed, 107 insertions(+), 162 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1a3904e0179c..59887c69d54c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1351,7 +1351,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. + * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. diff --git a/mm/filemap.c b/mm/filemap.c index c915ded191f0..992554c18f1f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -97,7 +97,7 @@ * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem - * ->anon_vma.lock (vma_adjust) + * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) diff --git a/mm/mmap.c b/mm/mmap.c index 604ba8293a95..8ce4cee42dce 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -740,133 +740,6 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } -/* - * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that - * is already present in an i_mmap tree without adjusting the tree. - * The following helper function should be used when such adjustments - * are necessary. The "insert" vma (if any) is to be inserted - * before we drop the necessary locks. - */ -int __vma_adjust(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *expand) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *remove2 = NULL; - struct vm_area_struct *remove = NULL; - struct vm_area_struct *next = find_vma(mm, vma->vm_end); - struct vm_area_struct *orig_vma = vma; - struct file *file = vma->vm_file; - bool vma_changed = false; - long adjust_next = 0; - struct vma_prepare vma_prep; - - if (next) { - int error = 0; - - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and - * perhaps the one after too (mprotect case 6). - * The only other cases that gets here are - * case 1, case 7 and case 8. - */ - if (next == expand) { - /* - * The only case where we don't expand "vma" - * and we expand "next" instead is case 8. - */ - VM_WARN_ON(end != next->vm_end); - /* - * we're removing "vma" and that to do so we - * swapped "vma" and "next". - */ - VM_WARN_ON(file != next->vm_file); - swap(vma, next); - remove = next; - } else { - VM_WARN_ON(expand != vma); - /* - * case 1, 6, 7, remove next. - * case 6 also removes the one beyond next - */ - remove = next; - if (end > next->vm_end) - remove2 = find_vma(mm, next->vm_end); - - VM_WARN_ON(remove2 != NULL && - end != remove2->vm_end); - } - - /* - * If next doesn't have anon_vma, import from vma after - * next, if the vma overlaps with it. - */ - if (remove != NULL && !next->anon_vma) - error = dup_anon_vma(vma, remove2); - else - error = dup_anon_vma(vma, remove); - - } else if (end > next->vm_start) { - /* - * vma expands, overlapping part of the next: - * mprotect case 5 shifting the boundary up. 
- */ - adjust_next = (end - next->vm_start); - VM_WARN_ON(expand != vma); - error = dup_anon_vma(vma, next); - } else if (end < vma->vm_end) { - /* - * vma shrinks, and !insert tells it's not - * split_vma inserting another: so it must be - * mprotect case 4 shifting the boundary down. - */ - adjust_next = -(vma->vm_end - end); - VM_WARN_ON(expand != next); - error = dup_anon_vma(next, vma); - } - if (error) - return error; - } - - if (vma_iter_prealloc(vmi)) - return -ENOMEM; - - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - - init_multi_vma_prep(&vma_prep, vma, adjust_next ? next : NULL, remove, - remove2); - VM_WARN_ON(vma_prep.anon_vma && adjust_next && next->anon_vma && - vma_prep.anon_vma != next->anon_vma); - - vma_prepare(&vma_prep); - - if (start < vma->vm_start || end > vma->vm_end) - vma_changed = true; - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - - if (vma_changed) - vma_iter_store(vmi, vma); - - if (adjust_next) { - next->vm_start += adjust_next; - next->vm_pgoff += adjust_next >> PAGE_SHIFT; - if (adjust_next < 0) { - WARN_ON_ONCE(vma_changed); - vma_iter_store(vmi, next); - } - } - - vma_complete(&vma_prep, vmi, mm); - vma_iter_free(vmi); - validate_mm(mm); - - return 0; -} - /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -993,7 +866,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must * be extended in region AAAA and NNNN must be removed. This way in - * all cases where vma_merge succeeds, the moment vma_adjust drops the + * all cases where vma_merge succeeds, the moment vma_merge drops the * rmap_locks, the properties of the merged vma will be already * correct for the whole merged range. Some of those properties like * vm_page_prot/vm_flags may be accessed by rmap_walks and they must @@ -1003,6 +876,12 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * or other rmap walkers (if working on addresses beyond the "end" * parameter) may establish ptes with the wrong permissions of NNNN * instead of the right permissions of XXXX. + * + * In the code below: + * PPPP is represented by *prev + * NNNN is represented by *mid (and possibly equal to *next) + * XXXX is represented by *next or not represented at all. + * AAAA is not represented - it will be merged or the function will return NULL */ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, @@ -1013,11 +892,19 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + pgoff_t vma_pgoff; struct vm_area_struct *mid, *next, *res = NULL; + struct vm_area_struct *vma, *adjust, *remove, *remove2; int err = -1; bool merge_prev = false; bool merge_next = false; + bool vma_expanded = false; + struct vma_prepare vp; + unsigned long vma_end = end; + long adj_next = 0; + unsigned long vma_start = addr; + validate_mm(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1035,13 +922,17 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, VM_WARN_ON(mid && end > mid->vm_end); VM_WARN_ON(addr >= end); - /* Can we merge the predecessor? 
*/ - if (prev && prev->vm_end == addr && - mpol_equal(vma_policy(prev), policy) && - can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff, - vm_userfaultfd_ctx, anon_name)) { - merge_prev = true; + if (prev) { + res = prev; + vma = prev; + vma_start = prev->vm_start; + vma_pgoff = prev->vm_pgoff; + /* Can we merge the predecessor? */ + if (prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) + && can_vma_merge_after(prev, vm_flags, anon_vma, file, + pgoff, vm_userfaultfd_ctx, anon_name)) { + merge_prev = true; + } } /* Can we merge the successor? */ if (next && end == next->vm_start && @@ -1051,32 +942,85 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vm_userfaultfd_ctx, anon_name)) { merge_next = true; } + + remove = remove2 = adjust = NULL; /* Can we merge both the predecessor and the successor? */ if (merge_prev && merge_next && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma, NULL)) { /* cases 1, 6 */ - err = __vma_adjust(vmi, prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, prev); - res = prev; - } else if (merge_prev) { /* cases 2, 5, 7 */ - err = __vma_adjust(vmi, prev, prev->vm_start, - end, prev->vm_pgoff, prev); - res = prev; + is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { + remove = mid; /* case 1 */ + vma_end = next->vm_end; + err = dup_anon_vma(res, remove); + if (mid != next) { /* case 6 */ + remove2 = next; + if (!remove->anon_vma) + err = dup_anon_vma(res, remove2); + } + } else if (merge_prev) { + err = 0; /* case 2 */ + if (mid && end > mid->vm_start) { + err = dup_anon_vma(res, mid); + if (end == mid->vm_end) { /* case 7 */ + remove = mid; + } else { /* case 5 */ + adjust = mid; + adj_next = (end - mid->vm_start); + } + } } else if (merge_next) { - if (prev && addr < prev->vm_end) /* case 4 */ - err = __vma_adjust(vmi, prev, prev->vm_start, - addr, prev->vm_pgoff, next); - else /* cases 3, 8 */ - err = __vma_adjust(vmi, mid, addr, next->vm_end, - next->vm_pgoff - pglen, next); res = next; + if (prev && addr < prev->vm_end) { /* case 4 */ + vma_end = addr; + adjust = mid; + adj_next = -(vma->vm_end - addr); + err = dup_anon_vma(res, adjust); + } else { + vma = next; /* case 3 */ + vma_start = addr; + vma_end = next->vm_end; + vma_pgoff = mid->vm_pgoff; + err = 0; + if (mid != next) { /* case 8 */ + remove = mid; + err = dup_anon_vma(res, remove); + } + } } - /* - * Cannot merge with predecessor or successor or error in __vma_adjust? 
- */ + /* Cannot merge or error in anon_vma clone */ if (err) return NULL; + + if (vma_iter_prealloc(vmi)) + return NULL; + + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + vma_prepare(&vp); + if (vma_start < vma->vm_start || vma_end > vma->vm_end) + vma_expanded = true; + + vma->vm_start = vma_start; + vma->vm_end = vma_end; + vma->vm_pgoff = vma_pgoff; + + if (vma_expanded) + vma_iter_store(vmi, vma); + + if (adj_next) { + adjust->vm_start += adj_next; + adjust->vm_pgoff += adj_next >> PAGE_SHIFT; + if (adj_next < 0) { + WARN_ON(vma_expanded); + vma_iter_store(vmi, next); + } + } + + vma_complete(&vp, vmi, mm); + vma_iter_free(vmi); + validate_mm(mm); khugepaged_enter_vma(res, vm_flags); if (res) diff --git a/mm/rmap.c b/mm/rmap.c index 43760d622040..86fccc2b9fc9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -262,11 +262,12 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * - * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and - * anon_vma_fork(). The first three want an exact copy of src, while the last - * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent - * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, - * we can identify this case by checking (!dst->anon_vma && src->anon_vma). + * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), + * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, + * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to + * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before + * call, we can identify this case by checking (!dst->anon_vma && + * src->anon_vma). * * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find * and reuse existing anon_vma which has no vmas and only one child anon_vma. @@ -1253,7 +1254,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); if (likely(!folio_test_ksm(folio))) { - /* address might be in next vma when migration races vma_adjust */ + /* address might be in next vma when migration races vma_merge */ if (first) __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); @@ -2524,7 +2525,7 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, BUG_ON(!folio_test_locked(folio)); BUG_ON(!anon_vma); - /* address might be in next vma when migration races vma_adjust */ + /* address might be in next vma when migration races vma_merge */ first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); From 18b098af2890cdeab07368405409111197f190d2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:50 -0500 Subject: [PATCH 388/505] vma_merge: set vma iterator to correct position. When merging the previous value, set the vma iterator to the previous slot. Don't use the vma iterator to get the next/prev so that it is in the correct position for a write. Link: https://lkml.kernel.org/r/20230120162650.984577-50-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 8ce4cee42dce..b698a96d0511 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -932,6 +932,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, vm_userfaultfd_ctx, anon_name)) { merge_prev = true; + vma_prev(vmi); } } /* Can we merge the successor? */ if (next && end == next->vm_start && @@ -1023,9 +1024,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, validate_mm(mm); khugepaged_enter_vma(res, vm_flags); - if (res) - vma_iter_set(vmi, end); - return res; } From 06e78b614e3780f9ac32056f2861159fd19d9702 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:46 -0800 Subject: [PATCH 389/505] kernel/fork: convert vma assignment to a memcpy Patch series "introduce vm_flags modifier functions", v4. This patchset was originally published as a part of per-VMA locking [1] and was split after a suggestion that it's viable on its own and to facilitate the review process. It is now a prerequisite for the next version of the per-VMA lock patchset, which reuses vm_flags modifier functions to lock the VMA when vm_flags are being updated. VMA vm_flags modifications are usually done under exclusive mmap_lock protection because this attribute affects other decisions like VMA merging or splitting and races should be prevented. Introduce vm_flags modifier functions to enforce correct locking. This patch (of 7): Convert vma assignment in vm_area_dup() to a memcpy() to prevent compiler errors when we add a const modifier to vma->vm_flags. Link: https://lkml.kernel.org/r/20230126193752.297968-1-surenb@google.com Link: https://lkml.kernel.org/r/20230126193752.297968-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Cc: Sebastian Reichel Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 441dcec60aae..9260f975b8f4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -472,7 +472,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) * orig->shared.rb may be modified concurrently, but the clone * will be reinitialized. */ - *new = data_race(*orig); + data_race(memcpy(new, orig, sizeof(*new))); INIT_LIST_HEAD(&new->anon_vma_chain); dup_anon_vma_name(orig, new); } From bc292ab00f6c7a661a8a605c714e8a148f629ef6 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:47 -0800 Subject: [PATCH 390/505] mm: introduce vma->vm_flags wrapper functions vm_flags are among VMA attributes which affect decisions like VMA merging and splitting. Therefore all vm_flags modifications are performed after taking exclusive mmap_lock to prevent vm_flags updates racing with such operations.
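For illustration, a minimal sketch of what this conversion looks like at a typical call site: the mmap handler below is hypothetical and not part of any patch in this series; only the vm_flags_set()/vm_flags_clear()/vm_flags_mod() helpers and their locking rule come from the diffs that follow.

#include <linux/mm.h>

/* Hypothetical driver mmap handler, shown only to illustrate the wrapper API. */
static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Previously: vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; */
	vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP);

	/* Previously: vma->vm_flags &= ~VM_MAYWRITE; */
	vm_flags_clear(vma, VM_MAYWRITE);

	/*
	 * One call replaces a set followed by a clear when the ordering of
	 * the two updates does not matter.
	 */
	vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP);

	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
	return 0;
}

All of these wrappers except vm_flags_init() assert that mmap_lock is held for writing, which is the locking rule stated above. Because vm_flags itself becomes a const member of an anonymous union (with the writable __vm_flags alias reserved for the wrappers), direct writes such as vma->vm_flags |= VM_IO no longer compile, which is also why vm_area_dup() above switches from struct assignment to memcpy().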
Introduce modifier functions for vm_flags to be used whenever flags are updated. This way we can better check and control correct locking behavior during these updates. Link: https://lkml.kernel.org/r/20230126193752.297968-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Davidlohr Bueso Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 10 +++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dcc34533d2f6..e2df5d122b67 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -627,6 +627,46 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) INIT_LIST_HEAD(&vma->anon_vma_chain); } +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + ACCESS_PRIVATE(vma, __vm_flags) = flags; +} + +/* Use when VMA is part of the VMA tree and modifications need coordination */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; +} + +/* + * Use only when the order of set/clear operations is unimportant, otherwise + * use vm_flags_{set|clear} explicitly. + */ +static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + mmap_assert_write_locked(vma->vm_mm); + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5ca11c6c46e8..10a1e41f4e70 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -491,7 +491,15 @@ struct vm_area_struct { * See vmf_insert_mixed_prot() for discussion. */ pgprot_t vm_page_prot; - unsigned long vm_flags; /* Flags, see mm.h. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. 
+ */ + union { + const vm_flags_t vm_flags; + vm_flags_t __private __vm_flags; + }; /* * For areas with an address space and backing store, From e430a95a04efc557bc4ff9b3035c7c85aee5d63f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:48 -0800 Subject: [PATCH 391/505] mm: replace VM_LOCKED_CLEAR_MASK with VM_LOCKED_MASK To simplify the usage of VM_LOCKED_CLEAR_MASK in vm_flags_clear(), replace it with VM_LOCKED_MASK bitmask and convert all users. Link: https://lkml.kernel.org/r/20230126193752.297968-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Reviewed-by: Davidlohr Bueso Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- kernel/fork.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mlock.c | 6 +++--- mm/mmap.c | 6 +++--- mm/mremap.c | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e2df5d122b67..663726ca2240 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -421,8 +421,8 @@ extern unsigned int kobjsize(const void *objp); /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE -/* This mask is used to clear all the VMA flags used by mlock */ -#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR diff --git a/kernel/fork.c b/kernel/fork.c index 9260f975b8f4..5e3029ea8e1e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -659,7 +659,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + vm_flags_clear(tmp, VM_LOCKED_MASK); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0f9df0143772..ab35b1cc9927 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6969,8 +6969,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; - unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the diff --git a/mm/mlock.c b/mm/mlock.c index 0336f52e03d7..5c4fff93cd6b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -497,7 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, if (vma->vm_start != tmp) return -ENOMEM; - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + 
newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; @@ -661,7 +661,7 @@ static int apply_mlockall_flags(int flags) struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; - current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; @@ -681,7 +681,7 @@ static int apply_mlockall_flags(int flags) for_each_vma(vmi, vma) { vm_flags_t newflags; - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; /* Ignore errors */ diff --git a/mm/mmap.c b/mm/mmap.c index b698a96d0511..03d7c37c5969 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2668,7 +2668,7 @@ expanded: if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += (len >> PAGE_SHIFT); } @@ -3338,8 +3338,8 @@ static struct vm_area_struct *__install_special_mapping( vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_init(vma, (vm_flags | mm->def_flags | + VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; diff --git a/mm/mremap.c b/mm/mremap.c index 5c9a57909862..d70d8063c6e2 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -687,7 +687,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { /* We always clear VM_LOCKED[ONFAULT] on the old vma */ - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); /* * anon_vma links of the old vma is no longer needed after its page From 1c71222e5f2393b5ea1a41795c67589eea7e3490 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:49 -0800 Subject: [PATCH 392/505] mm: replace vma->vm_flags direct modifications with modifier calls Replace direct modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. [akpm@linux-foundation.org: fix drivers/misc/open-dice.c, per Hyeonggon Yoo] Link: https://lkml.kernel.org/r/20230126193752.297968-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Acked-by: Sebastian Reichel Reviewed-by: Liam R. Howlett Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. 
McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/kernel/process.c | 2 +- arch/ia64/mm/init.c | 8 ++++---- arch/loongarch/include/asm/tlb.h | 2 +- arch/powerpc/kvm/book3s_xive_native.c | 2 +- arch/powerpc/mm/book3s64/subpage_prot.c | 2 +- arch/powerpc/platforms/book3s/vas-api.c | 2 +- arch/powerpc/platforms/cell/spufs/file.c | 14 +++++++------- arch/s390/mm/gmap.c | 3 +-- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- arch/x86/kernel/cpu/sgx/driver.c | 2 +- arch/x86/kernel/cpu/sgx/virt.c | 2 +- arch/x86/mm/pat/memtype.c | 6 +++--- arch/x86/um/mem_32.c | 2 +- drivers/acpi/pfr_telemetry.c | 2 +- drivers/android/binder.c | 3 +-- drivers/char/mspec.c | 2 +- drivers/crypto/hisilicon/qm.c | 2 +- drivers/dax/device.c | 2 +- drivers/dma/idxd/cdev.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- drivers/gpu/drm/drm_gem.c | 2 +- drivers/gpu/drm/drm_gem_dma_helper.c | 3 +-- drivers/gpu/drm/drm_gem_shmem_helper.c | 2 +- drivers/gpu/drm/drm_vm.c | 8 ++++---- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 +- drivers/gpu/drm/exynos/exynos_drm_gem.c | 4 ++-- drivers/gpu/drm/gma500/framebuffer.c | 2 +- drivers/gpu/drm/i810/i810_dma.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 ++-- drivers/gpu/drm/mediatek/mtk_drm_gem.c | 2 +- drivers/gpu/drm/msm/msm_gem.c | 2 +- drivers/gpu/drm/omapdrm/omap_gem.c | 3 +-- drivers/gpu/drm/rockchip/rockchip_drm_gem.c | 3 +-- drivers/gpu/drm/tegra/gem.c | 5 ++--- drivers/gpu/drm/ttm/ttm_bo_vm.c | 3 +-- drivers/gpu/drm/virtio/virtgpu_vram.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c | 2 +- drivers/gpu/drm/xen/xen_drm_front_gem.c | 3 +-- drivers/hsi/clients/cmt_speech.c | 2 +- drivers/hwtracing/intel_th/msu.c | 2 +- drivers/hwtracing/stm/core.c | 2 +- drivers/infiniband/hw/hfi1/file_ops.c | 4 ++-- drivers/infiniband/hw/mlx5/main.c | 4 ++-- drivers/infiniband/hw/qib/qib_file_ops.c | 13 ++++++------- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 2 +- .../media/common/videobuf2/videobuf2-dma-contig.c | 2 +- drivers/media/common/videobuf2/videobuf2-vmalloc.c | 2 +- drivers/media/v4l2-core/videobuf-dma-contig.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 4 ++-- drivers/media/v4l2-core/videobuf-vmalloc.c | 2 +- drivers/misc/cxl/context.c | 2 +- drivers/misc/habanalabs/common/memory.c | 2 +- drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++-- drivers/misc/habanalabs/gaudi2/gaudi2.c | 8 ++++---- drivers/misc/habanalabs/goya/goya.c | 4 ++-- drivers/misc/ocxl/context.c | 4 ++-- drivers/misc/ocxl/sysfs.c | 2 +- drivers/misc/open-dice.c | 4 ++-- drivers/misc/sgi-gru/grufile.c | 4 ++-- drivers/misc/uacce/uacce.c | 2 +- drivers/sbus/char/oradax.c | 2 +- drivers/scsi/cxlflash/ocxl_hw.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/staging/media/atomisp/pci/hmm/hmm_bo.c | 2 +- drivers/staging/media/deprecated/meye/meye.c | 4 ++-- .../media/deprecated/stkwebcam/stk-webcam.c | 2 +- drivers/target/target_core_user.c | 2 +- drivers/uio/uio.c | 2 +- drivers/usb/core/devio.c | 3 +-- drivers/usb/mon/mon_bin.c | 3 +-- drivers/vdpa/vdpa_user/iova_domain.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 2 +- drivers/vhost/vdpa.c | 2 +- 
drivers/video/fbdev/68328fb.c | 2 +- drivers/video/fbdev/core/fb_defio.c | 4 ++-- drivers/xen/gntalloc.c | 2 +- drivers/xen/gntdev.c | 4 ++-- drivers/xen/privcmd-buf.c | 2 +- drivers/xen/privcmd.c | 4 ++-- fs/aio.c | 2 +- fs/cramfs/inode.c | 2 +- fs/erofs/data.c | 2 +- fs/exec.c | 4 ++-- fs/ext4/file.c | 2 +- fs/fuse/dax.c | 2 +- fs/hugetlbfs/inode.c | 4 ++-- fs/orangefs/file.c | 3 +-- fs/proc/task_mmu.c | 2 +- fs/proc/vmcore.c | 3 +-- fs/userfaultfd.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/mm.h | 2 +- kernel/bpf/ringbuf.c | 4 ++-- kernel/bpf/syscall.c | 4 ++-- kernel/events/core.c | 2 +- kernel/kcov.c | 2 +- kernel/relay.c | 2 +- mm/madvise.c | 2 +- mm/memory.c | 6 +++--- mm/mlock.c | 6 +++--- mm/mmap.c | 10 +++++----- mm/mprotect.c | 2 +- mm/mremap.c | 6 +++--- mm/nommu.c | 11 ++++++----- mm/secretmem.c | 2 +- mm/shmem.c | 2 +- mm/vmalloc.c | 2 +- net/ipv4/tcp.c | 4 ++-- security/selinux/selinuxfs.c | 6 +++--- sound/core/oss/pcm_oss.c | 2 +- sound/core/pcm_native.c | 9 +++++---- sound/soc/pxa/mmp-sspa.c | 2 +- sound/usb/usx2y/us122l.c | 4 ++-- sound/usb/usx2y/usX2Yhwdep.c | 2 +- sound/usb/usx2y/usx2yhwdeppcm.c | 2 +- 120 files changed, 188 insertions(+), 199 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index f811733a8fc5..61c30b9a24ea 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -316,7 +316,7 @@ static int __init gate_vma_init(void) gate_vma.vm_page_prot = PAGE_READONLY_EXEC; gate_vma.vm_start = 0xffff0000; gate_vma.vm_end = 0xffff0000 + PAGE_SIZE; - gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC); return 0; } arch_initcall(gate_vma_init); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index fc4e4217e87f..7f5353e28516 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -109,7 +109,7 @@ ia64_init_addr_space (void) vma_set_anonymous(vma); vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; - vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; + vm_flags_init(vma, VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); mmap_write_lock(current->mm); if (insert_vm_struct(current->mm, vma)) { @@ -127,8 +127,8 @@ ia64_init_addr_space (void) vma_set_anonymous(vma); vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); - vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_init(vma, VM_READ | VM_MAYREAD | VM_IO | + VM_DONTEXPAND | VM_DONTDUMP); mmap_write_lock(current->mm); if (insert_vm_struct(current->mm, vma)) { mmap_write_unlock(current->mm); @@ -272,7 +272,7 @@ static int __init gate_vma_init(void) vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC); gate_vma.vm_page_prot = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX); return 0; diff --git a/arch/loongarch/include/asm/tlb.h b/arch/loongarch/include/asm/tlb.h index dd24f5898f65..f5e4deb97402 100644 --- a/arch/loongarch/include/asm/tlb.h +++ b/arch/loongarch/include/asm/tlb.h @@ -149,7 +149,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) struct vm_area_struct vma; vma.vm_mm = tlb->mm; - vma.vm_flags = 0; + vm_flags_init(&vma, 0); if (tlb->fullmm) { flush_tlb_mm(tlb->mm); return; diff --git 
a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 4f566bea5e10..712ab91ced39 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -324,7 +324,7 @@ static int kvmppc_xive_native_mmap(struct kvm_device *dev, return -EINVAL; } - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); /* diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index d73b3b4176e8..b75a9fb99599 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -156,7 +156,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, * VM_NOHUGEPAGE and split them. */ for_each_vma_range(vmi, vma, addr + len) { - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_set(vma, VM_NOHUGEPAGE); walk_page_vma(vma, &subpage_walk_ops, NULL); } } diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index 9580e8e12165..36c21648d19a 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -525,7 +525,7 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) pfn = paste_addr >> PAGE_SHIFT; /* flags, page_prot from cxl_mmap(), except we want cachable */ - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_cached(vma->vm_page_prot); prot = __pgprot(pgprot_val(vma->vm_page_prot) | _PAGE_DIRTY); diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 62d90a5e23d1..02a8158c469d 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -291,7 +291,7 @@ static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); vma->vm_ops = &spufs_mem_mmap_vmops; @@ -381,7 +381,7 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_cntl_mmap_vmops; @@ -1043,7 +1043,7 @@ static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_signal1_mmap_vmops; @@ -1179,7 +1179,7 @@ static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_signal2_mmap_vmops; @@ -1302,7 +1302,7 @@ static int spufs_mss_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_mss_mmap_vmops; @@ -1364,7 +1364,7 @@ static int spufs_psmap_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - 
vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_psmap_mmap_vmops; @@ -1424,7 +1424,7 @@ static int spufs_mfc_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_mfc_mmap_vmops; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 69af6cdf1a2a..ab836597419d 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2522,8 +2522,7 @@ static inline void thp_split_mm(struct mm_struct *mm) VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 4af81df133ee..d234ca797e4a 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -391,7 +391,7 @@ void __init map_vsyscall(void) } if (vsyscall_mode == XONLY) - gate_vma.vm_flags = VM_EXEC; + vm_flags_init(&gate_vma, VM_EXEC); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index aa9b8b868867..262f5fb18d74 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -95,7 +95,7 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma->vm_ops = &sgx_vm_ops; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO); vma->vm_private_data = encl; return 0; diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 6a77a14eee38..c3e37eaec8ec 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -105,7 +105,7 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &sgx_vepc_vm_ops; /* Don't copy VMA in fork() */ - vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY); vma->vm_private_data = vepc; return 0; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index fb4b1b5e0dea..6ca51b1aa5d9 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1000,7 +1000,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, ret = reserve_pfn_range(paddr, size, prot, 0); if (ret == 0 && vma) - vma->vm_flags |= VM_PAT; + vm_flags_set(vma, VM_PAT); return ret; } @@ -1066,7 +1066,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, } free_pfn_range(paddr, size); if (vma) - vma->vm_flags &= ~VM_PAT; + vm_flags_clear(vma, VM_PAT); } /* @@ -1076,7 +1076,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, */ void untrack_pfn_moved(struct vm_area_struct *vma) { - vma->vm_flags &= ~VM_PAT; + vm_flags_clear(vma, VM_PAT); } pgprot_t pgprot_writecombine(pgprot_t prot) diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index cafd01f730da..29b2203bc82c 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -16,7 +16,7 @@ static int __init gate_vma_init(void) vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; 
gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC); gate_vma.vm_page_prot = PAGE_READONLY; return 0; diff --git a/drivers/acpi/pfr_telemetry.c b/drivers/acpi/pfr_telemetry.c index 27fb6cdad75f..843f678ade0c 100644 --- a/drivers/acpi/pfr_telemetry.c +++ b/drivers/acpi/pfr_telemetry.c @@ -310,7 +310,7 @@ pfrt_log_mmap(struct file *file, struct vm_area_struct *vma) return -EROFS; /* changing from read to write with mprotect is not allowed */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); pfrt_log_dev = to_pfrt_log_dev(file); diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 880224ec6abb..cb08982b9666 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5572,8 +5572,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM); return -EPERM; } - vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_mod(vma, VM_DONTCOPY | VM_MIXEDMAP, VM_MAYWRITE); vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f8231e2e84be..b35f651837c8 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -206,7 +206,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, refcount_set(&vdata->refcnt, 1); vma->vm_private_data = vdata; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); if (vdata->type == MSPEC_UNCACHED) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &mspec_vm_ops; diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 007ac7a69ce7..733fe1033910 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -2363,7 +2363,7 @@ static int hisi_qm_uacce_mmap(struct uacce_queue *q, return -EINVAL; } - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); return remap_pfn_range(vma, vma->vm_start, phys_base >> PAGE_SHIFT, diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 5494d745ced5..223e4e233d19 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -308,7 +308,7 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma) return rc; vma->vm_ops = &dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index e13e92609943..674bfefca088 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -201,7 +201,7 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) if (rc < 0) return rc; - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); pfn = (base + idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index bb7350ea1d75..a69fd6fdabb4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -257,7 +257,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str */ if (is_cow_mapping(vma->vm_flags) && !(vma->vm_flags & VM_ACCESS_FLAGS)) - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); return drm_gem_ttm_mmap(obj, vma); } diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 6d291aa6386b..d0933dd9af06 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -2879,8 +2879,8 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, address = dev->adev->rmmio_remap.bus_addr; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | - VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index cd4e61bf0493..cbef2e147da5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -159,8 +159,8 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, address = kfd_get_process_doorbells(pdd); if (!address) return -ENOMEM; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | - VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 729d26d648af..dd0436bf349a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -1052,8 +1052,8 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) pfn = __pa(page->kernel_address); pfn >>= PAGE_SHIFT; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE - | VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE + | VM_DONTDUMP | VM_PFNMAP); pr_debug("Mapping signal page\n"); pr_debug(" start user address == 0x%08lx\n", vma->vm_start); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 51b1683ac5c1..1fad0ecdfaeb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1978,8 +1978,8 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, return -ENOMEM; } - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND - | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP); /* Mapping pages to user process */ return remap_pfn_range(vma, vma->vm_start, PFN_DOWN(__pa(qpd->cwsr_kaddr)), diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index b8db675e7fb5..54c76003d2cc 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1047,7 +1047,7 @@ int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size, goto err_drm_gem_object_put; } - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); } diff --git a/drivers/gpu/drm/drm_gem_dma_helper.c b/drivers/gpu/drm/drm_gem_dma_helper.c index 1e658c448366..fb2c764accc6 100644 --- a/drivers/gpu/drm/drm_gem_dma_helper.c +++ b/drivers/gpu/drm/drm_gem_dma_helper.c @@ -530,8 +530,7 @@ int drm_gem_dma_mmap(struct drm_gem_dma_object *dma_obj, struct vm_area_struct * * the whole buffer. 
*/ vma->vm_pgoff -= drm_vma_node_start(&obj->vma_node); - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_mod(vma, VM_DONTEXPAND, VM_PFNMAP); if (dma_obj->map_noncoherent) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index b602cd72a120..a2c28483e010 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -633,7 +633,7 @@ int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct if (ret) return ret; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); if (shmem->map_wc) vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index f024dc93939e..87c9fe55dec7 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -476,7 +476,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) if (!capable(CAP_SYS_ADMIN) && (dma->flags & _DRM_DMA_USE_PCI_RO)) { - vma->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); #if defined(__i386__) || defined(__x86_64__) pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; #else @@ -492,7 +492,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &drm_vm_dma_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); drm_vm_open_locked(dev, vma); return 0; @@ -560,7 +560,7 @@ static int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) return -EINVAL; if (!capable(CAP_SYS_ADMIN) && (map->flags & _DRM_READ_ONLY)) { - vma->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); #if defined(__i386__) || defined(__x86_64__) pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; #else @@ -628,7 +628,7 @@ static int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) default: return -EINVAL; /* This should never happen. 
*/ } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); drm_vm_open_locked(dev, vma); return 0; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index c5ae5492e1af..b5f73502e3dd 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -130,7 +130,7 @@ static int etnaviv_gem_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, { pgprot_t vm_page_prot; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 3e493f48e0d4..638ca96830e9 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -274,7 +274,7 @@ static int exynos_drm_gem_mmap_buffer(struct exynos_drm_gem *exynos_gem, unsigned long vm_size; int ret; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_clear(vma, VM_PFNMAP); vma->vm_pgoff = 0; vm_size = vma->vm_end - vma->vm_start; @@ -368,7 +368,7 @@ static int exynos_drm_gem_mmap(struct drm_gem_object *obj, struct vm_area_struct if (obj->import_attach) return dma_buf_mmap(obj->dma_buf, vma, 0); - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); DRM_DEV_DEBUG_KMS(to_dma_dev(obj->dev), "flags = 0x%x\n", exynos_gem->flags); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 8d5a37b8f110..a9276c8a3e4e 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -139,7 +139,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma) */ vma->vm_ops = &psbfb_vm_ops; vma->vm_private_data = (void *)fb; - vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP); return 0; } diff --git a/drivers/gpu/drm/i810/i810_dma.c b/drivers/gpu/drm/i810/i810_dma.c index 9fb4dd63342f..01967dd88762 100644 --- a/drivers/gpu/drm/i810/i810_dma.c +++ b/drivers/gpu/drm/i810/i810_dma.c @@ -102,7 +102,7 @@ static int i810_mmap_buffers(struct file *filp, struct vm_area_struct *vma) buf = dev_priv->mmap_buffer; buf_priv = buf->dev_private; - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); buf_priv->currently_mapped = I810_BUF_MAPPED; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 0ad44f3868de..e95f4c729ca5 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -979,7 +979,7 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) i915_gem_object_put(obj); return -EINVAL; } - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } anon = mmap_singleton(to_i915(dev)); @@ -988,7 +988,7 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) return PTR_ERR(anon); } - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO); /* * We keep the ref on mmo->obj, not vm_file, but we require diff --git a/drivers/gpu/drm/mediatek/mtk_drm_gem.c b/drivers/gpu/drm/mediatek/mtk_drm_gem.c index 47e96b0289f9..28659514bf20 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_gem.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_gem.c @@ -158,7 +158,7 @@ static int mtk_drm_gem_object_mmap(struct drm_gem_object 
*obj, * dma_alloc_attrs() allocated a struct page table for mtk_gem, so clear * VM_PFNMAP flag that was set by drm_gem_mmap_obj()/drm_gem_mmap(). */ - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 1dee0d18abbb..c2fb98a94bc3 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -1012,7 +1012,7 @@ static int msm_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struct { struct msm_gem_object *msm_obj = to_msm_bo(obj); - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = msm_gem_pgprot(msm_obj, vm_get_page_prot(vma->vm_flags)); return 0; diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index cf571796fd26..19fef933904b 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -543,8 +543,7 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj, { struct omap_gem_object *omap_obj = to_omap_bo(obj); - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); if (omap_obj->flags & OMAP_BO_WC) { vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c index 6edb7c52cb3d..8ea09d915c3c 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c @@ -251,8 +251,7 @@ static int rockchip_drm_gem_object_mmap(struct drm_gem_object *obj, * We allocated a struct page table for rk_obj, so clear * VM_PFNMAP flag that was set by drm_gem_mmap_obj()/drm_gem_mmap(). */ - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_mod(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP, VM_PFNMAP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 979e7bc902f6..bce991a2ccc0 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -574,7 +574,7 @@ int __tegra_gem_mmap(struct drm_gem_object *gem, struct vm_area_struct *vma) * and set the vm_pgoff (used as a fake buffer offset by DRM) * to 0 as we want to map the whole buffer. 
*/ - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_clear(vma, VM_PFNMAP); vma->vm_pgoff = 0; err = dma_mmap_wc(gem->dev->dev, vma, bo->vaddr, bo->iova, @@ -588,8 +588,7 @@ int __tegra_gem_mmap(struct drm_gem_object *gem, struct vm_area_struct *vma) } else { pgprot_t prot = vm_get_page_prot(vma->vm_flags); - vma->vm_flags |= VM_MIXEDMAP; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); vma->vm_page_prot = pgprot_writecombine(prot); } diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 5a3e4b891377..c00207582c74 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -468,8 +468,7 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo) vma->vm_private_data = bo; - vma->vm_flags |= VM_PFNMAP; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_DONTDUMP); return 0; } EXPORT_SYMBOL(ttm_bo_mmap_obj); diff --git a/drivers/gpu/drm/virtio/virtgpu_vram.c b/drivers/gpu/drm/virtio/virtgpu_vram.c index 6b45b0429fef..25df81c02783 100644 --- a/drivers/gpu/drm/virtio/virtgpu_vram.c +++ b/drivers/gpu/drm/virtio/virtgpu_vram.c @@ -46,7 +46,7 @@ static int virtio_gpu_vram_mmap(struct drm_gem_object *obj, return -EINVAL; vma->vm_pgoff -= drm_vma_node_start(&obj->vma_node); - vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND; + vm_flags_set(vma, VM_MIXEDMAP | VM_DONTEXPAND); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); vma->vm_ops = &virtio_gpu_vram_vm_ops; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c index 265f7c48d856..90097d04b45f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c @@ -97,7 +97,7 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma) /* Use VM_PFNMAP rather than VM_MIXEDMAP if not a COW mapping */ if (!is_cow_mapping(vma->vm_flags)) - vma->vm_flags = (vma->vm_flags & ~VM_MIXEDMAP) | VM_PFNMAP; + vm_flags_mod(vma, VM_PFNMAP, VM_MIXEDMAP); ttm_bo_put(bo); /* release extra ref taken by ttm_bo_mmap_obj() */ diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c index 4c95ebcdcc2d..3ad2b4cfd1f0 100644 --- a/drivers/gpu/drm/xen/xen_drm_front_gem.c +++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c @@ -69,8 +69,7 @@ static int xen_drm_front_gem_object_mmap(struct drm_gem_object *gem_obj, * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map * the whole buffer. 
*/ - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND; + vm_flags_mod(vma, VM_MIXEDMAP | VM_DONTEXPAND, VM_PFNMAP); vma->vm_pgoff = 0; /* diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c index 8069f795c864..daa8e1bff5d9 100644 --- a/drivers/hsi/clients/cmt_speech.c +++ b/drivers/hsi/clients/cmt_speech.c @@ -1264,7 +1264,7 @@ static int cs_char_mmap(struct file *file, struct vm_area_struct *vma) if (vma_pages(vma) != 1) return -EINVAL; - vma->vm_flags |= VM_IO | VM_DONTDUMP | VM_DONTEXPAND; + vm_flags_set(vma, VM_IO | VM_DONTDUMP | VM_DONTEXPAND); vma->vm_ops = &cs_char_vm_ops; vma->vm_private_data = file->private_data; diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index 6c8215a47a60..9621efe0e95c 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1659,7 +1659,7 @@ out: atomic_dec(&msc->user_count); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTCOPY; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY); vma->vm_ops = &msc_mmap_ops; return ret; } diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 2712e699ba08..534fbefc7f6a 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -715,7 +715,7 @@ static int stm_char_mmap(struct file *file, struct vm_area_struct *vma) pm_runtime_get_sync(&stm->dev); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &stm_mmap_vmops; vm_iomap_memory(vma, phys, size); diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index f5f9269fdc16..c6e59bc480f9 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -403,7 +403,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) ret = -EPERM; goto done; } - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); addr = vma->vm_start; for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { memlen = uctxt->egrbufs.buffers[i].len; @@ -528,7 +528,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) goto done; } - vma->vm_flags = flags; + vm_flags_reset(vma, flags); hfi1_cdbg(PROC, "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n", ctxt, subctxt, type, mapio, vmf, memaddr, memlen, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c669ef6e47e7..e3c97aa2c46c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2087,7 +2087,7 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (!dev->mdev->clock_info) return -EOPNOTSUPP; @@ -2311,7 +2311,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 3937144b2ae5..80fe92a21f96 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -733,7 +733,7 @@ static int qib_mmap_mem(struct 
vm_area_struct *vma, struct qib_ctxtdata *rcd, } /* don't allow them to later change with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; @@ -769,7 +769,7 @@ static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd, phys = dd->physaddr + ureg; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, vma->vm_end - vma->vm_start, @@ -810,8 +810,7 @@ static int mmap_piobufs(struct vm_area_struct *vma, * don't allow them to later change to readable with mprotect (for when * not initially mapped readable, as is normally the case) */ - vma->vm_flags &= ~VM_MAYREAD; - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND, VM_MAYREAD); /* We used PAT if wc_cookie == 0 */ if (!dd->wc_cookie) @@ -852,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, goto bail; } /* don't allow them to later change to writable with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); start = vma->vm_start; @@ -944,7 +943,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, * Don't allow permission to later change to writable * with mprotect. */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } else goto bail; len = vma->vm_end - vma->vm_start; @@ -955,7 +954,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &qib_file_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); ret = 1; bail: diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 6e8c4fbb8083..6289238cc5af 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -672,7 +672,7 @@ int usnic_ib_mmap(struct ib_ucontext *context, usnic_dbg("\n"); us_ibdev = to_usdev(context->device); - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vfid = vma->vm_pgoff; usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 19176583dbde..9f54aa90a35a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -408,7 +408,7 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) } /* Map UAR to kernel space, VM_LOCKED? 
*/ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, start, context->uar.pfn, size, vma->vm_page_prot)) diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index 5f1175f8b349..205d3cac425c 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -293,7 +293,7 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma) return ret; } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = &buf->handler; vma->vm_ops = &vb2_common_vm_ops; diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index 959b45beb1f3..a6c6d2fcaaa4 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -185,7 +185,7 @@ static int vb2_vmalloc_mmap(void *buf_priv, struct vm_area_struct *vma) /* * Make sure that vm_areas for 2 buffers won't be merged together */ - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); /* * Use common vm_area operations to track buffer refcount. diff --git a/drivers/media/v4l2-core/videobuf-dma-contig.c b/drivers/media/v4l2-core/videobuf-dma-contig.c index f2c439359557..4c2ec7a0d804 100644 --- a/drivers/media/v4l2-core/videobuf-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf-dma-contig.c @@ -314,7 +314,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = map; dev_dbg(q->dev, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 234e9f647c96..53001532e8e3 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -630,8 +630,8 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, map->count = 1; map->q = q; vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ + /* using shared anonymous pages */ + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_IO); vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", map, q, vma->vm_start, vma->vm_end, vma->vm_pgoff, first, last); diff --git a/drivers/media/v4l2-core/videobuf-vmalloc.c b/drivers/media/v4l2-core/videobuf-vmalloc.c index 9b2443720ab0..85c7090606d6 100644 --- a/drivers/media/v4l2-core/videobuf-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf-vmalloc.c @@ -247,7 +247,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index acaa44809c58..76b5ea66dfa1 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -220,7 +220,7 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma) pr_devel("%s: mmio physical: %llx pe: %i master:%i\n", __func__, ctx->psn_phys, ctx->pe , ctx->master); - 
vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &cxl_mmap_vmops; return 0; diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 5e9ae7600d75..6bb44a3ad5e6 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -2082,7 +2082,7 @@ static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, v { struct hl_ts_buff *ts_buff = buf->private; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE); return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0); } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 9f5e208701ba..3b0afdc50ff9 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -4236,8 +4236,8 @@ static int gaudi_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, (dma_addr - HOST_PHYS_BASE), size); diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index e793fb2bdcbe..65502ec02bc0 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -5538,8 +5538,8 @@ static int gaudi2_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); #ifdef _HAS_DMA_MMAP_COHERENT @@ -10116,8 +10116,8 @@ static int gaudi2_block_mmap(struct hl_device *hdev, struct vm_area_struct *vma, address = pci_resource_start(hdev->pdev, SRAM_CFG_BAR_ID) + offset_in_bar; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, block_size, vma->vm_page_prot); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 0f083fcf81a6..2a15a305d01b 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2880,8 +2880,8 @@ static int goya_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, (dma_addr - HOST_PHYS_BASE), size); diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index 9eb0d93b01c6..7f83116ae11a 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -180,7 +180,7 @@ static int check_mmap_afu_irq(struct ocxl_context *ctx, if ((vma->vm_flags & VM_READ) || (vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_WRITE)) return -EINVAL; - vma->vm_flags &= ~(VM_MAYREAD | VM_MAYEXEC); + vm_flags_clear(vma, VM_MAYREAD | VM_MAYEXEC); return 0; } @@ -204,7 +204,7 @@ int ocxl_context_mmap(struct ocxl_context 
*ctx, struct vm_area_struct *vma) if (rc) return rc; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &ocxl_vmops; return 0; diff --git a/drivers/misc/ocxl/sysfs.c b/drivers/misc/ocxl/sysfs.c index 25c78df8055d..405180d47d9b 100644 --- a/drivers/misc/ocxl/sysfs.c +++ b/drivers/misc/ocxl/sysfs.c @@ -134,7 +134,7 @@ static int global_mmio_mmap(struct file *filp, struct kobject *kobj, (afu->config.global_mmio_size >> PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &global_mmio_vmops; vma->vm_private_data = afu; diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c index 9dda47b3fd70..8aea2d070a40 100644 --- a/drivers/misc/open-dice.c +++ b/drivers/misc/open-dice.c @@ -95,12 +95,12 @@ static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) return -EPERM; /* Ensure userspace cannot acquire VM_WRITE later. */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* Create write-combine mapping so all clients observe a wipe. */ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTDUMP); return vm_iomap_memory(vma, drvdata->rmem->base, drvdata->rmem->size); } diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index 7ffcfc0bb587..a3d659c11cc4 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -101,8 +101,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED | - VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_LOCKED | + VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = PAGE_SHARED; vma->vm_ops = &gru_vm_ops; diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index 905eff1f840e..b65ab440a19e 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -229,7 +229,7 @@ static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma) if (!qfr) return -ENOMEM; - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK); vma->vm_ops = &uacce_vm_ops; vma->vm_private_data = q; qfr->type = type; diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index 21b7cb6e7e70..e300cf26bc2a 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -389,7 +389,7 @@ static int dax_devmap(struct file *f, struct vm_area_struct *vma) /* completion area is mapped read-only for user */ if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (remap_pfn_range(vma, vma->vm_start, ctx->ca_buf_ra >> PAGE_SHIFT, len, vma->vm_page_prot)) diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c index 631eda2d467e..6542818e595a 100644 --- a/drivers/scsi/cxlflash/ocxl_hw.c +++ b/drivers/scsi/cxlflash/ocxl_hw.c @@ -1167,7 +1167,7 @@ static int afu_mmap(struct file *file, struct vm_area_struct *vma) (ctx->psn_size >> PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 
vma->vm_ops = &ocxlflash_vmops; return 0; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ff9854f59964..a91049213203 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1288,7 +1288,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) } sfp->mmap_called = 1; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; out: diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index 5e53eed8ae95..095cd0ba8c21 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -1072,7 +1072,7 @@ int hmm_bo_mmap(struct vm_area_struct *vma, struct hmm_buffer_object *bo) vma->vm_private_data = bo; vma->vm_ops = &hmm_bo_vm_ops; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); /* * call hmm_bo_vm_open explicitly. diff --git a/drivers/staging/media/deprecated/meye/meye.c b/drivers/staging/media/deprecated/meye/meye.c index 5d87efd9b95c..746c6ea1c0a7 100644 --- a/drivers/staging/media/deprecated/meye/meye.c +++ b/drivers/staging/media/deprecated/meye/meye.c @@ -1476,8 +1476,8 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma) } vma->vm_ops = &meye_vm_ops; - vma->vm_flags &= ~VM_IO; /* not I/O memory */ - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + /* not I/O memory */ + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_IO); vma->vm_private_data = (void *) (offset / gbufsize); meye_vm_open(vma); diff --git a/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c b/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c index 787edb3d47c2..a1b7ad350a90 100644 --- a/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c +++ b/drivers/staging/media/deprecated/stkwebcam/stk-webcam.c @@ -779,7 +779,7 @@ static int v4l_stk_mmap(struct file *fp, struct vm_area_struct *vma) ret = remap_vmalloc_range(vma, sbuf->buffer, 0); if (ret) return ret; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = sbuf; vma->vm_ops = &stk_v4l_vm_ops; sbuf->v4lbuf.flags |= V4L2_BUF_FLAG_MAPPED; diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 2940559c3086..15ffc8d2ac7b 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1928,7 +1928,7 @@ static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &tcmu_vm_ops; vma->vm_private_data = udev; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 43afbb7c5ab9..62082d64ece0 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -713,7 +713,7 @@ static const struct vm_operations_struct uio_logical_vm_ops = { static int uio_mmap_logical(struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &uio_logical_vm_ops; return 0; } diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 837f3e57f580..e501a03d6c70 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -279,8 +279,7 @@ static int usbdev_mmap(struct file *file, struct vm_area_struct *vma) } } - vma->vm_flags |= VM_IO; - vma->vm_flags |= (VM_DONTEXPAND | VM_DONTDUMP); + 
vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &usbdev_vm_ops; vma->vm_private_data = usbm; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 094e812e9e69..abb1cd35d8a6 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1272,8 +1272,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_MAYWRITE); vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); return 0; diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index e682bc7ee6c9..5e4a77b9bae6 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -512,7 +512,7 @@ static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma) { struct vduse_iova_domain *domain = file->private_data; - vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTDUMP | VM_DONTEXPAND); vma->vm_private_data = domain; vma->vm_ops = &vduse_domain_mmap_ops; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 26a541cc64d1..c49f8f2b2865 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1799,7 +1799,7 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * See remap_pfn_range(), called from vfio_pci_fault() but we can't * change vm_flags within the fault handler. Set them now. */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vfio_pci_mmap_ops; return 0; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index ec32f785dfde..9c5010ee20da 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1315,7 +1315,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start != notify.size) return -ENOTSUPP; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vhost_vdpa_vm_ops; return 0; } diff --git a/drivers/video/fbdev/68328fb.c b/drivers/video/fbdev/68328fb.c index 7db03ed77c76..41df61b37a18 100644 --- a/drivers/video/fbdev/68328fb.c +++ b/drivers/video/fbdev/68328fb.c @@ -391,7 +391,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma) #ifndef MMU /* this is uClinux (no MMU) specific code */ - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_start = videomemory; return 0; diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index c730253ab85c..dc310c7b5769 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -232,9 +232,9 @@ static const struct address_space_operations fb_deferred_io_aops = { int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_ops = &fb_deferred_io_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); if (!(info->flags & FBINFO_VIRTFB)) - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); vma->vm_private_data = info; return 0; } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index a15729beb9d1..26ffb8755ffb 100644 --- a/drivers/xen/gntalloc.c +++ 
b/drivers/xen/gntalloc.c @@ -525,7 +525,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &gntalloc_vmops; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 4d9a3050de6a..61faea1f0663 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1055,10 +1055,10 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP); if (use_ptemod) - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); vma->vm_private_data = map; if (map->flags) { diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c index dd5bbb6e1b6b..2fa10ca5be14 100644 --- a/drivers/xen/privcmd-buf.c +++ b/drivers/xen/privcmd-buf.c @@ -156,7 +156,7 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) vma_priv->file_priv = file_priv; vma_priv->users = 1; - vma->vm_flags |= VM_IO | VM_DONTEXPAND; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND); vma->vm_ops = &privcmd_buf_vm_ops; vma->vm_private_data = vma_priv; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1edf45ee9890..e2f580e30a86 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -934,8 +934,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; diff --git a/fs/aio.c b/fs/aio.c index 562916d85cba..5a88caf52be4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -390,7 +390,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = { static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_ops = &aio_ring_vm_ops; return 0; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 50e4e060db68..45a65c400bd0 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -408,7 +408,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) * unpopulated ptes via cramfs_read_folio(). 
*/ int i; - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f57f921683d7..f32d65987578 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -429,7 +429,7 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &erofs_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } #else diff --git a/fs/exec.c b/fs/exec.c index c0df813d2b45..d2e2a15e5cfe 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -270,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); @@ -834,7 +834,7 @@ int setup_arg_pages(struct linux_binprm *bprm, } /* mprotect_fixup is overkill to remove the temporary stack flags */ - vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7ac0a81bd371..6bdf61a62c79 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -801,7 +801,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index e23e802a8013..8e74f278a3f6 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -860,7 +860,7 @@ int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &fuse_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vm_flags_set(vma, VM_MIXEDMAP | VM_HUGEPAGE); return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 48f1a8ad2243..44ecdcb796cc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -132,7 +132,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_future_write(info->seals, vma); @@ -811,7 +811,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as input to create an allocation policy. 
*/ vma_init(&pseudo_vma, mm); - pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); + vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; for (index = start; index < end; index++) { diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 167fa43b24f9..a5e1ea8b7119 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -389,8 +389,7 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) "orangefs_file_mmap: called on %pD\n", file); /* set the sequential readahead hint */ - vma->vm_flags |= VM_SEQ_READ; - vma->vm_flags &= ~VM_RAND_READ; + vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ); file_accessed(file); vma->vm_ops = &orangefs_file_vm_ops; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a944e1816364..6a96e1713fd5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1299,7 +1299,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - vma->vm_flags &= ~VM_SOFTDIRTY; + vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 09a81e4b1273..12af614f33ce 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -582,8 +582,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_MAYWRITE | VM_MAYEXEC); vma->vm_ops = &vmcore_mmap_ops; len = 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f3c75c6222de..44d1ee429eb0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -113,7 +113,7 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, { const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; - vma->vm_flags = flags; + vm_flags_reset(vma, flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 595a5bcf46b9..b0039a8fea2e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1429,7 +1429,7 @@ xfs_file_mmap( file_accessed(file); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(inode)) - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 663726ca2240..ce6d9d765aae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3653,7 +3653,7 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) * VM_MAYWRITE as we still want them to be COW-writable. 
*/ if (vma->vm_flags & VM_SHARED) - vma->vm_flags &= ~(VM_MAYWRITE); + vm_flags_clear(vma, VM_MAYWRITE); } return 0; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 80f4b4d88aaf..8732e0aadf36 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -269,7 +269,7 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, @@ -290,7 +290,7 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma */ return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64131f88c553..9f56b442daa9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -882,10 +882,10 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; - vma->vm_flags &= ~VM_MAYEXEC; + vm_flags_clear(vma, VM_MAYEXEC); if (!(vma->vm_flags & VM_WRITE)) /* disallow re-mapping with PROT_WRITE */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); if (err) diff --git a/kernel/events/core.c b/kernel/events/core.c index d56328e5080e..55a82f12a42c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6573,7 +6573,7 @@ aux_unlock: * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) diff --git a/kernel/kcov.c b/kernel/kcov.c index e5cd09fd8a05..84c717337df0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -489,7 +489,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) goto exit; } spin_unlock_irqrestore(&kcov->lock, flags); - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); res = vm_insert_page(vma, vma->vm_start + off, page); diff --git a/kernel/relay.c b/kernel/relay.c index ef12532168d9..9aa70ae53d24 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &relay_file_mmap_ops; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = buf; return 0; diff --git a/mm/madvise.c b/mm/madvise.c index ca672e37b38c..5a5a687d03c2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -176,7 +176,7 @@ success: /* * vm_flags is protected by the mmap_lock held in write mode. 
*/ - vma->vm_flags = new_flags; + vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) diff --git a/mm/memory.c b/mm/memory.c index 029f838587d1..4354b7987f36 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1928,7 +1928,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); @@ -1986,7 +1986,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); } return insert_page(vma, addr, page, vma->vm_page_prot); } @@ -2452,7 +2452,7 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/mlock.c b/mm/mlock.c index 5c4fff93cd6b..ed49459e343e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -380,7 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; - WRITE_ONCE(vma->vm_flags, newflags); + vm_flags_reset(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); @@ -388,7 +388,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, if (newflags & VM_IO) { newflags &= ~VM_IO; - WRITE_ONCE(vma->vm_flags, newflags); + vm_flags_reset(vma, newflags); } } @@ -457,7 +457,7 @@ success: if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ - vma->vm_flags = newflags; + vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); } diff --git a/mm/mmap.c b/mm/mmap.c index 03d7c37c5969..33c638c7ec04 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2555,7 +2555,7 @@ cannot_expand: vma_iter_set(&vmi, addr); vma->vm_start = addr; vma->vm_end = end; - vma->vm_flags = vm_flags; + vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; @@ -2683,7 +2683,7 @@ expanded: * then new mapped in-place (which must be aimed as * a completely new data area). 
*/ - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); @@ -2909,7 +2909,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, init_vma_prep(&vp, vma); vma_prepare(&vp); vma->vm_end = addr + len; - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); vma_iter_store(vmi, vma); vma_complete(&vp, vmi, mm); @@ -2926,7 +2926,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = addr >> PAGE_SHIFT; - vma->vm_flags = flags; + vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -2938,7 +2938,7 @@ out: mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); validate_mm(mm); return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index cce6a0e58fb5..1d4843c97c2a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -670,7 +670,7 @@ success: * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ - vma->vm_flags = newflags; + vm_flags_reset(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); diff --git a/mm/mremap.c b/mm/mremap.c index d70d8063c6e2..411a85682b58 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -662,7 +662,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { - vma->vm_flags &= ~VM_ACCOUNT; + vm_flags_clear(vma, VM_ACCOUNT); if (vma->vm_start < old_addr) account_start = vma->vm_start; if (vma->vm_end > old_addr + old_len) @@ -719,12 +719,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Restore VM_ACCOUNT if one or two pieces of vma left */ if (account_start) { vma = vma_prev(&vmi); - vma->vm_flags |= VM_ACCOUNT; + vm_flags_set(vma, VM_ACCOUNT); } if (account_end) { vma = vma_next(&vmi); - vma->vm_flags |= VM_ACCOUNT; + vm_flags_set(vma, VM_ACCOUNT); } return new_addr; diff --git a/mm/nommu.c b/mm/nommu.c index 9a166738909e..57ba243c6a37 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -173,7 +173,7 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) mmap_write_lock(current->mm); vma = find_vma(current->mm, (unsigned long)ret); if (vma) - vma->vm_flags |= VM_USERMAP; + vm_flags_set(vma, VM_USERMAP); mmap_write_unlock(current->mm); } @@ -950,7 +950,8 @@ static int do_mmap_private(struct vm_area_struct *vma, atomic_long_add(total, &mmap_pages_allocated); - region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); + region->vm_flags = vma->vm_flags; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + len; region->vm_top = region->vm_start + (total << PAGE_SHIFT); @@ -1047,7 +1048,7 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - vma->vm_flags = vm_flags; + vm_flags_init(vma, vm_flags); vma->vm_pgoff = pgoff; if (file) { @@ -1111,7 +1112,7 @@ unsigned long do_mmap(struct file *file, vma->vm_end = start + len; if (pregion->vm_flags & VM_MAPPED_COPY) - vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); else { ret = do_mmap_shared_file(vma); if (ret < 0) { @@ -1601,7 +1602,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != 
(pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/secretmem.c b/mm/secretmem.c index be3fff86ba00..8453ada8f41d 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -128,7 +128,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; - vma->vm_flags |= VM_LOCKED | VM_DONTDUMP; + vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); vma->vm_ops = &secretmem_vm_ops; return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 9e1015cbad29..732969afabd1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2304,7 +2304,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return ret; /* arm64 - allow memory tagging on RAM-based files */ - vma->vm_flags |= VM_MTE_ALLOWED; + vm_flags_set(vma, VM_MTE_ALLOWED); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9b71ec3213cb..ff4d7dfdf84a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3596,7 +3596,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, size -= PAGE_SIZE; } while (size > 0); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f713c0422f0f..7db45cdc3e1a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1890,10 +1890,10 @@ int tcp_mmap(struct file *file, struct socket *sock, { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); vma->vm_ops = &tcp_vm_ops; return 0; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 0a6894cdc54d..18498979a640 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -262,7 +262,7 @@ static int sel_mmap_handle_status(struct file *filp, if (vma->vm_flags & VM_WRITE) return -EPERM; /* disallow mprotect() turns it into writable */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); return remap_pfn_range(vma, vma->vm_start, page_to_pfn(status), @@ -506,13 +506,13 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) { /* do not allow mprotect to make mapping writable */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (vma->vm_flags & VM_WRITE) return -EACCES; } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &sel_mmap_policy_ops; return 0; diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index ac2efeb63a39..728c211142d1 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -2910,7 +2910,7 @@ static int snd_pcm_oss_mmap(struct file *file, struct vm_area_struct *area) } /* set VM_READ access as well to fix memset() routines that do reads before writes (to improve performance) */ - area->vm_flags |= VM_READ; + vm_flags_set(area, VM_READ); if (substream == NULL) return -ENXIO; runtime = substream->runtime; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 9c122e757efe..331380c2438b 100644 --- a/sound/core/pcm_native.c 
+++ b/sound/core/pcm_native.c @@ -3675,8 +3675,9 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_status; area->vm_private_data = substream; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - area->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_mod(area, VM_DONTEXPAND | VM_DONTDUMP, + VM_WRITE | VM_MAYWRITE); + return 0; } @@ -3712,7 +3713,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_control; area->vm_private_data = substream; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); return 0; } @@ -3828,7 +3829,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *area) { - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); if (!substream->ops->page && !snd_dma_buffer_mmap(snd_pcm_get_dma_buf(substream), area)) return 0; diff --git a/sound/soc/pxa/mmp-sspa.c b/sound/soc/pxa/mmp-sspa.c index fb5a4390443f..b3c1744eff91 100644 --- a/sound/soc/pxa/mmp-sspa.c +++ b/sound/soc/pxa/mmp-sspa.c @@ -404,7 +404,7 @@ static int mmp_pcm_mmap(struct snd_soc_component *component, struct snd_pcm_substream *substream, struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return remap_pfn_range(vma, vma->vm_start, substream->dma_buffer.addr >> PAGE_SHIFT, diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index e558931cce16..709ccad972e2 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -224,9 +224,9 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw, } area->vm_ops = &usb_stream_hwdep_vm_ops; - area->vm_flags |= VM_DONTDUMP; + vm_flags_set(area, VM_DONTDUMP); if (!read) - area->vm_flags |= VM_DONTEXPAND; + vm_flags_set(area, VM_DONTEXPAND); area->vm_private_data = us122l; atomic_inc(&us122l->mmap_count); out: diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index c29da0341bc5..4937ede0b5d7 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -61,7 +61,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep *hw, struct file *filp, struct vm } area->vm_ops = &us428ctls_vm_ops; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); area->vm_private_data = hw->private_data; return 0; } diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 767a227d54da..36f2e31168fb 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -706,7 +706,7 @@ static int snd_usx2y_hwdep_pcm_mmap(struct snd_hwdep *hw, struct file *filp, str return -ENODEV; area->vm_ops = &snd_usx2y_hwdep_pcm_vm_ops; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); area->vm_private_data = hw->private_data; return 0; } From ff126c0ece69de1c12d2f6e77ec77df997dd19e6 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:50 -0800 Subject: [PATCH 393/505] mm: replace vma->vm_flags indirect modification in ksm_madvise Replace indirect modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. 
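As a minimal sketch of the calling convention this patch converts to (illustrative only, and assuming the caller already holds mmap_lock for writing as vm_flags_reset() expects), a ksm_madvise() user now lets the helper update a local copy of the flags and publishes the result in one step:

	unsigned long vm_flags = vma->vm_flags;	/* local copy, so ksm_madvise() cannot partially update the VMA */
	int ret;

	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
			  MADV_UNMERGEABLE, &vm_flags);
	if (!ret)
		vm_flags_reset(vma, vm_flags);	/* publish the whole flags word at once */

The full conversions for the powerpc and s390 call sites follow below.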
Link: https://lkml.kernel.org/r/20230126193752.297968-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Acked-by: Michael Ellerman [powerpc] Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 6 +++++- arch/s390/mm/gmap.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 1d67baa5557a..709ebd578394 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -393,6 +393,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, { unsigned long gfn = memslot->base_gfn; unsigned long end, start = gfn_to_hva(kvm, gfn); + unsigned long vm_flags; int ret = 0; struct vm_area_struct *vma; int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE; @@ -409,12 +410,15 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, ret = H_STATE; break; } + /* Copy vm_flags to avoid partial modifications in ksm_madvise */ + vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - merge_flag, &vma->vm_flags); + merge_flag, &vm_flags); if (ret) { ret = H_STATE; break; } + vm_flags_reset(vma, vm_flags); start = vma->vm_end; } while (end > vma->vm_end); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index ab836597419d..5a716bdcba05 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2587,14 +2587,18 @@ int gmap_mark_unmergeable(void) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + unsigned long vm_flags; int ret; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { + /* Copy vm_flags to avoid partial modifications in ksm_madvise */ + vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags); + MADV_UNMERGEABLE, &vm_flags); if (ret) return ret; + vm_flags_reset(vma, vm_flags); } mm->def_flags &= ~VM_MERGEABLE; return 0; From 68f48381d7fdd1cbb9d88c37a4dfbb98ac78226d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:51 -0800 Subject: [PATCH 394/505] mm: introduce __vm_flags_mod and use it in untrack_pfn There are scenarios when vm_flags can be modified without exclusive mmap_lock, such as: - after VMA was isolated and mmap_lock was downgraded or dropped - in exit_mmap when there are no other mm users and locking is unnecessary Introduce __vm_flags_mod to avoid assertions when the caller takes responsibility for the required locking. Pass a hint to untrack_pfn to conditionally use __vm_flags_mod for flags modification to avoid assertion. 
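To sketch the resulting call-site shape (illustrative; the real hunks are in the diff below), the x86 PAT code clears VM_PAT through the asserting helper when the new hint says mmap_lock is write-locked, and through the bare variant otherwise:

	if (vma) {
		if (mm_wr_locked)
			vm_flags_clear(vma, VM_PAT);	/* asserts mmap_lock is held for write */
		else
			__vm_flags_mod(vma, 0, VM_PAT);	/* no assertion; caller guarantees exclusive access */
	}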
Link: https://lkml.kernel.org/r/20230126193752.297968-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/mm/pat/memtype.c | 10 +++++++--- include/linux/mm.h | 14 ++++++++++++-- include/linux/pgtable.h | 5 +++-- mm/memory.c | 13 +++++++------ mm/memremap.c | 4 ++-- mm/mmap.c | 16 ++++++++++------ 6 files changed, 41 insertions(+), 21 deletions(-) diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 6ca51b1aa5d9..691bf8934b6f 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1046,7 +1046,7 @@ void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) * can be for the entire vma (in which case pfn, size are zero). */ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) + unsigned long size, bool mm_wr_locked) { resource_size_t paddr; unsigned long prot; @@ -1065,8 +1065,12 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - if (vma) - vm_flags_clear(vma, VM_PAT); + if (vma) { + if (mm_wr_locked) + vm_flags_clear(vma, VM_PAT); + else + __vm_flags_mod(vma, 0, VM_PAT); + } } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index ce6d9d765aae..27b34f7730e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -656,6 +656,16 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } +/* + * Use only if VMA is not part of the VMA tree or has no other users and + * therefore needs no locking. + */ +static inline void __vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. @@ -664,7 +674,7 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { mmap_assert_write_locked(vma->vm_mm); - vm_flags_init(vma, (vma->vm_flags | set) & ~clear); + __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) @@ -2085,7 +2095,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) } void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end); + unsigned long end, bool mm_wr_locked); struct mmu_notifier_range; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5fd45454c073..c63cd44777ec 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1185,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct *vma) * can be for the entire vma (in which case pfn, size are zero). 
*/ static inline void untrack_pfn(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long size, + bool mm_wr_locked) { } @@ -1203,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); + unsigned long size, bool mm_wr_locked); extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif diff --git a/mm/memory.c b/mm/memory.c index 4354b7987f36..ce1d84d022d3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1613,7 +1613,7 @@ void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, - struct zap_details *details) + struct zap_details *details, bool mm_wr_locked) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; @@ -1628,7 +1628,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0); + untrack_pfn(vma, 0, 0, mm_wr_locked); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { @@ -1675,7 +1675,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr) + unsigned long end_addr, bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { @@ -1689,7 +1689,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { - unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + unmap_single_vma(tlb, vma, start_addr, end_addr, &details, + mm_wr_locked); } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1723,7 +1724,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. 
*/ - unmap_single_vma(&tlb, vma, address, end, details); + unmap_single_vma(&tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -2492,7 +2493,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); return err; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/memremap.c b/mm/memremap.c index 08cbf54fe037..2f88f43d4a01 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -129,7 +129,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); pgmap_array_delete(range); } @@ -276,7 +276,7 @@ err_add_memory: if (!is_private) kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); err_pfn_remap: pgmap_array_delete(range); return error; diff --git a/mm/mmap.c b/mm/mmap.c index 33c638c7ec04..20f21f0949dd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -78,7 +78,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, unsigned long start, - unsigned long end); + unsigned long end, bool mm_wr_locked); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -2133,14 +2133,14 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool mm_wr_locked) { struct mmu_gather tlb; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mt, vma, start, end); + unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); @@ -2388,7 +2388,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, mmap_write_downgrade(mm); } - unmap_region(mm, &mt_detach, vma, prev, next, start, end); + /* + * We can free page tables without write-locking mmap_lock because VMAs + * were isolated before we downgraded mmap_lock. + */ + unmap_region(mm, &mt_detach, vma, prev, next, start, end, !downgrade); /* Statistics and freeing VMAs */ mas_set(&mas_detach, start); remove_mt(mm, &mas_detach); @@ -2701,7 +2705,7 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. */ unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, - vma->vm_end); + vma->vm_end, true); } if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); @@ -3029,7 +3033,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false); mmap_read_unlock(mm); /* From c2fdc235300a027adc04a41b383bd78ab5da56f4 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:52 -0800 Subject: [PATCH 395/505] mm: export dump_mm() mmap_assert_write_locked() is used in vm_flags modifiers. Because mmap_assert_write_locked() uses dump_mm() and vm_flags are sometimes modified from inside a module, it's necessary to export dump_mm() function. Link: https://lkml.kernel.org/r/20230126193752.297968-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Sebastian Reichel Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/debug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/debug.c b/mm/debug.c index 9d3d893dc7f4..96d594e16292 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -215,6 +215,7 @@ void dump_mm(const struct mm_struct *mm) mm->def_flags, &mm->def_flags ); } +EXPORT_SYMBOL(dump_mm); static bool page_init_poisoning __read_mostly = true; From 8f17febb34ceb464e3ff99e9436d0ae3f47b4862 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Sun, 29 Jan 2023 10:14:35 +0800 Subject: [PATCH 396/505] kasan: infer allocation size by scanning metadata Make KASAN scan metadata to infer the requested allocation size instead of printing cache->object_size. This patch fixes confusing slab-out-of-bounds reports as reported in: https://bugzilla.kernel.org/show_bug.cgi?id=216457 As an example of the confusing behavior, the report below hints that the allocation size was 192, while the kernel actually called kmalloc(184): ================================================================== BUG: KASAN: slab-out-of-bounds in _find_next_bit+0x143/0x160 lib/find_bit.c:109 Read of size 8 at addr ffff8880175766b8 by task kworker/1:1/26 ... The buggy address belongs to the object at ffff888017576600 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 184 bytes inside of 192-byte region [ffff888017576600, ffff8880175766c0) ... Memory state around the buggy address: ffff888017576580: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888017576600: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff888017576680: 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc ^ ffff888017576700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888017576780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== With this patch, the report shows: ================================================================== ... The buggy address belongs to the object at ffff888017576600 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 0 bytes to the right of allocated 184-byte region [ffff888017576600, ffff8880175766b8) ... 
================================================================== Also report slab use-after-free bugs as "slab-use-after-free" and print "freed" instead of "allocated" in the report when describing the accessed memory region. Also improve the metadata-related comment in kasan_find_first_bad_addr and use addr_has_metadata across KASAN code instead of open-coding KASAN_SHADOW_START checks. [akpm@linux-foundation.org: fix printk warning] Link: https://bugzilla.kernel.org/show_bug.cgi?id=216457 Link: https://lkml.kernel.org/r/20230129021437.18812-1-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Co-developed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Chinwen Chang Cc: Dmitry Vyukov Cc: Matthias Brugger Cc: Qun-Wei Lin Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 4 +--- mm/kasan/kasan.h | 2 ++ mm/kasan/report.c | 41 ++++++++++++++++++++++++++++----------- mm/kasan/report_generic.c | 32 +++++++++++++++++++++++++++++- mm/kasan/report_hw_tags.c | 35 ++++++++++++++++++++++++++++++++- mm/kasan/report_sw_tags.c | 26 +++++++++++++++++++++++++ mm/kasan/report_tags.c | 2 +- mm/kasan/sw_tags.c | 6 ++---- 8 files changed, 127 insertions(+), 21 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index b076f597a378..a37b5b57bf5c 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -172,10 +172,8 @@ static __always_inline bool check_region_inline(unsigned long addr, if (unlikely(addr + size < addr)) return !kasan_report(addr, size, write, ret_ip); - if (unlikely((void *)addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_metadata((void *)addr))) return !kasan_report(addr, size, write, ret_ip); - } if (likely(!memory_is_poisoned(addr, size))) return true; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 32413f22aa82..308fb70fd40a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -207,6 +207,7 @@ struct kasan_report_info { void *first_bad_addr; struct kmem_cache *cache; void *object; + size_t alloc_size; /* Filled in by the mode-specific reporting code. 
*/ const char *bug_type; @@ -323,6 +324,7 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void *kasan_find_first_bad_addr(void *addr, size_t size); +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache); void kasan_complete_mode_report_info(struct kasan_report_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 22598b20c7b7..89078f912827 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -231,33 +231,46 @@ static inline struct page *addr_to_page(const void *addr) return NULL; } -static void describe_object_addr(const void *addr, struct kmem_cache *cache, - void *object) +static void describe_object_addr(const void *addr, struct kasan_report_info *info) { unsigned long access_addr = (unsigned long)addr; - unsigned long object_addr = (unsigned long)object; - const char *rel_type; + unsigned long object_addr = (unsigned long)info->object; + const char *rel_type, *region_state = ""; int rel_bytes; pr_err("The buggy address belongs to the object at %px\n" " which belongs to the cache %s of size %d\n", - object, cache->name, cache->object_size); + info->object, info->cache->name, info->cache->object_size); if (access_addr < object_addr) { rel_type = "to the left"; rel_bytes = object_addr - access_addr; - } else if (access_addr >= object_addr + cache->object_size) { + } else if (access_addr >= object_addr + info->alloc_size) { rel_type = "to the right"; - rel_bytes = access_addr - (object_addr + cache->object_size); + rel_bytes = access_addr - (object_addr + info->alloc_size); } else { rel_type = "inside"; rel_bytes = access_addr - object_addr; } + /* + * Tag-Based modes use the stack ring to infer the bug type, but the + * memory region state description is generated based on the metadata. + * Thus, defining the region state as below can contradict the metadata. + * Fixing this requires further improvements, so only infer the state + * for the Generic mode. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) { + if (strcmp(info->bug_type, "slab-out-of-bounds") == 0) + region_state = "allocated "; + else if (strcmp(info->bug_type, "slab-use-after-free") == 0) + region_state = "freed "; + } + pr_err("The buggy address is located %d bytes %s of\n" - " %d-byte region [%px, %px)\n", - rel_bytes, rel_type, cache->object_size, (void *)object_addr, - (void *)(object_addr + cache->object_size)); + " %s%zu-byte region [%px, %px)\n", + rel_bytes, rel_type, region_state, info->alloc_size, + (void *)object_addr, (void *)(object_addr + info->alloc_size)); } static void describe_object_stacks(struct kasan_report_info *info) @@ -279,7 +292,7 @@ static void describe_object(const void *addr, struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) describe_object_stacks(info); - describe_object_addr(addr, info->cache, info->object); + describe_object_addr(addr, info); } static inline bool kernel_or_module_addr(const void *addr) @@ -436,6 +449,12 @@ static void complete_report_info(struct kasan_report_info *info) if (slab) { info->cache = slab->slab_cache; info->object = nearest_obj(info->cache, slab, addr); + + /* Try to determine allocation size based on the metadata. */ + info->alloc_size = kasan_get_alloc_size(info->object, info->cache); + /* Fallback to the object size if failed. 
*/ + if (!info->alloc_size) + info->alloc_size = info->cache->object_size; } else info->cache = info->object = NULL; diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 043c94b04605..87d39bc0a673 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -43,6 +43,34 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) return p; } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + u8 *shadow; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + shadow = (u8 *)kasan_mem_to_shadow(object); + while (size < cache->object_size) { + if (*shadow == 0) + size += KASAN_GRANULE_SIZE; + else if (*shadow >= 1 && *shadow <= KASAN_GRANULE_SIZE - 1) + return size + *shadow; + else + return size; + shadow++; + } + + return cache->object_size; +} + static const char *get_shadow_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; @@ -79,9 +107,11 @@ static const char *get_shadow_bug_type(struct kasan_report_info *info) bug_type = "stack-out-of-bounds"; break; case KASAN_PAGE_FREE: + bug_type = "use-after-free"; + break; case KASAN_SLAB_FREE: case KASAN_SLAB_FREETRACK: - bug_type = "use-after-free"; + bug_type = "slab-use-after-free"; break; case KASAN_ALLOCA_LEFT: case KASAN_ALLOCA_RIGHT: diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index f3d3be614e4b..32e80f78de7d 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -17,10 +17,43 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { - /* Return the same value regardless of whether addr_has_metadata(). */ + /* + * Hardware Tag-Based KASAN only calls this function for normal memory + * accesses, and thus addr points precisely to the first bad address + * with an invalid (and present) memory tag. Therefore: + * 1. Return the address as is without walking memory tags. + * 2. Skip the addr_has_metadata check. + */ return kasan_reset_tag(addr); } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + int i = 0; + u8 memory_tag; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + while (size < cache->object_size) { + memory_tag = hw_get_mem_tag(object + i * KASAN_GRANULE_SIZE); + if (memory_tag != KASAN_TAG_INVALID) + size += KASAN_GRANULE_SIZE; + else + return size; + i++; + } + + return cache->object_size; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { int i; diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 7a26397297ed..8b1f5a73ee6d 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -45,6 +45,32 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) return p; } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + u8 *shadow; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. 
+ */ + shadow = (u8 *)kasan_mem_to_shadow(object); + while (size < cache->object_size) { + if (*shadow != KASAN_TAG_INVALID) + size += KASAN_GRANULE_SIZE; + else + return size; + shadow++; + } + + return cache->object_size; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index ecede06ef374..8b8bfdb3cfdb 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -89,7 +89,7 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) * a use-after-free. */ if (!info->bug_type) - info->bug_type = "use-after-free"; + info->bug_type = "slab-use-after-free"; } else { /* Second alloc of the same object. Give up. */ if (alloc_found) diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index a3afaf2ad1b1..30da65fa02a1 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -106,10 +106,8 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, return true; untagged_addr = kasan_reset_tag((const void *)addr); - if (unlikely(untagged_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_metadata(untagged_addr))) return !kasan_report(addr, size, write, ret_ip); - } shadow_first = kasan_mem_to_shadow(untagged_addr); shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); for (shadow = shadow_first; shadow <= shadow_last; shadow++) { @@ -127,7 +125,7 @@ bool kasan_byte_accessible(const void *addr) void *untagged_addr = kasan_reset_tag(addr); u8 shadow_byte; - if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) + if (!addr_has_metadata(untagged_addr)) return false; shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr)); From 93419139fa14124c1c507d804f2b28866ebee28d Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Sun, 29 Jan 2023 04:06:51 +0000 Subject: [PATCH 397/505] memory tier: release the new_memtier in find_create_memory_tier() In find_create_memory_tier(), if failed to register device, then we should release new_memtier from the tier list and put device instead of memtier. Link: https://lkml.kernel.org/r/20230129040651.1329208-1-tongtiangen@huawei.com Fixes: 9832fb87834e ("mm/demotion: expose memory tier details via sysfs") Signed-off-by: Tong Tiangen Cc: Aneesh Kumar K.V Cc: Hanjun Guo Cc: Kefeng Wang Cc: Guohanjun Cc: Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index c734658c6242..e593e56e530b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -211,8 +211,8 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty ret = device_register(&new_memtier->dev); if (ret) { - list_del(&memtier->list); - put_device(&memtier->dev); + list_del(&new_memtier->list); + put_device(&new_memtier->dev); return ERR_PTR(ret); } memtier = new_memtier; From 6069b9ec8c0fcd5ee09167020e9e5e673fce93f8 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:32 +0200 Subject: [PATCH 398/505] arm: include asm-generic/memory_model.h from page.h rather than memory.h Patch series "mm, arch: add generic implementation of pfn_valid() for FLATMEM", v2. Every architecture that supports FLATMEM memory model defines its own version of pfn_valid() that essentially compares a pfn to max_mapnr. 
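For reference, the check every FLATMEM architecture open-codes is just a bounded range test against ARCH_PFN_OFFSET and max_mapnr; below is a minimal standalone C sketch of that logic (the names mirror the kernel's, but this is only an illustration, not the kernel code itself):

  #include <stdbool.h>
  #include <stdio.h>

  /* Illustrative stand-ins for the kernel's ARCH_PFN_OFFSET and max_mapnr. */
  static unsigned long pfn_offset = 0x100;  /* first pfn backed by mem_map */
  static unsigned long max_mapnr = 0x1000;  /* number of struct pages in mem_map */

  /* The range check the generic FLATMEM pfn_valid() boils down to. */
  static bool pfn_valid_sketch(unsigned long pfn)
  {
      return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr;
  }

  int main(void)
  {
      printf("%d %d %d\n",
             pfn_valid_sketch(0x0ff),   /* below the offset -> 0 */
             pfn_valid_sketch(0x100),   /* first valid pfn  -> 1 */
             pfn_valid_sketch(0x10ff)); /* last valid pfn   -> 1 */
      return 0;
  }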
Use mips/powerpc version implemented as static inline as a generic implementation of pfn_valid() and drop its per-architecture definitions This patch (of 4): Makes it consistent with other architectures and allows for generic definition of pfn_valid() in asm-generic/memory_model.h with clear override in arch/arm/include/asm/page.h Link: https://lkml.kernel.org/r/20230129124235.209895-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20230129124235.209895-2-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Brian Cain Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm/include/asm/memory.h | 2 -- arch/arm/include/asm/page.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index d8eef4bd8c71..62e9df024445 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -386,6 +386,4 @@ static inline unsigned long __virt_to_idmap(unsigned long x) #endif -#include - #endif diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 5fcc8a600e36..74bb5947b387 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -158,6 +158,7 @@ typedef struct page *pgtable_t; #ifdef CONFIG_HAVE_ARCH_PFN_VALID extern int pfn_valid(unsigned long); +#define pfn_valid pfn_valid #endif #include @@ -167,5 +168,6 @@ extern int pfn_valid(unsigned long); #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC #include +#include #endif From d82f07f06cf85b4fe4560ee7c1a460591db840df Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:33 +0200 Subject: [PATCH 399/505] m68k: use asm-generic/memory_model.h for both MMU and !MMU The MMU variant uses generic definitions of page_to_pfn() and pfn_to_page(), but !MMU defines them in include/asm/page_no.h for no good reason. Include asm-generic/memory_model.h in the common include/asm/page.h and drop redundant definitions. Link: https://lkml.kernel.org/r/20230129124235.209895-3-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Brian Cain Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/m68k/include/asm/page.h | 6 +----- arch/m68k/include/asm/page_mm.h | 1 - arch/m68k/include/asm/page_no.h | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/m68k/include/asm/page.h b/arch/m68k/include/asm/page.h index 2f1c54e4725d..a5993ad83ed8 100644 --- a/arch/m68k/include/asm/page.h +++ b/arch/m68k/include/asm/page.h @@ -62,11 +62,7 @@ extern unsigned long _ramend; #include #endif -#ifndef CONFIG_MMU -#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) -#define __pfn_to_phys(pfn) PFN_PHYS(pfn) -#endif - #include +#include #endif /* _M68K_PAGE_H */ diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h index a5b459bcb7d8..3903db2e8da7 100644 --- a/arch/m68k/include/asm/page_mm.h +++ b/arch/m68k/include/asm/page_mm.h @@ -134,7 +134,6 @@ extern int m68k_virt_to_node_shift; }) #define ARCH_PFN_OFFSET (m68k_memory[0].addr >> PAGE_SHIFT) -#include #define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory) #define pfn_valid(pfn) virt_addr_valid(pfn_to_virt(pfn)) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index abd2c3aeb015..83d345f482bd 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -25,8 +25,6 @@ extern unsigned long memory_end; #define virt_to_page(addr) (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)) #define page_to_virt(page) __va(((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET)) -#define pfn_to_page(pfn) virt_to_page(pfn_to_virt(pfn)) -#define page_to_pfn(page) virt_to_pfn(page_to_virt(page)) #define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ From c2524a6b7de1e8d2767ddc1ee43ee241a580e677 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:34 +0200 Subject: [PATCH 400/505] mips: drop definition of pfn_valid() for DISCONTIGMEM There is a stale definition of pfn_valid() for DISCONTINGMEM memory model guarded !FLATMEM && !SPARSEMEM && NUMA ifdefery. Remove everything but definition of pfn_valid() for FLATMEM. Link: https://lkml.kernel.org/r/20230129124235.209895-4-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Brian Cain Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/mips/include/asm/page.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 96bc798c1ec1..9286f11ff6ad 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -235,21 +235,6 @@ static inline int pfn_valid(unsigned long pfn) return pfn >= pfn_offset && pfn < max_mapnr; } -#elif defined(CONFIG_SPARSEMEM) - -/* pfn_valid is defined in linux/mmzone.h */ - -#elif defined(CONFIG_NUMA) - -#define pfn_valid(pfn) \ -({ \ - unsigned long __pfn = (pfn); \ - int __n = pfn_to_nid(__pfn); \ - ((__n >= 0) ? (__pfn < NODE_DATA(__n)->node_start_pfn + \ - NODE_DATA(__n)->node_spanned_pages) \ - : 0); \ -}) - #endif #define virt_to_pfn(kaddr) PFN_DOWN(virt_to_phys((void *)(kaddr))) From e5080a9677854bdd82383713cba168c1b13e46ba Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 29 Jan 2023 14:42:35 +0200 Subject: [PATCH 401/505] mm, arch: add generic implementation of pfn_valid() for FLATMEM Every architecture that supports FLATMEM memory model defines its own version of pfn_valid() that essentially compares a pfn to max_mapnr. Use mips/powerpc version implemented as static inline as a generic implementation of pfn_valid() and drop its per-architecture definitions. [rppt@kernel.org: fix the generic pfn_valid()] Link: https://lkml.kernel.org/r/Y9lg7R1Yd931C+y5@kernel.org Link: https://lkml.kernel.org/r/20230129124235.209895-5-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Acked-by: Arnd Bergmann Acked-by: Guo Ren [csky] Acked-by: Huacai Chen [LoongArch] Acked-by: Stafford Horne [OpenRISC] Acked-by: Michael Ellerman [powerpc] Reviewed-by: David Hildenbrand Tested-by: Conor Dooley Cc: Brian Cain Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Helge Deller Cc: Huacai Chen Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 4 ---- arch/arc/include/asm/page.h | 1 - arch/csky/include/asm/page.h | 1 - arch/hexagon/include/asm/page.h | 1 - arch/ia64/include/asm/page.h | 4 ---- arch/loongarch/include/asm/page.h | 13 ------------- arch/m68k/include/asm/page_no.h | 2 -- arch/microblaze/include/asm/page.h | 1 - arch/mips/include/asm/page.h | 13 ------------- arch/nios2/include/asm/page.h | 9 --------- arch/openrisc/include/asm/page.h | 2 -- arch/parisc/include/asm/page.h | 4 ---- arch/powerpc/include/asm/page.h | 9 --------- arch/riscv/include/asm/page.h | 5 ----- arch/sh/include/asm/page.h | 3 --- arch/sparc/include/asm/page_32.h | 1 - arch/um/include/asm/page.h | 1 - arch/x86/include/asm/page_32.h | 4 ---- arch/x86/include/asm/page_64.h | 4 ---- arch/xtensa/include/asm/page.h | 4 ++-- include/asm-generic/memory_model.h | 12 ++++++++++++ include/asm-generic/page.h | 2 -- 22 files changed, 14 insertions(+), 86 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index bc5256fba8f0..4db1ebc0ed99 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -86,10 +86,6 @@ typedef struct page *pgtable_t; #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid((__pa(kaddr) >> PAGE_SHIFT)) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_FLATMEM */ - #include #include diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 9a62e1d87967..e43fe27ec54d 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -109,7 +109,6 @@ extern int pfn_valid(unsigned long pfn); #else /* CONFIG_HIGHMEM */ #define ARCH_PFN_OFFSET virt_to_pfn(CONFIG_LINUX_RAM_BASE) -#define pfn_valid(pfn) (((pfn) - ARCH_PFN_OFFSET) < max_mapnr) #endif /* CONFIG_HIGHMEM */ diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h index ed7451478b1b..b23e3006a9e0 100644 --- a/arch/csky/include/asm/page.h +++ b/arch/csky/include/asm/page.h @@ -39,7 +39,6 @@ #define virt_addr_valid(kaddr) ((void *)(kaddr) >= (void *)PAGE_OFFSET && \ (void *)(kaddr) < high_memory) -#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) extern void *memset(void *dest, int c, size_t l); extern void *memcpy(void *to, const void *from, size_t l); diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index d7d4f9fca327..9c03b9965f07 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -95,7 +95,6 @@ struct page; /* Default vm area behavior is non-executable. */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) /* Need to not use a define for linesize; may move this to another file. 
*/ diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index ba0b365cf2b2..310b09c3342d 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -95,10 +95,6 @@ do { \ #include -#ifdef CONFIG_FLATMEM -# define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif - #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 53f284a96182..fb5338b352e6 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -82,19 +82,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_FLATMEM - -static inline int pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - -#endif - #define virt_to_pfn(kaddr) PFN_DOWN(PHYSADDR(kaddr)) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 83d345f482bd..43ff6b109ebb 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -25,8 +25,6 @@ extern unsigned long memory_end; #define virt_to_page(addr) (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)) #define page_to_virt(page) __va(((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET)) -#define pfn_valid(pfn) ((pfn) < max_mapnr) - #define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ ((unsigned long)(kaddr) < memory_end)) diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 4b8b2fa78fc5..7b9861bcd458 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -112,7 +112,6 @@ extern int page_is_ram(unsigned long pfn); # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 9286f11ff6ad..5978a8dfb917 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -224,19 +224,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_FLATMEM - -static inline int pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - -#endif - #define virt_to_pfn(kaddr) PFN_DOWN(virt_to_phys((void *)(kaddr))) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 6a989819a7c1..0ae7d9ce369b 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -86,15 +86,6 @@ extern struct page *mem_map; # define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -static inline bool pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - # define virt_to_page(vaddr) 
pfn_to_page(PFN_DOWN(virt_to_phys(vaddr))) # define virt_addr_valid(vaddr) pfn_valid(PFN_DOWN(virt_to_phys(vaddr))) diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index aab6e64d6db4..52b0d7e76446 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -80,8 +80,6 @@ typedef struct page *pgtable_t; #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#define pfn_valid(pfn) ((pfn) < max_mapnr) - #define virt_addr_valid(kaddr) (pfn_valid(virt_to_pfn(kaddr))) #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 6faaaa3ebe9b..667e703c0e8f 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -155,10 +155,6 @@ extern int npmem_ranges; #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_SPARSEMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif - #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PMD_SHIFT /* fixed for transparent huge pages */ #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index edf1dd1b0ca9..f2b6bf5687d0 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -117,15 +117,6 @@ extern long long virt_phys_offset; #ifdef CONFIG_FLATMEM #define ARCH_PFN_OFFSET ((unsigned long)(MEMORY_START >> PAGE_SHIFT)) -#ifndef __ASSEMBLY__ -extern unsigned long max_mapnr; -static inline bool pfn_valid(unsigned long pfn) -{ - unsigned long min_pfn = ARCH_PFN_OFFSET; - - return pfn >= min_pfn && pfn < max_mapnr; -} -#endif #endif #define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 9f432c1b5289..7fed7c431928 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -171,11 +171,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) \ - (((pfn) >= ARCH_PFN_OFFSET) && (((pfn) - ARCH_PFN_OFFSET) < max_mapnr)) -#endif - #endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) ({ \ diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h index eca5daa43b93..09ac6c7faee0 100644 --- a/arch/sh/include/asm/page.h +++ b/arch/sh/include/asm/page.h @@ -169,9 +169,6 @@ typedef struct page *pgtable_t; #define PFN_START (__MEMORY_START >> PAGE_SHIFT) #define ARCH_PFN_OFFSET (PFN_START) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) >= min_low_pfn && (pfn) < max_low_pfn) -#endif #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #include diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index fff8861df107..6be6f683f98f 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -130,7 +130,6 @@ extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_valid(pfn) (((pfn) >= (pfn_base)) && (((pfn)-(pfn_base)) < max_mapnr)) #define virt_addr_valid(kaddr) ((((unsigned long)(kaddr)-PAGE_OFFSET)>>PAGE_SHIFT) < max_mapnr) #include diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index cdbd9653aa14..84866127d074 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -108,7 +108,6 @@ 
extern unsigned long uml_physmem; #define phys_to_pfn(p) ((p) >> PAGE_SHIFT) #define pfn_to_phys(pfn) PFN_PHYS(pfn) -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) #include diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index df42f8aa99e4..580d71aca65a 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -15,10 +15,6 @@ extern unsigned long __phys_addr(unsigned long); #define __phys_addr_symbol(x) __phys_addr(x) #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_FLATMEM */ - #include static inline void clear_page(void *page) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 198e03e59ca1..cc6b8e087192 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,10 +39,6 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define __phys_reloc_hide(x) (x) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_pfn) -#endif - void clear_page_orig(void *page); void clear_page_rep(void *page); void clear_page_erms(void *page); diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 493eb7083b1a..a77d04972eb9 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -11,6 +11,8 @@ #ifndef _XTENSA_PAGE_H #define _XTENSA_PAGE_H +#include + #include #include #include @@ -189,8 +191,6 @@ static inline unsigned long ___pa(unsigned long va) #endif #define __va(x) \ ((void *)((unsigned long) (x) - PHYS_OFFSET + PAGE_OFFSET)) -#define pfn_valid(pfn) \ - ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT) diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index a2c8ed60233a..6796abe1900e 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,6 +19,18 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +#ifndef pfn_valid +static inline int pfn_valid(unsigned long pfn) +{ + /* avoid include hell */ + extern unsigned long max_mapnr; + unsigned long pfn_offset = ARCH_PFN_OFFSET; + + return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; +} +#define pfn_valid pfn_valid +#endif + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) /* memmap is virtually contiguous. */ diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h index 6fc47561814c..c0be2edeb484 100644 --- a/include/asm-generic/page.h +++ b/include/asm-generic/page.h @@ -84,8 +84,6 @@ extern unsigned long memory_end; #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) #endif -#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) - #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) From 1d693a3e69ba786c6c263d51b8e6d0daf16723ae Mon Sep 17 00:00:00 2001 From: Longlong Xia Date: Tue, 31 Jan 2023 07:10:35 +0000 Subject: [PATCH 402/505] mm/swapfile: remove pr_debug in get_swap_pages() It's known that get_swap_pages() may fail to find available space under some extreme case, but pr_debug() provides useless information. Let's remove it. 
Link: https://lkml.kernel.org/r/20230131071035.1085968-1-xialonglong1@huawei.com Signed-off-by: Longlong Xia Reviewed-by: "Huang, Ying" Cc: Chen Wandun Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/swapfile.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 888aed774fb6..cf0d72020f97 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1098,8 +1098,6 @@ start_over: spin_unlock(&si->lock); if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; - pr_debug("scan_swap_map of si %d failed to find offset\n", - si->type); cond_resched(); spin_lock(&swap_avail_lock); From 7e4a32c0e8adafdda6161635c9046e6c1e8b95b5 Mon Sep 17 00:00:00 2001 From: Hyunmin Lee Date: Wed, 1 Feb 2023 20:51:42 +0900 Subject: [PATCH 403/505] mm/vmalloc: replace BUG_ON with a simple if statement As per the coding standards, in the event of an abnormal condition that should not occur under normal circumstances, the kernel should attempt recovery and proceed with execution, rather than halting the machine. Specifically, in the alloc_vmap_area() function, use a simple if() instead of using BUG_ON() halting the machine. Link: https://lkml.kernel.org/r/20230201115142.GA7772@min-iamroot Co-developed-by: Gwan-gyeong Mun Signed-off-by: Gwan-gyeong Mun Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Co-developed-by: Sangyun Kim Signed-off-by: Sangyun Kim Signed-off-by: Hyunmin Lee Reviewed-by: Christophe Leroy Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Uladzislau Rezki (Sony) Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ff4d7dfdf84a..1dd7ca258a76 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1586,9 +1586,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int purged = 0; int ret; - BUG_ON(!size); - BUG_ON(offset_in_page(size)); - BUG_ON(!is_power_of_2(align)); + if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) + return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); From 601c3c29dbeb049862faa00917f2daf094a71028 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 Jan 2023 16:01:16 -0800 Subject: [PATCH 404/505] mm: introduce vm_flags_reset_once to replace WRITE_ONCE vm_flags updates Provide vm_flags_reset_once() and replace the vm_flags updates which used WRITE_ONCE() to prevent compiler optimizations. 
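For context, WRITE_ONCE() forces the store through a volatile access so the compiler cannot elide, split or duplicate it; the following is a simplified userspace sketch of the idea behind vm_flags_reset_once(), not the kernel macro or helper themselves:

  #include <stdio.h>

  /* Simplified stand-in for WRITE_ONCE(): a volatile store the compiler must
   * emit exactly as written, in one piece. */
  #define WRITE_ONCE_SKETCH(x, val) (*(volatile __typeof__(x) *)&(x) = (val))

  struct vma_sketch {
      unsigned long vm_flags;  /* illustrative flags word */
  };

  /* Shape of the new helper: reset the whole flags word with a forced store. */
  static void vm_flags_reset_once_sketch(struct vma_sketch *vma,
                                         unsigned long flags)
  {
      WRITE_ONCE_SKETCH(vma->vm_flags, flags);
  }

  int main(void)
  {
      struct vma_sketch vma = { .vm_flags = 0x1 };

      vm_flags_reset_once_sketch(&vma, 0x5);
      printf("flags = %#lx\n", vma.vm_flags);
      return 0;
  }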
Link: https://lkml.kernel.org/r/20230201000116.1333160-1-surenb@google.com Fixes: 0cce31a0aa0e ("mm: replace vma->vm_flags direct modifications with modifier calls") Signed-off-by: Suren Baghdasaryan Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ mm/mlock.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 27b34f7730e7..0ed0cb2401f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -642,6 +642,13 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); +} + static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { diff --git a/mm/mlock.c b/mm/mlock.c index ed49459e343e..617469fce96d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -380,7 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; - vm_flags_reset(vma, newflags); + vm_flags_reset_once(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); @@ -388,7 +388,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, if (newflags & VM_IO) { newflags &= ~VM_IO; - vm_flags_reset(vma, newflags); + vm_flags_reset_once(vma, newflags); } } From aa02d3c174abfc506058d3e43f471bc4280563d0 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Fri, 3 Feb 2023 18:01:32 +0800 Subject: [PATCH 405/505] mm/page_alloc: reduce fallbacks to (MIGRATE_PCPTYPES - 1) The commit 1dd214b8f21c ("mm: page_alloc: avoid merging non-fallbackable pageblocks with others") has removed MIGRATE_CMA and MIGRATE_ISOLATE from fallbacks list. so there is no need to add an element at the end of every type. Reduce fallbacks to (MIGRATE_PCPTYPES - 1). Link: https://lkml.kernel.org/r/20230203100132.1627787-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Acked-by: Vlastimil Babka Cc: Zi Yan Cc: Mel Gorman Cc: David Hildenbrand Cc: Mike Rapoport Cc: Oscar Salvador Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/page_alloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ebce58026f1..21d820c42900 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2603,10 +2603,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * * The other migratetypes do not have fallbacks. 
*/ -static int fallbacks[MIGRATE_TYPES][3] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, +static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE }, }; #ifdef CONFIG_CMA @@ -2865,11 +2865,8 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; *can_steal = false; - for (i = 0;; i++) { + for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_TYPES) - break; - if (free_area_empty(area, fallback_mt)) continue; From 3e629597b8477efbcc0ad14ee80558a080eebdc3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Feb 2023 16:25:19 +0000 Subject: [PATCH 406/505] filemap: add mapping_read_folio_gfp() This is like read_cache_page_gfp() except it returns the folio instead of the precise page. Link: https://lkml.kernel.org/r/20230206162520.4029022-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Charan Teja Kalla Cc: David Rientjes Cc: Hugh Dickins Cc: Mark Hemment Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9f1081683771..6a32ac170d3d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -756,6 +756,8 @@ static inline struct page *grab_cache_page(struct address_space *mapping, struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); +struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index, + gfp_t flags); struct page *read_cache_page(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); extern struct page * read_cache_page_gfp(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 992554c18f1f..2ebcf500871d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3585,6 +3585,30 @@ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, } EXPORT_SYMBOL(read_cache_folio); +/** + * mapping_read_folio_gfp - Read into page cache, using specified allocation flags. + * @mapping: The address_space for the folio. + * @index: The index that the allocated folio will contain. + * @gfp: The page allocator flags to use if allocating. + * + * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with + * any new memory allocations done using the specified allocation flags. + * + * The most likely error from this function is EIO, but ENOMEM is + * possible and so is EINTR. If ->read_folio returns another error, + * that will be returned to the caller. + * + * The function expects mapping->invalidate_lock to be already held. + * + * Return: Uptodate folio on success, ERR_PTR() on failure. 
+ */ +struct folio *mapping_read_folio_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ + return do_read_cache_folio(mapping, index, NULL, NULL, gfp); +} +EXPORT_SYMBOL(mapping_read_folio_gfp); + static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) { From f01b2b3ed8735dacd92f1da548708449525e286a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Feb 2023 16:25:20 +0000 Subject: [PATCH 407/505] shmem: add shmem_read_folio() and shmem_read_folio_gfp() These are the folio replacements for shmem_read_mapping_page() and shmem_read_mapping_page_gfp(). [akpm@linux-foundation.org: fix shmem_read_mapping_page_gfp(), per Matthew] Link: https://lkml.kernel.org/r/Y+QdJTuzxeBYejw2@casper.infradead.org Link: https://lkml.kernel.org/r/20230206162520.4029022-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Mark Hemment Cc: Charan Teja Kalla Cc: David Rientjes Cc: Hugh Dickins Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 8 ++++++++ mm/shmem.c | 36 ++++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d09d54be4ffd..103d1000a5a2 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -109,6 +109,14 @@ enum sgp_type { int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp); +struct folio *shmem_read_folio_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp); + +static inline struct folio *shmem_read_folio(struct address_space *mapping, + pgoff_t index) +{ + return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping)); +} static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/mm/shmem.c b/mm/shmem.c index 732969afabd1..be6bdd320d5f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4311,9 +4311,9 @@ int shmem_zero_setup(struct vm_area_struct *vma) } /** - * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. - * @mapping: the page's address_space - * @index: the page index + * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. + * @mapping: the folio's address_space + * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", @@ -4325,13 +4325,12 @@ int shmem_zero_setup(struct vm_area_struct *vma) * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 
*/ -struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, - pgoff_t index, gfp_t gfp) +struct folio *shmem_read_folio_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; - struct page *page; int error; BUG_ON(!shmem_mapping(mapping)); @@ -4341,6 +4340,25 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, return ERR_PTR(error); folio_unlock(folio); + return folio; +#else + /* + * The tiny !SHMEM case uses ramfs without swap + */ + return mapping_read_folio_gfp(mapping, index, gfp); +#endif +} +EXPORT_SYMBOL_GPL(shmem_read_folio_gfp); + +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ + struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp); + struct page *page; + + if (IS_ERR(folio)) + return &folio->page; + page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); @@ -4348,11 +4366,5 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, } return page; -#else - /* - * The tiny !SHMEM case uses ramfs without swap - */ - return read_cache_page_gfp(mapping, index, gfp); -#endif } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); From 5ff2121a3336a63aa7060cd71534d39dfb1cd2d1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Feb 2023 19:08:50 +0000 Subject: [PATCH 408/505] shmem: fix W=1 build warnings with CONFIG_SHMEM=n With W=1 and CONFIG_SHMEM=n, shmem.c functions have no prototypes so the compiler emits warnings. Link: https://lkml.kernel.org/r/20230206190850.4054983-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Mark Hemment Cc: Charan Teja Kalla Cc: David Rientjes Cc: Hugh Dickins Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index be6bdd320d5f..577b3838c6b9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -58,7 +59,6 @@ static struct vfsmount *shm_mnt; #include #include #include -#include #include #include #include From d76f99548cf0474c3bf75f25fab1778e03ade5f2 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:14 +0800 Subject: [PATCH 409/505] mm/vmalloc.c: add used_map into vmap_block to track space of vmap_block Patch series "mm/vmalloc.c: allow vread() to read out vm_map_ram areas", v5. Problem: *** Stephen reported vread() will skip vm_map_ram areas when reading out /proc/kcore with drgn utility. Please see below link to get more details. /proc/kcore reads 0's for vmap_block https://lore.kernel.org/all/87ilk6gos2.fsf@oracle.com/T/#u Root cause: *** The normal vmalloc API uses struct vmap_area to manage the virtual kernel area allocated, and associate a vm_struct to store more information and pass out. However, area reserved through vm_map_ram() interface doesn't allocate vm_struct to associate with. So the current code in vread() will skip the vm_map_ram area through 'if (!va->vm)' conditional checking. Solution: *** To mark the area reserved through vm_map_ram() interface, add field 'flags' into struct vmap_area. Bit 0 indicates this is vm_map_ram area created through vm_map_ram() interface, bit 1 marks out the type of vm_map_ram area which makes use of vmap_block to manage split regions via vb_alloc/free(). 
And also add bitmap field 'used_map' into struct vmap_block to mark those further subdivided regions being used to differentiate with dirty and free regions in vmap_block. With the help of above vmap_area->flags and vmap_block->used_map, we can recognize and handle vm_map_ram areas successfully. All these are done in patch 1~3. Meanwhile, do some improvement on areas related to vm_map_ram areas in patch 4, 5. And also change area flag from VM_ALLOC to VM_IOREMAP in patch 6, 7 because this will show them as 'ioremap' in /proc/vmallocinfo, and exclude them from /proc/kcore. This patch (of 7): In one vmap_block area, there could be three types of regions: region being used which is allocated through vb_alloc(), dirty region which is freed via vb_free() and free region. Among them, only used region has available data. While there's no way to track those used regions currently. Here, add bitmap field used_map into vmap_block, and set/clear it during allocation or freeing regions of vmap_block area. This is a preparation for later use. Link: https://lkml.kernel.org/r/20230206084020.174506-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20230206084020.174506-2-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1dd7ca258a76..0f1396d6fbe6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1910,6 +1910,7 @@ struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; + DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; @@ -1986,10 +1987,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); + bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; + bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); vb_idx = addr_to_vb_idx(va->va_start); @@ -2099,6 +2102,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); vb->free -= 1UL << order; + bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); @@ -2132,6 +2136,9 @@ static void vb_free(unsigned long addr, unsigned long size) order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); + spin_lock(&vb->lock); + bitmap_clear(vb->used_map, offset, (1UL << order)); + spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); From 869176a096068056b338b5cc1b0af93106007f5d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:15 +0800 Subject: [PATCH 410/505] mm/vmalloc.c: add flags to mark vm_map_ram area Through vmalloc API, a virtual kernel area is reserved for physical address mapping. And vmap_area is used to track them, while vm_struct is allocated to associate with the vmap_area to store more information and passed out. However, area reserved via vm_map_ram() is an exception. It doesn't have vm_struct to associate with vmap_area. 
And we can't recognize the vmap_area with '->vm == NULL' as a vm_map_ram() area because the normal freeing path will set va->vm = NULL before unmapping, please see function remove_vm_area(). Meanwhile, there are two kinds of handling for vm_map_ram area. One is the whole vmap_area being reserved and mapped at one time through vm_map_area() interface; the other is the whole vmap_area with VMAP_BLOCK_SIZE size being reserved, while mapped into split regions with smaller size via vb_alloc(). To mark the area reserved through vm_map_ram(), add flags field into struct vmap_area. Bit 0 indicates this is vm_map_ram area created through vm_map_ram() interface, while bit 1 marks out the type of vm_map_ram area which makes use of vmap_block to manage split regions via vb_alloc/free(). This is a preparation for later use. Link: https://lkml.kernel.org/r/20230206084020.174506-3-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 096d48aa3437..69250efa03d1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -76,6 +76,7 @@ struct vmap_area { unsigned long subtree_max_size; /* in "free" tree */ struct vm_struct *vm; /* in "busy" tree */ }; + unsigned long flags; /* mark type of vm_map_ram area */ }; /* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f1396d6fbe6..2d0960190c42 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1578,7 +1578,8 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, - int node, gfp_t gfp_mask) + int node, gfp_t gfp_mask, + unsigned long va_flags) { struct vmap_area *va; unsigned long freed; @@ -1623,6 +1624,7 @@ retry: va->va_start = addr; va->va_end = addr + size; va->vm = NULL; + va->flags = va_flags; spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); @@ -1901,6 +1903,10 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) +#define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ +#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ +#define VMAP_FLAGS_MASK 0x3 + struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -1976,7 +1982,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, - node, gfp_mask); + node, gfp_mask, + VMAP_RAM|VMAP_BLOCK); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); @@ -2285,7 +2292,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, - VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + VMALLOC_START, VMALLOC_END, + node, GFP_KERNEL, VMAP_RAM); if (IS_ERR(va)) return NULL; @@ -2483,7 +2491,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; - va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0); if (IS_ERR(va)) { kfree(area); return NULL; From 
06c8994626d1b7d8c26dfd06992d67703a274054 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:16 +0800 Subject: [PATCH 411/505] mm/vmalloc.c: allow vread() to read out vm_map_ram areas Currently, vread can read out vmalloc areas which is associated with a vm_struct. While this doesn't work for areas created by vm_map_ram() interface because it doesn't have an associated vm_struct. Then in vread(), these areas are all skipped. Here, add a new function vmap_ram_vread() to read out vm_map_ram areas. The area created with vmap_ram_vread() interface directly can be handled like the other normal vmap areas with aligned_vread(). While areas which will be further subdivided and managed with vmap_block need carefully read out page-aligned small regions and zero fill holes. Link: https://lkml.kernel.org/r/20230206084020.174506-4-bhe@redhat.com Reported-by: Stephen Brennan Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Tested-by: Stephen Brennan Cc: Dan Carpenter Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2d0960190c42..7188a47315c2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3463,6 +3463,68 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) return copied; } +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags) +{ + char *start; + struct vmap_block *vb; + unsigned long offset; + unsigned int rs, re, n; + + /* + * If it's area created by vm_map_ram() interface directly, but + * not further subdividing and delegating management to vmap_block, + * handle it here. + */ + if (!(flags & VMAP_BLOCK)) { + aligned_vread(buf, addr, count); + return; + } + + /* + * Area is split into regions and tracked with vmap_block, read out + * each region and zero fill the hole between regions. + */ + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr)); + if (!vb) + goto finished; + + spin_lock(&vb->lock); + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { + spin_unlock(&vb->lock); + goto finished; + } + for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { + if (!count) + break; + start = vmap_block_vaddr(vb->va->va_start, rs); + while (addr < start) { + if (count == 0) + goto unlock; + *buf = '\0'; + buf++; + addr++; + count--; + } + /*it could start reading from the middle of used region*/ + offset = offset_in_page(addr); + n = ((re - rs + 1) << PAGE_SHIFT) - offset; + if (n > count) + n = count; + aligned_vread(buf, start+offset, n); + + buf += n; + addr += n; + count -= n; + } +unlock: + spin_unlock(&vb->lock); + +finished: + /* zero-fill the left dirty or free regions */ + if (count) + memset(buf, 0, count); +} + /** * vread() - read vmalloc area in a safe way. * @buf: buffer for reading data @@ -3493,7 +3555,7 @@ long vread(char *buf, char *addr, unsigned long count) struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; - unsigned long n; + unsigned long n, size, flags; addr = kasan_reset_tag(addr); @@ -3514,12 +3576,21 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!va->vm) + vm = va->vm; + flags = va->flags & VMAP_FLAGS_MASK; + /* + * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need + * be set together with VMAP_RAM. 
+ */ + WARN_ON(flags == VMAP_BLOCK); + + if (!vm && !flags) continue; - vm = va->vm; - vaddr = (char *) vm->addr; - if (addr >= vaddr + get_vm_area_size(vm)) + vaddr = (char *) va->va_start; + size = vm ? get_vm_area_size(vm) : va_size(va); + + if (addr >= vaddr + size) continue; while (addr < vaddr) { if (count == 0) @@ -3529,10 +3600,13 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + get_vm_area_size(vm) - addr; + n = vaddr + size - addr; if (n > count) n = count; - if (!(vm->flags & VM_IOREMAP)) + + if (flags & VMAP_RAM) + vmap_ram_vread(buf, addr, n, flags); + else if (!(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n); From bba9697b42ead45687352fdd0fd498735bc4361d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:17 +0800 Subject: [PATCH 412/505] mm/vmalloc: explicitly identify vm_map_ram area when shown in /proc/vmcoreinfo Now, by marking VMAP_RAM in vmap_area->flags for vm_map_ram area, we can clearly differentiate it with other vmalloc areas. So identify vm_map_area area by checking VMAP_RAM of vmap_area->flags when shown in /proc/vmcoreinfo. Meanwhile, the code comment above vm_map_ram area checking in s_show() is not needed any more, remove it here. Link: https://lkml.kernel.org/r/20230206084020.174506-5-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Cc: Dan Carpenter Cc: Stephen Brennan Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7188a47315c2..87d71c783646 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4152,14 +4152,11 @@ static int s_show(struct seq_file *m, void *p) va = list_entry(p, struct vmap_area, list); - /* - * s_show can encounter race with remove_vm_area, !vm on behalf - * of vmap area is being tear down or vm_map_ram allocation. - */ if (!va->vm) { - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + if (va->flags & VMAP_RAM) + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); goto final; } From 30a7a9b17c4b0331ec67aadb4b30ff2a951b4ed5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:18 +0800 Subject: [PATCH 413/505] mm/vmalloc: skip the uninitilized vmalloc areas For areas allocated via vmalloc_xxx() APIs, it searches for unmapped area to reserve and allocates new pages to map into, please see function __vmalloc_node_range(). During the process, flag VM_UNINITIALIZED is set in vm->flags to indicate that the pages allocation and mapping haven't been done, until clear_vm_uninitialized_flag() is called to clear VM_UNINITIALIZED. For this kind of area, if VM_UNINITIALIZED is still set, let's ignore it in vread() because pages newly allocated and being mapped in that area only contains zero data. reading them out by aligned_vread() is wasting time. 
Link: https://lkml.kernel.org/r/20230206084020.174506-6-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 87d71c783646..3b57260b6d39 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3587,6 +3587,11 @@ long vread(char *buf, char *addr, unsigned long count) if (!vm && !flags) continue; + if (vm && (vm->flags & VM_UNINITIALIZED)) + continue; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + vaddr = (char *) va->va_start; size = vm ? get_vm_area_size(vm) : va_size(va); From 728e5ae07dd6cf72676e67e376671e52b7ddfac3 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:19 +0800 Subject: [PATCH 414/505] powerpc: mm: add VM_IOREMAP flag to the vmalloc area Currently, for vmalloc areas with flag VM_IOREMAP set, except of the specific alignment clamping in __get_vm_area_node(), they will be 1) Shown as ioremap in /proc/vmallocinfo; 2) Ignored by /proc/kcore reading via vread() So for the io mapping in ioremap_phb() of ppc, we should set VM_IOREMAP in flag to make it handled correctly as above. Link: https://lkml.kernel.org/r/20230206084020.174506-7-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- arch/powerpc/kernel/pci_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 0c7cfb9fab04..fd42059ae2a5 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -132,7 +132,7 @@ void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) * address decoding but I'd rather not deal with those outside of the * reserved 64K legacy region. */ - area = __get_vm_area_caller(size, 0, PHB_IO_BASE, PHB_IO_END, + area = __get_vm_area_caller(size, VM_IOREMAP, PHB_IO_BASE, PHB_IO_END, __builtin_return_address(0)); if (!area) return NULL; From 61a1a9906f66bd0eaaf9bade96f22a60f04240b7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:20 +0800 Subject: [PATCH 415/505] sh: mm: set VM_IOREMAP flag to the vmalloc area Currently, for vmalloc areas with flag VM_IOREMAP set, except of the specific alignment clamping in __get_vm_area_node(), they will be 1) Shown as ioremap in /proc/vmallocinfo; 2) Ignored by /proc/kcore reading via vread() So for the ioremap in __sq_remap() of sh, we should set VM_IOREMAP in flag to make it handled correctly as above. 
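To illustrate the second point, vread() already treats VM_IOREMAP areas as memory holes and zero-fills them rather than copying their contents; a minimal userspace sketch of that behaviour is below (illustrative flag and helper, not the kernel function):

  #include <stdio.h>
  #include <string.h>

  #define VM_IOREMAP_SKETCH 0x1UL  /* illustrative flag bit */

  /* Mirrors the rule these two patches rely on: IOREMAP areas read back as
   * holes (zeroes), everything else is copied out normally. */
  static void read_area_sketch(char *buf, const char *src, size_t n,
                               unsigned long flags)
  {
      if (flags & VM_IOREMAP_SKETCH)
          memset(buf, 0, n);   /* treated as a memory hole */
      else
          memcpy(buf, src, n); /* normal copy-out */
  }

  int main(void)
  {
      char out[8] = { 0 };

      read_area_sketch(out, "abcdefg", 7, VM_IOREMAP_SKETCH);
      printf("ioremap area: \"%s\"\n", out);
      read_area_sketch(out, "abcdefg", 7, 0);
      printf("normal area:  \"%s\"\n", out);
      return 0;
  }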
Link: https://lkml.kernel.org/r/20230206084020.174506-8-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- arch/sh/kernel/cpu/sh4/sq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index a76b94e41e91..27f2e3da5aa2 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -103,7 +103,7 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot) #if defined(CONFIG_MMU) struct vm_struct *vma; - vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr, + vma = __get_vm_area_caller(map->size, VM_IOREMAP, map->sq_addr, SQ_ADDRMAX, __builtin_return_address(0)); if (!vma) return -ENOMEM; From b2a72dff85fa27fd80349a4f7fae8a6e9bcbbd15 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:22 -0400 Subject: [PATCH 416/505] mm/gup: have internal functions get the mmap_read_lock() Patch series "Simplify the external interface for GUP", v2. It is quite a maze of EXPORTED symbols leading up to the three actual worker functions of GUP. Simplify this by reorganizing some of the code so the EXPORTED symbols directly call the correct internal function with validated and consistent arguments. Consolidate all the assertions into one place at the top of the call chains. Remove some dead code. Move more things into the mm/internal.h header This patch (of 13): __get_user_pages_locked() and __gup_longterm_locked() both require the mmap lock to be held. They have a slightly unusual locked parameter that is used to allow these functions to unlock and relock the mmap lock and convey that fact to the caller. Several places wrap these functions with a simple mmap_read_lock() just so they can follow the optimized locked protocol. Consolidate this internally to the functions. Allow internal callers to set locked = 0 to cause the functions to acquire and release the lock on their own. Reorganize __gup_longterm_locked() to use the autolocking in __get_user_pages_locked(). Replace all the places obtaining the mmap_read_lock() just to call __get_user_pages_locked() with the new mechanism. Replace all the internal callers of get_user_pages_unlocked() with direct calls to __gup_longterm_locked() using the new mechanism. A following patch will add assertions ensuring the external interface continues to always pass in locked = 1. Link: https://lkml.kernel.org/r/0-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Link: https://lkml.kernel.org/r/1-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Claudio Imbrenda Signed-off-by: Andrew Morton --- mm/gup.c | 113 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 48 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index c4b793385ed2..5b03d2fd3e1c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1331,8 +1331,17 @@ static bool gup_signal_pending(unsigned int flags) } /* - * Please note that this function, unlike __get_user_pages will not - * return 0 for nr_pages > 0 without FOLL_NOWAIT + * Locking: (*locked == 1) means that the mmap_lock has already been acquired by + * the caller. This function may drop the mmap_lock. If it does so, then it will + * set (*locked = 0). 
+ * + * (*locked == 0) means that the caller expects this function to acquire and + * drop the mmap_lock. Therefore, the value of *locked will still be zero when + * the function returns, even though it may have changed temporarily during + * function execution. + * + * Please note that this function, unlike __get_user_pages(), will not return 0 + * for nr_pages > 0, unless FOLL_NOWAIT is used. */ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, @@ -1343,13 +1352,22 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned int flags) { long ret, pages_done; - bool lock_dropped; + bool must_unlock = false; if (locked) { /* if VM_FAULT_RETRY can be returned, vmas become invalid */ BUG_ON(vmas); - /* check caller initialized locked */ - BUG_ON(*locked != 1); + } + + /* + * The internal caller expects GUP to manage the lock internally and the + * lock must be released when this returns. + */ + if (locked && !*locked) { + if (mmap_read_lock_killable(mm)) + return -EAGAIN; + must_unlock = true; + *locked = 1; } if (flags & FOLL_PIN) @@ -1368,7 +1386,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, flags |= FOLL_GET; pages_done = 0; - lock_dropped = false; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, vmas, locked); @@ -1404,7 +1421,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; - lock_dropped = true; + + /* The lock was temporarily dropped, so we must unlock later */ + must_unlock = true; retry: /* @@ -1451,10 +1470,11 @@ retry: pages++; start += PAGE_SIZE; } - if (lock_dropped && *locked) { + if (must_unlock && *locked) { /* - * We must let the caller know we temporarily dropped the lock - * and so the critical section protected by it was lost. + * We either temporarily dropped the lock, or the caller + * requested that we both acquire and drop the lock. Either way, + * we must now unlock, and notify the caller of that state. */ mmap_read_unlock(mm); *locked = 0; @@ -1659,9 +1679,24 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned int foll_flags) { struct vm_area_struct *vma; + bool must_unlock = false; unsigned long vm_flags; long i; + if (!nr_pages) + return 0; + + /* + * The internal caller expects GUP to manage the lock internally and the + * lock must be released when this returns. + */ + if (locked && !*locked) { + if (mmap_read_lock_killable(mm)) + return -EAGAIN; + must_unlock = true; + *locked = 1; + } + /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. */ @@ -1673,12 +1708,12 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) - goto finish_or_fault; + break; /* protect what we can, including chardevs */ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) - goto finish_or_fault; + break; if (pages) { pages[i] = virt_to_page((void *)start); @@ -1690,9 +1725,11 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, start = (start + PAGE_SIZE) & PAGE_MASK; } - return i; + if (must_unlock && *locked) { + mmap_read_unlock(mm); + *locked = 0; + } -finish_or_fault: return i ? 
: -EFAULT; } #endif /* !CONFIG_MMU */ @@ -1861,17 +1898,13 @@ EXPORT_SYMBOL(fault_in_readable); #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { - struct mm_struct *mm = current->mm; struct page *page; - int locked = 1; + int locked = 0; int ret; - if (mmap_read_lock_killable(mm)) - return NULL; - ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL, + &locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); - if (locked) - mmap_read_unlock(mm); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ @@ -2047,13 +2080,9 @@ static long __gup_longterm_locked(struct mm_struct *mm, int *locked, unsigned int gup_flags) { - bool must_unlock = false; unsigned int flags; long rc, nr_pinned_pages; - if (locked && WARN_ON_ONCE(!*locked)) - return -EINVAL; - if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, gup_flags); @@ -2070,11 +2099,6 @@ static long __gup_longterm_locked(struct mm_struct *mm, return -EINVAL; flags = memalloc_pin_save(); do { - if (locked && !*locked) { - mmap_read_lock(mm); - must_unlock = true; - *locked = 1; - } nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, gup_flags); @@ -2085,11 +2109,6 @@ static long __gup_longterm_locked(struct mm_struct *mm, rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); - - if (locked && *locked && must_unlock) { - mmap_read_unlock(mm); - *locked = 0; - } return rc ? rc : nr_pinned_pages; } @@ -2242,16 +2261,10 @@ EXPORT_SYMBOL(get_user_pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { - struct mm_struct *mm = current->mm; - int locked = 1; - long ret; + int locked = 0; - mmap_read_lock(mm); - ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked, - gup_flags | FOLL_TOUCH); - if (locked) - mmap_read_unlock(mm); - return ret; + return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, + &locked, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -2904,6 +2917,7 @@ static int internal_get_user_pages_fast(unsigned long start, { unsigned long len, end; unsigned long nr_pinned; + int locked = 0; int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | @@ -2932,8 +2946,9 @@ static int internal_get_user_pages_fast(unsigned long start, /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; - ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages, - gup_flags); + ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, + pages, NULL, &locked, + gup_flags | FOLL_TOUCH); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so @@ -3183,11 +3198,13 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, /* FOLL_GET and FOLL_PIN are mutually exclusive. 
*/ if (WARN_ON_ONCE(gup_flags & FOLL_GET)) return -EINVAL; + int locked = 0; if (WARN_ON_ONCE(!pages)) return -EINVAL; - gup_flags |= FOLL_PIN; - return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); + gup_flags |= FOLL_PIN | FOLL_TOUCH; + return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, + &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); From 7427c30bea1449a885a1dd9baf991aaad26209ce Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:23 -0400 Subject: [PATCH 417/505] mm/gup: remove obsolete FOLL_LONGTERM comment These days FOLL_LONGTERM is not allowed at all on any get_user_pages*() functions, it must be only be used with pin_user_pages*(), plus it now has universal support for all the pin_user_pages*() functions. Link: https://lkml.kernel.org/r/2-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 10a1e41f4e70..4396c7bf06d1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1053,12 +1053,6 @@ typedef unsigned int __bitwise zap_flags_t; * specifically failed. Filesystem pages are still subject to bugs and use of * FOLL_LONGTERM should be avoided on those pages. * - * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. - * Currently only get_user_pages() and get_user_pages_fast() support this flag - * and calls to get_user_pages_[un]locked are specifically not allowed. This - * is due to an incompatibility with the FS DAX check and - * FAULT_FLAG_ALLOW_RETRY. - * * In the CMA case: long term pins in a CMA region would unnecessarily fragment * that region. And so, CMA attempts to migrate the page before pinning, when * FOLL_LONGTERM is specified. From afa3c33e2684c2eec4f47d83d2859b76f3568be6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:24 -0400 Subject: [PATCH 418/505] mm/gup: don't call __gup_longterm_locked() if FOLL_LONGTERM cannot be set get_user_pages_remote(), get_user_pages_unlocked() and get_user_pages() are never called with FOLL_LONGTERM, so directly call __get_user_pages_locked() The next patch will add an assertion for this. 
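To make the split concrete, here is a hedged caller-side sketch (the buffer address and page count are placeholders, not taken from the patch): long-term pins always enter through the pin_user_pages*() family with FOLL_LONGTERM, which is why the plain get_user_pages*() entry points can bypass __gup_longterm_locked() entirely.

        /*
         * Illustrative caller sketch: FOLL_LONGTERM is only ever combined
         * with the pin_user_pages*() API, and the pages are later dropped
         * with unpin_user_pages().
         */
        struct page *pages[16];
        long npinned;

        npinned = pin_user_pages_fast(uaddr, 16,
                                      FOLL_WRITE | FOLL_LONGTERM, pages);
        if (npinned > 0) {
                /* ... long-lived use of the pages, e.g. for DMA ... */
                unpin_user_pages(pages, npinned);
        }
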
Link: https://lkml.kernel.org/r/3-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Suggested-by: John Hubbard Reviewed-by: John Hubbard Acked-by: Mike Rapoport (IBM) Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Signed-off-by: Andrew Morton --- mm/gup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 5b03d2fd3e1c..9bd4d775716a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2200,8 +2200,8 @@ long get_user_pages_remote(struct mm_struct *mm, if (!is_valid_gup_flags(gup_flags)) return -EINVAL; - return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2238,8 +2238,8 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, if (!is_valid_gup_flags(gup_flags)) return -EINVAL; - return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, NULL, gup_flags | FOLL_TOUCH); + return __get_user_pages_locked(current->mm, start, nr_pages, pages, + vmas, NULL, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -2263,8 +2263,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; - return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); + return __get_user_pages_locked(current->mm, start, nr_pages, pages, + NULL, &locked, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); From 7ce154fe6917e7db94d63bc4d6c73b678ad1c581 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:25 -0400 Subject: [PATCH 419/505] mm/gup: move try_grab_page() to mm/internal.h This is part of the internal function of gup.c and is only non-static so that the parts of gup.c in the huge_memory.c and hugetlb.c can call it. 
Put it in internal.h beside the similarly purposed try_grab_folio() Link: https://lkml.kernel.org/r/4-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/internal.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ed0cb2401f5..afefc166b349 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1268,8 +1268,6 @@ static inline void get_page(struct page *page) folio_get(page_folio(page)); } -int __must_check try_grab_page(struct page *page, unsigned int flags); - static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); diff --git a/mm/internal.h b/mm/internal.h index 90bb2078444c..3f75763b0fc2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -856,6 +856,7 @@ int migrate_device_coherent_page(struct page *page); * mm/gup.c */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); +int __must_check try_grab_page(struct page *page, unsigned int flags); extern bool mirrored_kernelcore; From d64e2dbc33a109a37ad4f5c18945c324345fe873 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:26 -0400 Subject: [PATCH 420/505] mm/gup: simplify the external interface functions and consolidate invariants The GUP family of functions have a complex, but fairly well defined, set of invariants for their arguments. Currently these are sprinkled about, sometimes in duplicate through many functions. Internally we don't follow all the invariants that the external interface has to follow, so place these checks directly at the exported interface. This ensures the internal functions never reach a violated invariant. Remove the duplicated invariant checks. The end result is to make these functions fully internal: __get_user_pages_locked() internal_get_user_pages_fast() __gup_longterm_locked() And all the other functions call directly into one of these. 
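Roughly, every exported wrapper now follows the same shape; the sketch below is a simplified restatement of the pattern introduced by this patch, not additional code.

        /*
         * Simplified shape of an exported entry point after this patch:
         * validate and normalise the arguments exactly once with
         * is_valid_gup_args(), then hand off to one of the three internal
         * workers with known-good arguments.
         */
        long pin_user_pages(unsigned long start, unsigned long nr_pages,
                            unsigned int gup_flags, struct page **pages,
                            struct vm_area_struct **vmas)
        {
                if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN))
                        return 0;

                return __gup_longterm_locked(current->mm, start, nr_pages,
                                             pages, vmas, NULL, gup_flags);
        }
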
Link: https://lkml.kernel.org/r/5-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Suggested-by: John Hubbard Reviewed-by: John Hubbard Acked-by: Mike Rapoport (IBM) Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Signed-off-by: Andrew Morton --- mm/gup.c | 159 +++++++++++++++++++++++------------------------ mm/huge_memory.c | 10 --- 2 files changed, 78 insertions(+), 91 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 9bd4d775716a..029de13c6718 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -215,7 +215,6 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) { struct folio *folio = page_folio(page); - WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; @@ -818,7 +817,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, if (vma_is_secretmem(vma)) return NULL; - if (foll_flags & FOLL_PIN) + if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) return NULL; page = follow_page_mask(vma, address, foll_flags, &ctx); @@ -975,9 +974,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; - if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA)) - return -EOPNOTSUPP; - if (vma_is_secretmem(vma)) return -EFAULT; @@ -1354,11 +1350,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, long ret, pages_done; bool must_unlock = false; - if (locked) { - /* if VM_FAULT_RETRY can be returned, vmas become invalid */ - BUG_ON(vmas); - } - /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. @@ -2087,16 +2078,6 @@ static long __gup_longterm_locked(struct mm_struct *mm, return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, gup_flags); - /* - * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM - * implies FOLL_PIN (although the reverse is not true). Therefore it is - * correct to unconditionally call check_and_migrate_movable_pages() - * which assumes pages have been pinned via FOLL_PIN. - * - * Enforce the above reasoning by asserting that FOLL_PIN is set. - */ - if (WARN_ON(!(gup_flags & FOLL_PIN))) - return -EINVAL; flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, @@ -2106,28 +2087,66 @@ static long __gup_longterm_locked(struct mm_struct *mm, rc = nr_pinned_pages; break; } + + /* FOLL_LONGTERM implies FOLL_PIN */ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); return rc ? rc : nr_pinned_pages; } -static bool is_valid_gup_flags(unsigned int gup_flags) +/* + * Check that the given flags are valid for the exported gup/pup interface, and + * update them with the required flags that the caller must have set. 
+ */ +static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, + int *locked, unsigned int *gup_flags_p, + unsigned int to_set) { + unsigned int gup_flags = *gup_flags_p; + /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that with an assertion: + * These flags not allowed to be specified externally to the gup + * interfaces: + * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only + * - FOLL_REMOTE is internal only and used on follow_page() */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) - return false; - /* - * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying - * that is, FOLL_LONGTERM is a specific case, more restrictive case of - * FOLL_PIN. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | + FOLL_REMOTE | FOLL_FAST_ONLY))) return false; + gup_flags |= to_set; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return false; + + /* LONGTERM can only be specified when pinning */ + if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM))) + return false; + + /* Pages input must be given if using GET/PIN */ + if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages)) + return false; + + /* At the external interface locked must be set */ + if (WARN_ON_ONCE(locked && *locked != 1)) + return false; + + /* We want to allow the pgmap to be hot-unplugged at all times */ + if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) && + (gup_flags & FOLL_PCI_P2PDMA))) + return false; + + /* + * Can't use VMAs with locked, as locked allows GUP to unlock + * which invalidates the vmas array + */ + if (WARN_ON_ONCE(vmas && locked)) + return false; + + *gup_flags_p = gup_flags; return true; } @@ -2197,11 +2216,12 @@ long get_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { - if (!is_valid_gup_flags(gup_flags)) + if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); + gup_flags); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2235,11 +2255,11 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - if (!is_valid_gup_flags(gup_flags)) + if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - vmas, NULL, gup_flags | FOLL_TOUCH); + vmas, NULL, gup_flags); } EXPORT_SYMBOL(get_user_pages); @@ -2263,8 +2283,11 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_TOUCH)) + return -EINVAL; + return __get_user_pages_locked(current->mm, start, nr_pages, pages, - NULL, &locked, gup_flags | FOLL_TOUCH); + NULL, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -2992,7 +3015,9 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. 
*/ - gup_flags |= FOLL_GET | FOLL_FAST_ONLY; + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + FOLL_GET | FOLL_FAST_ONLY)) + return -EINVAL; nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); @@ -3029,16 +3054,14 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - if (!is_valid_gup_flags(gup_flags)) - return -EINVAL; - /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK. However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ - gup_flags |= FOLL_GET; + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET)) + return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -3062,14 +3085,8 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; - - if (WARN_ON_ONCE(!pages)) - return -EINVAL; - - gup_flags |= FOLL_PIN; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); @@ -3085,20 +3102,14 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages, { int nr_pinned; - /* - * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API - * rules require returning 0, rather than -errno: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return 0; - - if (WARN_ON_ONCE(!pages)) - return 0; /* * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY); + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + FOLL_PIN | FOLL_FAST_ONLY)) + return 0; + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); /* @@ -3140,16 +3151,11 @@ long pin_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return -EINVAL; - - if (WARN_ON_ONCE(!pages)) - return -EINVAL; - + if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) + return 0; return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, - gup_flags | FOLL_PIN | FOLL_TOUCH | - FOLL_REMOTE); + gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -3174,14 +3180,8 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return -EINVAL; - - if (WARN_ON_ONCE(!pages)) - return -EINVAL; - - gup_flags |= FOLL_PIN; + if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN)) + return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, vmas, NULL, gup_flags); } @@ -3195,15 +3195,12 @@ EXPORT_SYMBOL(pin_user_pages); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { - /* FOLL_GET and FOLL_PIN are mutually exclusive. 
*/ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return -EINVAL; int locked = 0; - if (WARN_ON_ONCE(!pages)) - return -EINVAL; + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + FOLL_PIN | FOLL_TOUCH)) + return 0; - gup_flags |= FOLL_PIN | FOLL_TOUCH; return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, &locked, gup_flags); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1d6977dc6b31..1343a7d88299 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1042,11 +1042,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pmd_lockptr(mm, pmd)); - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) - return NULL; - if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; @@ -1205,11 +1200,6 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, if (flags & FOLL_WRITE && !pud_write(*pud)) return NULL; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) - return NULL; - if (pud_present(*pud) && pud_devmap(*pud)) /* pass */; else From 961ba47242510ac7af7c66733e471b9d3a6ade1a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:27 -0400 Subject: [PATCH 421/505] mm/gup: add an assertion that the mmap lock is locked Since commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") we already have this assertion, it is just buried in find_vma(): __get_user_pages_locked() __get_user_pages() find_extend_vma() find_vma() Also check it at the top of __get_user_pages_locked() as a form of documentation. Link: https://lkml.kernel.org/r/6-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/gup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index 029de13c6718..bc3f56b7621e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1360,6 +1360,8 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, must_unlock = true; *locked = 1; } + else + mmap_assert_locked(mm); if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags); From 6e4382c706f7701c4e4e430ebc9f817eeda64857 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:28 -0400 Subject: [PATCH 422/505] mm/gup: remove locked being NULL from faultin_vma_page_range() The only caller of this function always passes in a non-NULL locked, so just remove this obsolete comment. Link: https://lkml.kernel.org/r/7-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/gup.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index bc3f56b7621e..3afcd042f426 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1558,12 +1558,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * code on error (see __get_user_pages()). * * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and - * covered by the VMA. - * - * If @locked is NULL, it may be held for read or write and will be unperturbed. 
- * - * If @locked is non-NULL, it must held for read only and may be released. If - * it's released, *@locked will be set to 0. + * covered by the VMA. If it's released, *@locked will be set to 0. */ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked) From f04740f54594f85935e29a5c8ff6722f427f3dac Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:29 -0400 Subject: [PATCH 423/505] mm/gup: add FOLL_UNLOCKABLE Setting FOLL_UNLOCKABLE allows GUP to lock/unlock the mmap lock on its own. It is a more explicit replacement for locked != NULL. This clears the way for passing in locked = 1, without intending that the lock can be unlocked. Set the flag in all cases where it is used, eg locked is present in the external interface or locked is used internally with locked = 0. Link: https://lkml.kernel.org/r/8-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 1 + mm/gup.c | 36 +++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4396c7bf06d1..434b3ac8a351 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1104,5 +1104,6 @@ typedef unsigned int __bitwise zap_flags_t; #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ #define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ #define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ +#define FOLL_UNLOCKABLE 0x400000 /* allow unlocking the mmap lock (internal only) */ #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/gup.c b/mm/gup.c index 3afcd042f426..310fc6ab967e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -896,7 +896,7 @@ static int faultin_page(struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (locked) { + if (*flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set @@ -1382,9 +1382,11 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, vmas, locked); - if (!locked) + if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ - return ret; + pages_done = ret; + break; + } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ if (!*locked) { @@ -1532,6 +1534,9 @@ long populate_vma_page_range(struct vm_area_struct *vma, if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; + if (locked) + gup_flags |= FOLL_UNLOCKABLE; + /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. @@ -1583,7 +1588,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * a poisoned page. * !FOLL_FORCE: Require proper access permissions. 
*/ - gup_flags = FOLL_TOUCH | FOLL_HWPOISON; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE; if (write) gup_flags |= FOLL_WRITE; @@ -2107,12 +2112,20 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * interfaces: * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only * - FOLL_REMOTE is internal only and used on follow_page() + * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ - if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | + if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE | FOLL_REMOTE | FOLL_FAST_ONLY))) return false; gup_flags |= to_set; + if (locked) { + /* At the external interface locked must be set */ + if (WARN_ON_ONCE(*locked != 1)) + return false; + + gup_flags |= FOLL_UNLOCKABLE; + } /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == @@ -2127,10 +2140,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages)) return false; - /* At the external interface locked must be set */ - if (WARN_ON_ONCE(locked && *locked != 1)) - return false; - /* We want to allow the pgmap to be hot-unplugged at all times */ if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))) @@ -2140,7 +2149,7 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * Can't use VMAs with locked, as locked allows GUP to unlock * which invalidates the vmas array */ - if (WARN_ON_ONCE(vmas && locked)) + if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE))) return false; *gup_flags_p = gup_flags; @@ -2280,7 +2289,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_TOUCH)) + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + FOLL_TOUCH | FOLL_UNLOCKABLE)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, @@ -2968,7 +2978,7 @@ static int internal_get_user_pages_fast(unsigned long start, pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, pages, NULL, &locked, - gup_flags | FOLL_TOUCH); + gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so @@ -3195,7 +3205,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 0; if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, - FOLL_PIN | FOLL_TOUCH)) + FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, From 9a863a6a51394bff480c959b713874c090a8f5c6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:30 -0400 Subject: [PATCH 424/505] mm/gup: make locked never NULL in the internal GUP functions Now that NULL locked doesn't have a special meaning we can just make it non-NULL in all cases and remove the special tests. get_user_pages() and pin_user_pages() can safely pass in a locked = 1 get_user_pages_remote) and pin_user_pages_remote() can swap in a local variable for locked if NULL is passed. Remove all the NULL checks. 
Link: https://lkml.kernel.org/r/9-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Signed-off-by: Andrew Morton --- mm/gup.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 310fc6ab967e..4ce88e00e1c6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -879,9 +879,9 @@ unmap: } /* - * mmap_lock must be held on entry. If @locked != NULL and *@flags - * does not include FOLL_NOWAIT, the mmap_lock may be released. If it - * is, *@locked will be set to 0 and -EBUSY returned. + * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not + * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set + * to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int *flags, bool unshare, @@ -930,8 +930,8 @@ static int faultin_page(struct vm_area_struct *vma, * mmap lock in the page fault handler. Sanity check this. */ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); - if (locked) - *locked = 0; + *locked = 0; + /* * We should do the same as VM_FAULT_RETRY, but let's not * return -EBUSY since that's not reflecting the reality of @@ -951,7 +951,7 @@ static int faultin_page(struct vm_area_struct *vma, } if (ret & VM_FAULT_RETRY) { - if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) + if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; } @@ -1062,14 +1062,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * appropriate) must be called after the page is finished with, and * before put_page is called. * - * If @locked != NULL, *@locked will be set to 0 when mmap_lock is - * released by an up_read(). That can happen if @gup_flags does not - * have FOLL_NOWAIT. + * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may + * be released. If this happens *@locked will be set to 0 on return. * - * A caller using such a combination of @locked and @gup_flags - * must therefore hold the mmap_lock for reading only, and recognize - * when it's been released. Otherwise, it must be held for either - * reading or writing and will not be released. + * A caller using such a combination of @gup_flags must therefore hold the + * mmap_lock for reading only, and recognize when it's been released. Otherwise, + * it must be held for either reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if @@ -1121,7 +1119,7 @@ static long __get_user_pages(struct mm_struct *mm, i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, gup_flags, locked); - if (locked && *locked == 0) { + if (!*locked) { /* * We've got a VM_FAULT_RETRY * and we've lost mmap_lock. @@ -1354,7 +1352,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. 
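For context, the external locking contract is unchanged; a hedged caller sketch follows, in which mm, addr and page are placeholder names rather than code from the patch.

        /*
         * Illustrative external caller: either pass locked = NULL, or hold
         * the mmap lock and pass a pointer to 1.  If GUP dropped the lock,
         * *locked is 0 on return and the caller must not unlock again.
         */
        int locked = 1;
        long ret;

        mmap_read_lock(mm);
        ret = get_user_pages_remote(mm, addr, 1, FOLL_WRITE,
                                    &page, NULL, &locked);
        if (locked)
                mmap_read_unlock(mm);
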
*/ - if (locked && !*locked) { + if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; @@ -1502,6 +1500,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; + int local_locked = 1; int gup_flags; long ret; @@ -1542,7 +1541,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, NULL, locked); + NULL, NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; } @@ -1683,7 +1682,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ - if (locked && !*locked) { + if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; @@ -2222,11 +2221,14 @@ long get_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + int local_locked = 1; + if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2261,11 +2263,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { + int locked = 1; + if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - vmas, NULL, gup_flags); + vmas, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); @@ -3158,10 +3162,13 @@ long pin_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + int local_locked = 1; + if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; - return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, + locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -3187,10 +3194,12 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { + int locked = 1; + if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, NULL, gup_flags); + pages, vmas, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); From edad1bb1fbf7e28b49bf76b2aa66bfcaba00f627 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:31 -0400 Subject: [PATCH 425/505] mm/gup: remove pin_user_pages_fast_only() Commit ed29c2691188 ("drm/i915: Fix userptr so we do not have to worry about obj->mm.lock, v7.") removed the only caller, remove this dead code too. 
Link: https://lkml.kernel.org/r/10-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/gup.c | 33 --------------------------------- 2 files changed, 35 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index afefc166b349..a0d59645dcf9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2261,8 +2261,6 @@ extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); -int pin_user_pages_fast_only(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) diff --git a/mm/gup.c b/mm/gup.c index 4ce88e00e1c6..91c8ab53f43f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3102,39 +3102,6 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); -/* - * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior - * is the same, except that this one sets FOLL_PIN instead of FOLL_GET. - * - * The API rules are the same, too: no negative values may be returned. - */ -int pin_user_pages_fast_only(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - int nr_pinned; - - /* - * FOLL_FAST_ONLY is required in order to match the API description of - * this routine: no fall back to regular ("slow") GUP. - */ - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, - FOLL_PIN | FOLL_FAST_ONLY)) - return 0; - - nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, - pages); - /* - * This routine is not allowed to return negative values. However, - * internal_get_user_pages_fast() *can* return -errno. Therefore, - * correct for that here: - */ - if (nr_pinned < 0) - nr_pinned = 0; - - return nr_pinned; -} -EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); - /** * pin_user_pages_remote() - pin pages of a remote process * From 9198a9196ee67814a101c178ed828f8ea9c2965e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:32 -0400 Subject: [PATCH 426/505] mm/gup: make get_user_pages_fast_only() return the common return value There are only two callers, both can handle the common return code: - get_user_page_fast_only() checks == 1 - gfn_to_page_many_atomic() already returns -1, and the only caller checks for negative return values Remove the restriction against returning negative values. Link: https://lkml.kernel.org/r/11-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Acked-by: Mike Rapoport (IBM) Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Howells Signed-off-by: Andrew Morton --- mm/gup.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 91c8ab53f43f..91d047096c09 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3005,8 +3005,6 @@ static int internal_get_user_pages_fast(unsigned long start, * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. 
* * If the architecture does not support this function, simply return with no * pages pinned. @@ -3018,7 +3016,6 @@ static int internal_get_user_pages_fast(unsigned long start, int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - int nr_pinned; /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. @@ -3030,19 +3027,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; - nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, - pages); - - /* - * As specified in the API description above, this routine is not - * allowed to return negative values. However, the common core - * routine internal_get_user_pages_fast() *can* return -errno. - * Therefore, correct for that here: - */ - if (nr_pinned < 0) - nr_pinned = 0; - - return nr_pinned; + return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); From 63b605128655f2e3968d99e30b293c7e7eaa2fc2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:33 -0400 Subject: [PATCH 427/505] mm/gup: move gup_must_unshare() to mm/internal.h This function is only used in gup.c and closely related. It touches FOLL_PIN so it must be moved before the next patch. Link: https://lkml.kernel.org/r/12-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm.h | 65 ---------------------------------------------- mm/internal.h | 65 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a0d59645dcf9..4a0695ef969a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3180,71 +3180,6 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } -/* - * Indicates for which pages that are write-protected in the page table, - * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the - * GUP pin will remain consistent with the pages mapped into the page tables - * of the MM. - * - * Temporary unmapping of PageAnonExclusive() pages or clearing of - * PageAnonExclusive() has to protect against concurrent GUP: - * * Ordinary GUP: Using the PT lock - * * GUP-fast and fork(): mm->write_protect_seq - * * GUP-fast and KSM or temporary unmapping (swap, migration): see - * page_try_share_anon_rmap() - * - * Must be called with the (sub)page that's actually referenced via the - * page table entry, which might not necessarily be the head page for a - * PTE-mapped THP. - * - * If the vma is NULL, we're coming from the GUP-fast path and might have - * to fallback to the slow path just to lookup the vma. - */ -static inline bool gup_must_unshare(struct vm_area_struct *vma, - unsigned int flags, struct page *page) -{ - /* - * FOLL_WRITE is implicitly handled correctly as the page table entry - * has to be writable -- and if it references (part of) an anonymous - * folio, that part is required to be marked exclusive. - */ - if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) - return false; - /* - * Note: PageAnon(page) is stable until the page is actually getting - * freed. 
- */ - if (!PageAnon(page)) { - /* - * We only care about R/O long-term pining: R/O short-term - * pinning does not have the semantics to observe successive - * changes through the process page tables. - */ - if (!(flags & FOLL_LONGTERM)) - return false; - - /* We really need the vma ... */ - if (!vma) - return true; - - /* - * ... because we only care about writable private ("COW") - * mappings where we have to break COW early. - */ - return is_cow_mapping(vma->vm_flags); - } - - /* Paired with a memory barrier in page_try_share_anon_rmap(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) - smp_rmb(); - - /* - * Note that PageKsm() pages cannot be exclusive, and consequently, - * cannot get pinned. - */ - return !PageAnonExclusive(page); -} - /* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. diff --git a/mm/internal.h b/mm/internal.h index 3f75763b0fc2..4f5ca3401b05 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -858,6 +858,71 @@ int migrate_device_coherent_page(struct page *page); struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); int __must_check try_grab_page(struct page *page, unsigned int flags); +/* + * Indicates for which pages that are write-protected in the page table, + * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the + * GUP pin will remain consistent with the pages mapped into the page tables + * of the MM. + * + * Temporary unmapping of PageAnonExclusive() pages or clearing of + * PageAnonExclusive() has to protect against concurrent GUP: + * * Ordinary GUP: Using the PT lock + * * GUP-fast and fork(): mm->write_protect_seq + * * GUP-fast and KSM or temporary unmapping (swap, migration): see + * page_try_share_anon_rmap() + * + * Must be called with the (sub)page that's actually referenced via the + * page table entry, which might not necessarily be the head page for a + * PTE-mapped THP. + * + * If the vma is NULL, we're coming from the GUP-fast path and might have + * to fallback to the slow path just to lookup the vma. + */ +static inline bool gup_must_unshare(struct vm_area_struct *vma, + unsigned int flags, struct page *page) +{ + /* + * FOLL_WRITE is implicitly handled correctly as the page table entry + * has to be writable -- and if it references (part of) an anonymous + * folio, that part is required to be marked exclusive. + */ + if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) + return false; + /* + * Note: PageAnon(page) is stable until the page is actually getting + * freed. + */ + if (!PageAnon(page)) { + /* + * We only care about R/O long-term pining: R/O short-term + * pinning does not have the semantics to observe successive + * changes through the process page tables. + */ + if (!(flags & FOLL_LONGTERM)) + return false; + + /* We really need the vma ... */ + if (!vma) + return true; + + /* + * ... because we only care about writable private ("COW") + * mappings where we have to break COW early. + */ + return is_cow_mapping(vma->vm_flags); + } + + /* Paired with a memory barrier in page_try_share_anon_rmap(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_rmb(); + + /* + * Note that PageKsm() pages cannot be exclusive, and consequently, + * cannot get pinned. 
+ */ + return !PageAnonExclusive(page); +} + extern bool mirrored_kernelcore; static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) From 2c2241081f7dec878331fdc3a3f2361e99556bca Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:34 -0400 Subject: [PATCH 428/505] mm/gup: move private gup FOLL_ flags to internal.h Move the flags that should not/are not used outside gup.c and related into mm/internal.h to discourage driver abuse. To make this more maintainable going forward compact the two FOLL ranges with new bit numbers from 0 to 11 and 16 to 21, using shifts so it is explicit. Switch to an enum so the whole thing is easier to read. Link: https://lkml.kernel.org/r/13-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Acked-by: David Hildenbrand Cc: David Howells Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: Alistair Popple Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 57 ++++++++++++++++++++++++---------------- mm/internal.h | 15 +++++++++++ 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 434b3ac8a351..56753d0f096d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1040,9 +1040,6 @@ typedef unsigned int __bitwise zap_flags_t; * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each * other. Here is what they mean, and how to use them: * - * FOLL_LONGTERM indicates that the page will be held for an indefinite time - * period _often_ under userspace control. This is in contrast to - * iov_iter_get_pages(), whose usages are transient. * * FIXME: For pages which are part of a filesystem, mappings are subject to the * lifetime enforced by the filesystem and we need guarantees that longterm @@ -1086,24 +1083,40 @@ typedef unsigned int __bitwise zap_flags_t; * Please see Documentation/core-api/pin_user_pages.rst for more information. 
*/ -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ -#define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ -#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ -#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO - * and return without waiting upon it */ -#define FOLL_NOFAULT 0x80 /* do not fault in pages */ -#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_ANON 0x8000 /* don't do file mappings */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ -#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ -#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ -#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ -#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ -#define FOLL_UNLOCKABLE 0x400000 /* allow unlocking the mmap lock (internal only) */ +enum { + /* check pte is writable */ + FOLL_WRITE = 1 << 0, + /* do get_page on page */ + FOLL_GET = 1 << 1, + /* give error on hole if it would be zero */ + FOLL_DUMP = 1 << 2, + /* get_user_pages read/write w/o permission */ + FOLL_FORCE = 1 << 3, + /* + * if a disk transfer is needed, start the IO and return without waiting + * upon it + */ + FOLL_NOWAIT = 1 << 4, + /* do not fault in pages */ + FOLL_NOFAULT = 1 << 5, + /* check page is hwpoisoned */ + FOLL_HWPOISON = 1 << 6, + /* don't do file mappings */ + FOLL_ANON = 1 << 7, + /* + * FOLL_LONGTERM indicates that the page will be held for an indefinite + * time period _often_ under userspace control. This is in contrast to + * iov_iter_get_pages(), whose usages are transient. 
+ */ + FOLL_LONGTERM = 1 << 8, + /* split huge pmd before returning */ + FOLL_SPLIT_PMD = 1 << 9, + /* allow returning PCI P2PDMA pages */ + FOLL_PCI_P2PDMA = 1 << 10, + /* allow interrupts from generic signals */ + FOLL_INTERRUPTIBLE = 1 << 11, + + /* See also internal only FOLL flags in mm/internal.h */ +}; #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/internal.h b/mm/internal.h index 4f5ca3401b05..dfb37e94e140 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -858,6 +858,21 @@ int migrate_device_coherent_page(struct page *page); struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); int __must_check try_grab_page(struct page *page, unsigned int flags); +enum { + /* mark page accessed */ + FOLL_TOUCH = 1 << 16, + /* a retry, previous pass started an IO */ + FOLL_TRIED = 1 << 17, + /* we are working on non-current tsk/mm */ + FOLL_REMOTE = 1 << 18, + /* pages must be released via unpin_user_page */ + FOLL_PIN = 1 << 19, + /* gup_fast: prevent fall-back to slow gup */ + FOLL_FAST_ONLY = 1 << 20, + /* allow unlocking the mmap lock */ + FOLL_UNLOCKABLE = 1 << 21, +}; + /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the From e56397e8c40da82c78dccb6f48bfa21e88ccb1e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 7 Feb 2023 19:21:15 +0000 Subject: [PATCH 429/505] mm/damon/sysfs: make kobj_type structures constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit ee6d3dd4ed48 ("driver core: make kobj_type constant.") the driver core allows the usage of const struct kobj_type. Take advantage of this to constify the structure definitions to prevent modification at runtime. 
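The change is mechanical; as a hedged illustration (the release callback and attribute group names below are made up, not DAMON symbols):

        /* A kobj_type can now be declared const and still be registered: */
        static const struct kobj_type example_ktype = {
                .release        = example_release,
                .sysfs_ops      = &kobj_sysfs_ops,
                .default_groups = example_groups,
        };

The kobject registration helpers accept a const ktype since commit ee6d3dd4ed48, so no cast is needed at the call sites.
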
Link: https://lkml.kernel.org/r/20230207-kobj_type-damon-v1-1-9d4fea6a465b@weissschuh.net Signed-off-by: Thomas Weißschuh Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.c | 2 +- mm/damon/sysfs-common.h | 4 ++-- mm/damon/sysfs-schemes.c | 18 +++++++++--------- mm/damon/sysfs.c | 22 +++++++++++----------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 52bebf242f74..70edf45c2174 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -99,7 +99,7 @@ static struct attribute *damon_sysfs_ul_range_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_ul_range); -struct kobj_type damon_sysfs_ul_range_ktype = { +const struct kobj_type damon_sysfs_ul_range_ktype = { .release = damon_sysfs_ul_range_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_ul_range_groups, diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 604a6cbc3ede..db677eba78fd 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -21,7 +21,7 @@ struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( unsigned long max); void damon_sysfs_ul_range_release(struct kobject *kobj); -extern struct kobj_type damon_sysfs_ul_range_ktype; +extern const struct kobj_type damon_sysfs_ul_range_ktype; /* * schemes directory @@ -36,7 +36,7 @@ struct damon_sysfs_schemes { struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void); void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes); -extern struct kobj_type damon_sysfs_schemes_ktype; +extern const struct kobj_type damon_sysfs_schemes_ktype; int damon_sysfs_set_schemes(struct damon_ctx *ctx, struct damon_sysfs_schemes *sysfs_schemes); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 86edca66aab1..3cdad5a7f936 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -103,7 +103,7 @@ static struct attribute *damon_sysfs_scheme_region_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); -static struct kobj_type damon_sysfs_scheme_region_ktype = { +static const struct kobj_type damon_sysfs_scheme_region_ktype = { .release = damon_sysfs_scheme_region_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_scheme_region_groups, @@ -153,7 +153,7 @@ static struct attribute *damon_sysfs_scheme_regions_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); -static struct kobj_type damon_sysfs_scheme_regions_ktype = { +static const struct kobj_type damon_sysfs_scheme_regions_ktype = { .release = damon_sysfs_scheme_regions_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_scheme_regions_groups, @@ -252,7 +252,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_stats); -static struct kobj_type damon_sysfs_stats_ktype = { +static const struct kobj_type damon_sysfs_stats_ktype = { .release = damon_sysfs_stats_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_stats_groups, @@ -678,7 +678,7 @@ static struct attribute *damon_sysfs_watermarks_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_watermarks); -static struct kobj_type damon_sysfs_watermarks_ktype = { +static const struct kobj_type damon_sysfs_watermarks_ktype = { .release = damon_sysfs_watermarks_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_watermarks_groups, @@ -789,7 +789,7 @@ static struct attribute *damon_sysfs_weights_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_weights); -static struct kobj_type damon_sysfs_weights_ktype 
= { +static const struct kobj_type damon_sysfs_weights_ktype = { .release = damon_sysfs_weights_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_weights_groups, @@ -920,7 +920,7 @@ static struct attribute *damon_sysfs_quotas_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); -static struct kobj_type damon_sysfs_quotas_ktype = { +static const struct kobj_type damon_sysfs_quotas_ktype = { .release = damon_sysfs_quotas_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_quotas_groups, @@ -1019,7 +1019,7 @@ static struct attribute *damon_sysfs_access_pattern_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); -static struct kobj_type damon_sysfs_access_pattern_ktype = { +static const struct kobj_type damon_sysfs_access_pattern_ktype = { .release = damon_sysfs_access_pattern_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_access_pattern_groups, @@ -1279,7 +1279,7 @@ static struct attribute *damon_sysfs_scheme_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_scheme); -static struct kobj_type damon_sysfs_scheme_ktype = { +static const struct kobj_type damon_sysfs_scheme_ktype = { .release = damon_sysfs_scheme_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_scheme_groups, @@ -1396,7 +1396,7 @@ static struct attribute *damon_sysfs_schemes_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_schemes); -struct kobj_type damon_sysfs_schemes_ktype = { +const struct kobj_type damon_sysfs_schemes_ktype = { .release = damon_sysfs_schemes_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_schemes_groups, diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index aeb0beb1da91..33e1d5c9cb54 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -81,7 +81,7 @@ static struct attribute *damon_sysfs_region_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_region); -static struct kobj_type damon_sysfs_region_ktype = { +static const struct kobj_type damon_sysfs_region_ktype = { .release = damon_sysfs_region_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_region_groups, @@ -198,7 +198,7 @@ static struct attribute *damon_sysfs_regions_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_regions); -static struct kobj_type damon_sysfs_regions_ktype = { +static const struct kobj_type damon_sysfs_regions_ktype = { .release = damon_sysfs_regions_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_regions_groups, @@ -277,7 +277,7 @@ static struct attribute *damon_sysfs_target_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_target); -static struct kobj_type damon_sysfs_target_ktype = { +static const struct kobj_type damon_sysfs_target_ktype = { .release = damon_sysfs_target_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_target_groups, @@ -402,7 +402,7 @@ static struct attribute *damon_sysfs_targets_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_targets); -static struct kobj_type damon_sysfs_targets_ktype = { +static const struct kobj_type damon_sysfs_targets_ktype = { .release = damon_sysfs_targets_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_targets_groups, @@ -530,7 +530,7 @@ static struct attribute *damon_sysfs_intervals_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_intervals); -static struct kobj_type damon_sysfs_intervals_ktype = { +static const struct kobj_type damon_sysfs_intervals_ktype = { .release = damon_sysfs_intervals_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_intervals_groups, @@ -612,7 +612,7 @@ static struct attribute 
*damon_sysfs_attrs_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_attrs); -static struct kobj_type damon_sysfs_attrs_ktype = { +static const struct kobj_type damon_sysfs_attrs_ktype = { .release = damon_sysfs_attrs_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_attrs_groups, @@ -800,7 +800,7 @@ static struct attribute *damon_sysfs_context_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_context); -static struct kobj_type damon_sysfs_context_ktype = { +static const struct kobj_type damon_sysfs_context_ktype = { .release = damon_sysfs_context_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_context_groups, @@ -926,7 +926,7 @@ static struct attribute *damon_sysfs_contexts_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_contexts); -static struct kobj_type damon_sysfs_contexts_ktype = { +static const struct kobj_type damon_sysfs_contexts_ktype = { .release = damon_sysfs_contexts_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_contexts_groups, @@ -1564,7 +1564,7 @@ static struct attribute *damon_sysfs_kdamond_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_kdamond); -static struct kobj_type damon_sysfs_kdamond_ktype = { +static const struct kobj_type damon_sysfs_kdamond_ktype = { .release = damon_sysfs_kdamond_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_kdamond_groups, @@ -1707,7 +1707,7 @@ static struct attribute *damon_sysfs_kdamonds_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_kdamonds); -static struct kobj_type damon_sysfs_kdamonds_ktype = { +static const struct kobj_type damon_sysfs_kdamonds_ktype = { .release = damon_sysfs_kdamonds_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_kdamonds_groups, @@ -1757,7 +1757,7 @@ static struct attribute *damon_sysfs_ui_dir_attrs[] = { }; ATTRIBUTE_GROUPS(damon_sysfs_ui_dir); -static struct kobj_type damon_sysfs_ui_dir_ktype = { +static const struct kobj_type damon_sysfs_ui_dir_ktype = { .release = damon_sysfs_ui_dir_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = damon_sysfs_ui_dir_groups, From 223ec6ab265ead0b319bc2f15d0d1be05078a74b Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 7 Feb 2023 06:27:00 +0000 Subject: [PATCH 430/505] mm/memremap.c: fix outdated comment in devm_memremap_pages commit a4574f63edc6 ("mm/memremap_pages: convert to 'struct range'") converted res to range, update the comment correspondingly. Link: https://lkml.kernel.org/r/1675751220-2-1-git-send-email-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Cc: Dan Williams Signed-off-by: Andrew Morton --- mm/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memremap.c b/mm/memremap.c index 2f88f43d4a01..bee85560a243 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -385,7 +385,7 @@ EXPORT_SYMBOL_GPL(memremap_pages); * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res and type members of @pgmap must be initialized + * 1/ At a minimum the range and type members of @pgmap must be initialized * by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case From f528260b1a7d52140dfeb58857e13fc98ac193ef Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 13 Feb 2023 13:43:24 -0800 Subject: [PATCH 431/505] mm/khugepaged: fix invalid page access in release_pte_pages() release_pte_pages() converts from a pfn to a folio by using pfn_folio(). If the pte is not mapped, pfn_folio() will result in undefined behavior which ends up causing a kernel panic[1]. 
To fix the issue, only call pfn_folio() once we have validated that the pte is both valid and mapped. [1] https://lore.kernel.org/linux-mm/ff300770-afe9-908d-23ed-d23e0796e899@samsung.com/ Link: https://lkml.kernel.org/r/20230213214324.34215-1-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Fixes: 9bdfeea46f49 ("mm/khugepaged: convert release_pte_pages() to use folios") Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Debugged-by: Alexandre Ghiti Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/khugepaged.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b39ab219d5b7..bd54b957f69a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -511,11 +511,17 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, while (--_pte >= pte) { pte_t pteval = *_pte; + unsigned long pfn; - folio = pfn_folio(pte_pfn(pteval)); - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && - !folio_test_large(folio)) - release_pte_folio(folio); + if (pte_none(pteval)) + continue; + pfn = pte_pfn(pteval); + if (is_zero_pfn(pfn)) + continue; + folio = pfn_folio(pfn); + if (folio_test_large(folio)) + continue; + release_pte_folio(folio); } list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { From 6aa3a920125e9f58891e2b5dc2efd4d0c1ff05a6 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:50 -0600 Subject: [PATCH 432/505] mm/hugetlb: convert isolate_hugetlb to folios Patch series "continue hugetlb folio conversion", v3. This series continues the conversion of core hugetlb functions to use folios. This series converts many helper functions in the hugetlb fault path. This is in preparation for another series to convert the hugetlb fault code paths to operate on folios. This patch (of 8): Convert isolate_hugetlb() to take in a folio and convert its callers to pass a folio. Using page_folio() to convert the callers is safe, as isolate_hugetlb() operates on a head page.
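
For illustration, a minimal caller-side sketch of the new calling convention; the wrapper name is hypothetical and not part of this patch:

	static int isolate_hugetlb_page(struct page *page, struct list_head *list)
	{
		/* page is assumed to be a hugetlb head page, as at the call sites below */
		struct folio *folio = page_folio(page);	/* no extra reference taken */

		return isolate_hugetlb(folio, list);	/* 0 on success, -EBUSY otherwise */
	}
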
Link: https://lkml.kernel.org/r/20230113223057.173292-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20230113223057.173292-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/gup.c | 2 +- mm/hugetlb.c | 16 ++++++++-------- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a51e6daacac6..6e38a019f654 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct page *page, struct list_head *list); +int isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -413,7 +413,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct page *page, struct list_head *list) +static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) { return -EBUSY; } diff --git a/mm/gup.c b/mm/gup.c index 25e4a3d923d6..b0885f70579c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1930,7 +1930,7 @@ static unsigned long collect_longterm_unpinnable_pages( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(&folio->page, movable_page_list); + isolate_hugetlb(folio, movable_page_list); continue; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab35b1cc9927..0c1e1ce113c8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2925,7 +2925,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(&old_folio->page, list); + ret = isolate_hugetlb(old_folio, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -3000,7 +3000,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7250,19 +7250,19 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. 
*/ -int isolate_hugetlb(struct page *page, struct list_head *list) +int isolate_hugetlb(struct folio *folio, struct list_head *list) { int ret = 0; spin_lock_irq(&hugetlb_lock); - if (!PageHeadHuge(page) || - !HPageMigratable(page) || - !get_page_unless_zero(page)) { + if (!folio_test_hugetlb(folio) || + !folio_test_hugetlb_migratable(folio) || + !folio_try_get(folio)) { ret = -EBUSY; goto unlock; } - ClearHPageMigratable(page); - list_move_tail(&page->lru, list); + folio_clear_hugetlb_migratable(folio); + list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b4b30d9b0782..db85c2d37f70 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2508,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page, pagelist); + isolated = !isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd40f7e9f176..a1e8c3e9ab08 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(head, &source); + isolate_hugetlb(folio, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dd5ca942256f..fc034b070645 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -602,7 +602,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page, qp->pagelist) && + if (isolate_hugetlb(page_folio(page), qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index 206fcdbe67f3..f6464bce7678 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1773,7 +1773,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page, pagelist); + err = isolate_hugetlb(page_folio(page), pagelist); if (!err) err = 1; } From 6f6956cf7e6a3034f61780446547e849aa4e216d Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:51 -0600 Subject: [PATCH 433/505] mm/hugetlb: convert __update_and_free_page() to folios Change __update_and_free_page() to __update_and_free_hugetlb_folio() by changing its callers to pass in a folio. 
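
As a reminder of the conversion idiom used throughout this series (illustrative sketch only; "h" and "page" stand for the caller's context, as in free_hpage_workfn()):

	struct folio *folio = page_folio(page);		/* any page -> its containing folio */
	struct page *head = &folio->page;		/* folio -> its head page */

	__update_and_free_hugetlb_folio(h, folio);	/* internal helper now takes the folio */
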
Link: https://lkml.kernel.org/r/20230113223057.173292-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0c1e1ce113c8..d27fcf768548 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1698,10 +1698,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, enqueue_hugetlb_folio(h, folio); } -static void __update_and_free_page(struct hstate *h, struct page *page) +static void __update_and_free_hugetlb_folio(struct hstate *h, + struct folio *folio) { int i; - struct folio *folio = page_folio(page); struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) @@ -1714,7 +1714,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, page)) { + if (hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1750,7 +1750,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - __free_pages(page, huge_page_order(h)); + __free_pages(&folio->page, huge_page_order(h)); } } @@ -1790,7 +1790,7 @@ static void free_hpage_workfn(struct work_struct *work) */ h = size_to_hstate(page_size(page)); - __update_and_free_page(h, page); + __update_and_free_hugetlb_folio(h, page_folio(page)); cond_resched(); } @@ -1807,7 +1807,7 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { - __update_and_free_page(h, &folio->page); + __update_and_free_hugetlb_folio(h, folio); return; } From a36f1e9024740c3820427afca4cd375e32a1bb15 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:52 -0600 Subject: [PATCH 434/505] mm/hugetlb: convert dequeue_hugetlb_page functions to folios dequeue_huge_page_node_exact() is changed to dequeue_hugetlb_folio_node_ exact() and dequeue_huge_page_nodemask() is changed to dequeue_hugetlb_ folio_nodemask(). Update their callers to pass in a folio. 
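
A minimal caller-side sketch (illustrative only, mirroring the alloc_huge_page_nodemask() hunk below): the dequeue helpers now hand back a folio, and page-based callers convert at the boundary:

	struct folio *folio;

	folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask);
	if (folio)
		return &folio->page;	/* page-based callers get the head page */
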
Link: https://lkml.kernel.org/r/20230113223057.173292-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 56 ++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d27fcf768548..3e648fccf33e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1282,32 +1282,33 @@ static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) folio_set_hugetlb_freed(folio); } -static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) +static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, + int nid) { - struct page *page; + struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (pin && !is_longterm_pinnable_page(page)) + list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { + if (pin && !folio_is_longterm_pinnable(folio)) continue; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) continue; - list_move(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); - ClearHPageFreed(page); + list_move(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); + folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; - return page; + return folio; } return NULL; } -static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, - nodemask_t *nmask) +static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; @@ -1320,7 +1321,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { - struct page *page; + struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; @@ -1332,9 +1333,9 @@ retry_cpuset: continue; node = zone_to_nid(zone); - page = dequeue_huge_page_node_exact(h, node); - if (page) - return page; + folio = dequeue_hugetlb_folio_node_exact(h, node); + if (folio) + return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; @@ -1352,7 +1353,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1374,22 +1375,24 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (!folio) + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); - if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { - SetHPageRestoreReserve(page); + if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } mpol_cond_put(mpol); - return page; + return &folio->page; err: return NULL; @@ 
-2475,12 +2478,13 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { - struct page *page; + struct folio *folio; - page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); - if (page) { + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + preferred_nid, nmask); + if (folio) { spin_unlock_irq(&hugetlb_lock); - return page; + return &folio->page; } } spin_unlock_irq(&hugetlb_lock); From 3a740e8bb56ef7ee6b9098b694caabab843be067 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:53 -0600 Subject: [PATCH 435/505] mm/hugetlb: convert alloc_surplus_huge_page() to folios Change alloc_surplus_huge_page() to alloc_surplus_hugetlb_folio() and update its callers. Link: https://lkml.kernel.org/r/20230113223057.173292-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e648fccf33e..fa61b4aa68ca 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2378,8 +2378,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) /* * Allocates a fresh surplus page from the page allocator. */ -static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) +static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio = NULL; @@ -2416,7 +2416,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, out_unlock: spin_unlock_irq(&hugetlb_lock); - return &folio->page; + return folio; } static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, @@ -2449,7 +2449,7 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; @@ -2460,16 +2460,16 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask); + folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + if (!folio) + folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); - return page; + return &folio->page; } /* page migration callback function */ @@ -2518,6 +2518,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); + struct folio *folio; struct page *page, *tmp; int ret; long i; @@ -2537,13 +2538,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); - if (!page) { + if (!folio) { alloc_ok = false; break; } - list_add(&page->lru, &surplus_list); + list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; @@ -3496,7 +3497,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 
* First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with alloc_surplus_huge_page() here and be unable + * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -3539,7 +3540,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * alloc_surplus_huge_page() is checking the global counter, + * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. From ff7d853b031302376a0d3640fa1c463d94079637 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:54 -0600 Subject: [PATCH 436/505] mm/hugetlb: increase use of folios in alloc_huge_page() Change hugetlb_cgroup_commit_charge{,_rsvd}(), dequeue_huge_page_vma() and alloc_buddy_huge_page_with_mpol() to use folios so alloc_huge_page() is cleaned by operating on folios until its return. Link: https://lkml.kernel.org/r/20230113223057.173292-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 8 ++++---- mm/hugetlb.c | 33 ++++++++++++++++----------------- mm/hugetlb_cgroup.c | 8 ++------ 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index f706626a8063..3d82d91f49ac 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -141,10 +141,10 @@ extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, @@ -230,14 +230,14 @@ static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx, static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fa61b4aa68ca..5d0d1efbe590 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1348,7 +1348,7 @@ static unsigned long available_huge_pages(struct hstate *h) return h->free_huge_pages - h->resv_huge_pages; } -static struct page *dequeue_huge_page_vma(struct hstate *h, +static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, long chg) @@ -1392,7 +1392,7 @@ static struct page 
*dequeue_huge_page_vma(struct hstate *h, } mpol_cond_put(mpol); - return &folio->page; + return folio; err: return NULL; @@ -2446,7 +2446,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static -struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, +struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = NULL; @@ -2469,7 +2469,7 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, if (!folio) folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); - return &folio->page; + return folio; } /* page migration callback function */ @@ -3018,7 +3018,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); - struct page *page; struct folio *folio; long map_chg, map_commit; long gbl_chg; @@ -3082,34 +3081,34 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); - if (!page) { + folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + if (!folio) { spin_unlock_irq(&hugetlb_lock); - page = alloc_buddy_huge_page_with_mpol(h, vma, addr); - if (!page) + folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); + if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } - list_add(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); + list_add(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); /* Fall through */ } - folio = page_folio(page); - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. 
*/ if (deferred_reserve) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), - h_cg, page); + h_cg, folio); } spin_unlock_irq(&hugetlb_lock); - hugetlb_set_page_subpool(page, spool); + hugetlb_set_folio_subpool(folio, spool); map_commit = vma_commit_reservation(h, vma, addr); if (unlikely(map_chg > map_commit)) { @@ -3130,7 +3129,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return page; + return &folio->page; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d9e4425d81ac..dedd2edb076e 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -331,19 +331,15 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } From e37d3e838d9078538f920957d1e89682b6764977 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:55 -0600 Subject: [PATCH 437/505] mm/hugetlb: convert alloc_migrate_huge_page to folios Change alloc_huge_page_nodemask() to alloc_hugetlb_folio_nodemask() and alloc_migrate_huge_page() to alloc_migrate_hugetlb_folio(). Both functions now return a folio rather than a page. Link: https://lkml.kernel.org/r/20230113223057.173292-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 18 +++++++++--------- mm/migrate.c | 5 ++++- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6e38a019f654..2375c62c61a4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -719,7 +719,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); @@ -1040,8 +1040,8 @@ static inline struct page *alloc_huge_page(struct vm_area_struct *vma, return NULL; } -static inline struct page * -alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +static inline struct folio * +alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d0d1efbe590..57894beb3382 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2419,7 +2419,7 @@ out_unlock: return folio; } -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio 
*folio; @@ -2439,7 +2439,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, */ folio_set_hugetlb_temporary(folio); - return &folio->page; + return folio; } /* @@ -2472,8 +2472,8 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, return folio; } -/* page migration callback function */ -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +/* folio migration callback function */ +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); @@ -2484,12 +2484,12 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, preferred_nid, nmask); if (folio) { spin_unlock_irq(&hugetlb_lock); - return &folio->page; + return folio; } } spin_unlock_irq(&hugetlb_lock); - return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } /* mempolicy aware migration callback */ @@ -2498,16 +2498,16 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, { struct mempolicy *mpol; nodemask_t *nodemask; - struct page *page; + struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return page; + return &folio->page; } /* diff --git a/mm/migrate.c b/mm/migrate.c index f6464bce7678..811e76c6fac1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1663,6 +1663,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; + struct folio *hugetlb_folio = NULL; struct folio *new_folio = NULL; int nid; int zidx; @@ -1677,7 +1678,9 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) struct hstate *h = folio_hstate(folio); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); - return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); + hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid, + mtc->nmask, gfp_mask); + return &hugetlb_folio->page; } if (folio_test_large(folio)) { From 0ffdc38eb564c1c71a58bbaf874945ba54293ff9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:56 -0600 Subject: [PATCH 438/505] mm/hugetlb: convert restore_reserve_on_error() to folios Use the hugetlb folio flag macros inside restore_reserve_on_error() and update the comments to reflect the use of folios. 
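
For reference, the substitutions made below map the page-based huge page flag macros onto their folio equivalents (with folio = page_folio(page)):

	HPageRestoreReserve(page)       -> folio_test_hugetlb_restore_reserve(folio)
	SetHPageRestoreReserve(page)    -> folio_set_hugetlb_restore_reserve(folio)
	ClearHPageRestoreReserve(page)  -> folio_clear_hugetlb_restore_reserve(folio)
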
Link: https://lkml.kernel.org/r/20230113223057.173292-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 57894beb3382..3120c3db60c4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2819,22 +2819,23 @@ static long vma_del_reservation(struct hstate *h, void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) { + struct folio *folio = page_folio(page); long rc = vma_needs_reservation(h, vma, address); - if (HPageRestoreReserve(page)) { + if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map - * manipulation. Clear HPageRestoreReserve so that - * global reserve count will not be incremented + * manipulation. Clear hugetlb_restore_reserve so + * that global reserve count will not be incremented * by free_huge_page. This will make it appear - * as though the reservation for this page was + * as though the reservation for this folio was * consumed. This may prevent the task from - * faulting in the page at a later time. This + * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else @@ -2845,7 +2846,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, * This indicates there is an entry in the reserve map * not added by alloc_huge_page. We know it was added * before the alloc_huge_page call, otherwise - * HPageRestoreReserve would be set on the page. + * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ @@ -2854,12 +2855,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * VERY rare out of memory condition. Since * we can not delete the entry, set - * HPageRestoreReserve so that the reserve - * count will be incremented when the page + * hugetlb_restore_reserve so that the reserve - count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. */ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from @@ -2875,12 +2876,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * For private mappings, no entry indicates * a reservation is present. Since we can - * not add an entry, set SetHPageRestoreReserve - * on the page so reserve count will be + * not add an entry, set hugetlb_restore_reserve + * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else /* * No reservation present, do nothing From bdd7be075acb650cc57d8ee752b5375b966ad07e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 13 Jan 2023 16:30:57 -0600 Subject: [PATCH 439/505] mm/hugetlb: convert demote_free_huge_page to folios Change demote_free_huge_page() to demote_free_hugetlb_folio() and change demote_pool_huge_page() to pass in a folio.
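
One substitution below deserves a note: for a head page, clearing the private field can be spelled either way, since folio->private aliases the head page's private field (sketch for illustration only):

	set_page_private(subpage, 0);			/* old: clear page->private on the new head page */
	folio_change_private(inner_folio, NULL);	/* new: clear folio->private, the same storage */
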
Link: https://lkml.kernel.org/r/20230113223057.173292-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3120c3db60c4..4ecdbad9a451 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3579,12 +3579,12 @@ out: return 0; } -static int demote_free_huge_page(struct hstate *h, struct page *page) +static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - int i, nid = page_to_nid(page); + int i, nid = folio_nid(folio); struct hstate *target_hstate; - struct folio *folio = page_folio(page); struct page *subpage; + struct folio *inner_folio; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); @@ -3592,18 +3592,18 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, page); + rc = hugetlb_vmemmap_restore(h, &folio->page); if (rc) { - /* Allocation of vmemmmap failed, we can not demote page */ + /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); - set_page_refcounted(page); - add_hugetlb_folio(h, page_folio(page), false); + folio_ref_unfreeze(folio, 1); + add_hugetlb_folio(h, folio, false); return rc; } /* * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count pages. + * sizes as it will not ref count folios. */ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); @@ -3618,15 +3618,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) mutex_lock(&target_hstate->resize_lock); for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { - subpage = nth_page(page, i); - folio = page_folio(subpage); + subpage = folio_page(folio, i); + inner_folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_folio_for_demote(folio, + prep_compound_gigantic_folio_for_demote(inner_folio, target_hstate->order); else prep_compound_page(subpage, target_hstate->order); - set_page_private(subpage, 0); - prep_new_hugetlb_folio(target_hstate, folio, nid); + folio_change_private(inner_folio, NULL); + prep_new_hugetlb_folio(target_hstate, inner_folio, nid); free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); @@ -3648,7 +3648,7 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) __must_hold(&hugetlb_lock) { int nr_nodes, node; - struct page *page; + struct folio *folio; lockdep_assert_held(&hugetlb_lock); @@ -3659,11 +3659,10 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) } for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - list_for_each_entry(page, &h->hugepage_freelists[node], lru) { - if (PageHWPoison(page)) + list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { + if (folio_test_hwpoison(folio)) continue; - - return demote_free_huge_page(h, page); + return demote_free_hugetlb_folio(h, folio); } } From ea4c353df37750d170dc0dcbfa8c47c984779733 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:30 -0800 Subject: [PATCH 440/505] mm/hugetlb: convert hugetlb_install_page to folios Patch series "convert hugetlb fault functions to folios", v2. 
This series converts the hugetlb page faulting functions to operate on folios. These include hugetlb_no_page(), hugetlb_wp(), copy_hugetlb_page_range(), and hugetlb_mcopy_atomic_pte(). This patch (of 8): Change hugetlb_install_page() to hugetlb_install_folio(). This reduces one user of the Huge Page flag macros which take in a page. Link: https://lkml.kernel.org/r/20230125170537.96973-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20230125170537.96973-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4ecdbad9a451..b246f2b4d0bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4946,14 +4946,14 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) } static void -hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, - struct page *new_page) +hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, + struct folio *new_folio) { - __SetPageUptodate(new_page); - hugepage_add_new_anon_rmap(new_page, vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); + __folio_mark_uptodate(new_folio); + hugepage_add_new_anon_rmap(&new_folio->page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); - SetHPageMigratable(new_page); + folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, @@ -5107,7 +5107,7 @@ again: /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_page(dst_vma, dst_pte, addr, new); + hugetlb_install_folio(dst_vma, dst_pte, addr, page_folio(new)); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; From 91a2fb956ad993f3cbcfc632611e17e3699fb652 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:31 -0800 Subject: [PATCH 441/505] mm/hugetlb: convert hugetlbfs_pagecache_present() to folios Refactor hugetlbfs_pagecache_present() to avoid getting and dropping a refcount on a page. Use RCU and page_cache_next_miss() instead. 
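
The refcount-free check works because page_cache_next_miss() returns the starting index itself when that slot is empty; a minimal sketch of the pattern (matching the hunk below):

	bool present;

	rcu_read_lock();
	/* with max_scan == 1, the next "miss" is idx itself iff nothing is cached at idx */
	present = page_cache_next_miss(mapping, idx, 1) != idx;
	rcu_read_unlock();
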
Link: https://lkml.kernel.org/r/20230125170537.96973-3-sidhartha.kumar@oracle.com Suggested-by: Matthew Wilcox Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: kernel test robot Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b246f2b4d0bd..a0d486ed5411 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5651,17 +5651,15 @@ out_release_old: static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { - struct address_space *mapping; - pgoff_t idx; - struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, vma, address); + bool present; - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + rcu_read_lock(); + present = page_cache_next_miss(mapping, idx, 1) != idx; + rcu_read_unlock(); - page = find_get_page(mapping, idx); - if (page) - put_page(page); - return page != NULL; + return present; } int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, From ea8e72f4116a995c2aba3fb738ac372c4115375a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:32 -0800 Subject: [PATCH 442/505] mm/hugetlb: convert putback_active_hugepage to take in a folio Convert putback_active_hugepage() to folio_putback_active_hugetlb(), this removes one user of the Huge Page macros which take in a page. The callers in migrate.c are also cleaned up by being able to directly use the src and dst folio variables. Link: https://lkml.kernel.org/r/20230125170537.96973-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 8 ++++---- mm/migrate.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2375c62c61a4..067906c5778e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -175,7 +175,7 @@ int isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void putback_active_hugepage(struct page *page); +void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); @@ -429,7 +429,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void putback_active_hugepage(struct page *page) +static inline void folio_putback_active_hugetlb(struct folio *folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a0d486ed5411..fd1ce61b8f3f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7300,13 +7300,13 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void putback_active_hugepage(struct page *page) +void folio_putback_active_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); - SetHPageMigratable(page); - list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + folio_set_hugetlb_migratable(folio); + list_move_tail(&folio->lru, 
&(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); - put_page(page); + folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) diff --git a/mm/migrate.c b/mm/migrate.c index 811e76c6fac1..c09872cf41b7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -151,7 +151,7 @@ void putback_movable_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { if (unlikely(PageHuge(page))) { - putback_active_hugepage(page); + folio_putback_active_hugetlb(page_folio(page)); continue; } list_del(&page->lru); @@ -1298,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1383,7 +1383,7 @@ out_unlock: folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1395,7 +1395,7 @@ out: if (put_new_page) put_new_page(new_hpage, private); else - putback_active_hugepage(new_hpage); + folio_putback_active_hugetlb(dst); return rc; } From d0ce0e47b323a8d7fb5dc3314ce56afa650ade2d Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:33 -0800 Subject: [PATCH 443/505] mm/hugetlb: convert hugetlb fault paths to use alloc_hugetlb_folio() Change alloc_huge_page() to alloc_hugetlb_folio() by changing all callers to handle the now folio return type of the function. In this conversion, alloc_huge_page_vma() is also changed to alloc_hugetlb_folio_vma() and hugepage_add_new_anon_rmap() is changed to take in a folio directly. Many additions of '&folio->page' are cleaned up in subsequent patches. hugetlbfs_fallocate() is also refactored to use the RCU + page_cache_next_miss() API. Link: https://lkml.kernel.org/r/20230125170537.96973-5-sidhartha.kumar@oracle.com Suggested-by: Mike Kravetz Reported-by: kernel test robot Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 40 ++++---- include/linux/hugetlb.h | 8 +- include/linux/rmap.h | 2 +- mm/hugetlb.c | 201 ++++++++++++++++++++-------------------- mm/mempolicy.c | 6 +- mm/rmap.c | 6 +- 6 files changed, 133 insertions(+), 130 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 44ecdcb796cc..f89e12106e8a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -819,8 +819,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ - struct page *page; + struct folio *folio; unsigned long addr; + bool present; cond_resched(); @@ -844,48 +845,49 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - page = find_get_page(mapping, index); - if (page) { - put_page(page); + rcu_read_lock(); + present = page_cache_next_miss(mapping, index, 1) != index; + rcu_read_unlock(); + if (present) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); hugetlb_drop_vma_policy(&pseudo_vma); continue; } /* - * Allocate page without setting the avoid_reserve argument. + * Allocate folio without setting the avoid_reserve argument. 
* There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate - * pages in these areas, we need to consume the reserves + * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - page = alloc_huge_page(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); - if (IS_ERR(page)) { + if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - error = PTR_ERR(page); + error = PTR_ERR(folio); goto out; } - clear_huge_page(page, addr, pages_per_huge_page(h)); - __SetPageUptodate(page); - error = hugetlb_add_to_page_cache(page, mapping, index); + clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + error = hugetlb_add_to_page_cache(&folio->page, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, page); - put_page(page); + restore_reserve_on_error(h, &pseudo_vma, addr, &folio->page); + folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); /* - * unlock_page because locked by hugetlb_add_to_page_cache() - * put_page() due to reference from alloc_huge_page() + * folio_unlock because locked by hugetlb_add_to_page_cache() + * folio_put() due to reference from alloc_hugetlb_folio() */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 067906c5778e..6408f85e5754 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -717,11 +717,11 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -1033,7 +1033,7 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } -static inline struct page *alloc_huge_page(struct vm_area_struct *vma, +static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -1047,7 +1047,7 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return NULL; } -static inline struct page *alloc_huge_page_vma(struct hstate *h, +static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a6bd1f0a183d..a4570da03e58 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -203,7 +203,7 @@ void page_remove_rmap(struct page *, struct vm_area_struct *, void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, +void 
hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); static inline void __page_dup_rmap(struct page *page, bool compound) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fd1ce61b8f3f..ea8d4611779b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2493,7 +2493,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } /* mempolicy aware migration callback */ -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; @@ -2507,7 +2507,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return &folio->page; + return folio; } /* @@ -2798,14 +2798,14 @@ static long vma_del_reservation(struct hstate *h, /* * This routine is called to restore reservation information on error paths. - * It should ONLY be called for pages allocated via alloc_huge_page(), and - * the hugetlb mutex should remain held when calling this routine. + * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), + * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the page consumed the reservation. * HPageRestoreReserve is set in the page. * 2) No reservation was in place for the page, so HPageRestoreReserve is - * not set. However, alloc_huge_page always updates the reserve map. + * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the * global reserve count. But, free_huge_page does not have enough context @@ -2814,7 +2814,7 @@ static long vma_del_reservation(struct hstate *h, * reserve count adjustments to be made by free_huge_page. Make sure the * reserve map indicates there is a reservation present. * - * In case 2, simply undo reserve map modifications done by alloc_huge_page. + * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) @@ -2844,8 +2844,8 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, if (!rc) { /* * This indicates there is an entry in the reserve map - * not added by alloc_huge_page. We know it was added - * before the alloc_huge_page call, otherwise + * not added by alloc_hugetlb_folio. We know it was added + * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. 
@@ -3014,7 +3014,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); @@ -3023,7 +3023,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, long map_chg, map_commit; long gbl_chg; int ret, idx; - struct hugetlb_cgroup *h_cg; + struct hugetlb_cgroup *h_cg = NULL; bool deferred_reserve; idx = hstate_index(h); @@ -3130,7 +3130,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return &folio->page; + return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); @@ -4950,7 +4950,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add struct folio *new_folio) { __folio_mark_uptodate(new_folio); - hugepage_add_new_anon_rmap(&new_folio->page, vma, addr); + hugepage_add_new_anon_rmap(new_folio, vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); @@ -5080,34 +5080,34 @@ again: } else if (page_try_dup_anon_rmap(ptepage, true, src_vma)) { pte_t src_pte_old = entry; - struct page *new; + struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new = alloc_huge_page(dst_vma, addr, 1); - if (IS_ERR(new)) { + new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + if (IS_ERR(new_folio)) { put_page(ptepage); - ret = PTR_ERR(new); + ret = PTR_ERR(new_folio); break; } - copy_user_huge_page(new, ptepage, addr, dst_vma, + copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma, npages); put_page(ptepage); - /* Install the new huge page if src pte stable */ + /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - new); - put_page(new); + &new_folio->page); + folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_folio(dst_vma, dst_pte, addr, page_folio(new)); + hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; @@ -5478,7 +5478,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; struct hstate *h = hstate_vma(vma); - struct page *old_page, *new_page; + struct page *old_page; + struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; unsigned long haddr = address & huge_page_mask(h); @@ -5539,9 +5540,9 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. 
*/ spin_unlock(ptl); - new_page = alloc_huge_page(vma, haddr, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); - if (IS_ERR(new_page)) { + if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -5586,7 +5587,7 @@ retry_avoidcopy: return 0; } - ret = vmf_error(PTR_ERR(new_page)); + ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } @@ -5599,9 +5600,9 @@ retry_avoidcopy: goto out_release_all; } - copy_user_huge_page(new_page, old_page, address, vma, + copy_user_huge_page(&new_folio->page, old_page, address, vma, pages_per_huge_page(h)); - __SetPageUptodate(new_page); + __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); @@ -5618,12 +5619,12 @@ retry_avoidcopy: huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(old_page, vma, true); - hugepage_add_new_anon_rmap(new_page, vma, haddr); + hugepage_add_new_anon_rmap(new_folio, vma, haddr); set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, !unshare)); - SetHPageMigratable(new_page); + make_huge_pte(vma, &new_folio->page, !unshare)); + folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ - new_page = old_page; + new_folio = page_folio(old_page); } spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); @@ -5632,9 +5633,9 @@ out_release_all: * No restore in case of successful pagetable update (Break COW or * unshare) */ - if (new_page != old_page) - restore_reserve_on_error(h, vma, haddr, new_page); - put_page(new_page); + if (new_folio != page_folio(old_page)) + restore_reserve_on_error(h, vma, haddr, &new_folio->page); + folio_put(new_folio); out_release_old: put_page(old_page); @@ -5753,11 +5754,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; - struct page *page; + struct folio *folio; pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); - bool new_page, new_pagecache_page = false; + bool new_folio, new_pagecache_folio = false; u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* @@ -5776,9 +5777,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - new_page = false; - page = find_lock_page(mapping, idx); - if (!page) { + new_folio = false; + folio = filemap_lock_folio(mapping, idx); + if (!folio) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; @@ -5811,8 +5812,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, VM_UFFD_MISSING); } - page = alloc_huge_page(vma, haddr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(vma, haddr, 0); + if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two @@ -5826,17 +5827,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * sure there really is no pte entry. 
*/ if (hugetlb_pte_stable(h, mm, ptep, old_pte)) - ret = vmf_error(PTR_ERR(page)); + ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(page, address, pages_per_huge_page(h)); - __SetPageUptodate(page); - new_page = true; + clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -5845,13 +5846,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, page); - put_page(page); + restore_reserve_on_error(h, vma, haddr, &folio->page); + folio_put(folio); goto out; } - new_pagecache_page = true; + new_pagecache_folio = true; } else { - lock_page(page); + folio_lock(folio); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -5864,7 +5865,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ - if (unlikely(PageHWPoison(page))) { + if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; @@ -5872,8 +5873,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { ret = 0; @@ -5907,10 +5908,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto backout; if (anon_rmap) - hugepage_add_new_anon_rmap(page, vma, haddr); + hugepage_add_new_anon_rmap(folio, vma, haddr); else - page_dup_file_rmap(page, true); - new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + page_dup_file_rmap(&folio->page, true); + new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); /* * If this pte was previously wr-protected, keep it wr-protected even @@ -5923,20 +5924,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl); } spin_unlock(ptl); /* - * Only set HPageMigratable in newly allocated pages. Existing pages - * found in the pagecache may not have HPageMigratableset if they have + * Only set hugetlb_migratable in newly allocated pages. Existing pages + * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. 
*/ - if (new_page) - SetHPageMigratable(page); + if (new_folio) + folio_set_hugetlb_migratable(folio); - unlock_page(page); + folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -5945,11 +5946,11 @@ out: backout: spin_unlock(ptl); backout_unlocked: - if (new_page && !new_pagecache_page) - restore_reserve_on_error(h, vma, haddr, page); + if (new_folio && !new_pagecache_folio) + restore_reserve_on_error(h, vma, haddr, &folio->page); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } @@ -6173,16 +6174,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; - struct page *page; + struct folio *folio; int writable; - bool page_in_pagecache = false; + bool folio_in_pagecache = false; if (is_continue) { ret = -EFAULT; - page = find_lock_page(mapping, idx); - if (!page) + folio = filemap_lock_folio(mapping, idx); + if (!folio) goto out; - page_in_pagecache = true; + folio_in_pagecache = true; } else if (!*pagep) { /* If a page already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. @@ -6193,34 +6194,34 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } - ret = copy_huge_page_from_user(page, + ret = copy_huge_page_from_user(&folio->page, (const void __user *) src_addr, pages_per_huge_page(h), false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; - /* Free the allocated page which may have + /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); - /* Allocate a temporary page to hold the copied + /* Allocate a temporary folio to hold the copied * contents. */ - page = alloc_huge_page_vma(h, dst_vma, dst_addr); - if (!page) { + folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); + if (!folio) { ret = -ENOMEM; goto out; } - *pagep = page; + *pagep = &folio->page; /* Set the outparam pagep and return to the caller to * copy the contents outside the lock. Don't free the * page. @@ -6236,25 +6237,25 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { put_page(*pagep); ret = -ENOMEM; *pagep = NULL; goto out; } - copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma, pages_per_huge_page(h)); put_page(*pagep); *pagep = NULL; } /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { @@ -6269,16 +6270,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. 
*/ - ret = hugetlb_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx); if (ret) goto out_release_nounlock; - page_in_pagecache = true; + folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) goto out_release_unlock; /* @@ -6290,10 +6291,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) - page_dup_file_rmap(page, true); + if (folio_in_pagecache) + page_dup_file_rmap(&folio->page, true); else - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY @@ -6304,7 +6305,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, else writable = dst_vma->vm_flags & VM_WRITE; - _dst_pte = make_huge_pte(dst_vma, page, writable); + _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -6326,20 +6327,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spin_unlock(ptl); if (!is_continue) - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); out_release_nounlock: - if (!page_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + if (!folio_in_pagecache) + restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ @@ -6871,7 +6872,7 @@ bool hugetlb_reserve_pages(struct inode *inode, /* * pages in this range were added to the reserve * map between region_chg and region_add. This - * indicates a race with alloc_huge_page. Adjust + * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. 
*/ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc034b070645..7686f40c9750 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1210,9 +1210,11 @@ static struct page *new_page(struct page *page, unsigned long start) break; } - if (folio_test_hugetlb(src)) - return alloc_huge_page_vma(page_hstate(&src->page), + if (folio_test_hugetlb(src)) { + dst = alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); + return &dst->page; + } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; diff --git a/mm/rmap.c b/mm/rmap.c index 86fccc2b9fc9..8287f2cc327d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2534,15 +2534,13 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, !!(flags & RMAP_EXCLUSIVE)); } -void hugepage_add_new_anon_rmap(struct page *page, +void hugepage_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { - struct folio *folio = page_folio(page); - BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); - __page_set_anon_rmap(folio, page, vma, address, 1); + __page_set_anon_rmap(folio, &folio->page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ From d2d7bb44bfbd29200426ba17741550d36e081f91 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:34 -0800 Subject: [PATCH 444/505] mm/hugetlb: convert restore_reserve_on_error to take in a folio Every caller of restore_reserve_on_error() is now passing in &folio->page, change the function to take in a folio directly and clean up the call sites. Link: https://lkml.kernel.org/r/20230125170537.96973-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 21 ++++++++++----------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f89e12106e8a..c736947e73da 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -873,7 +873,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(&folio->page, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, &folio->page); + restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6408f85e5754..20ceaaea1697 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -726,7 +726,7 @@ struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *v int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page); + unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea8d4611779b..1f6270c586c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2802,9 +2802,9 @@ static long vma_del_reservation(struct hstate *h, * and the hugetlb mutex should remain held when calling this routine. 
* * It handles two specific cases: - * 1) A reservation was in place and the page consumed the reservation. - * HPageRestoreReserve is set in the page. - * 2) No reservation was in place for the page, so HPageRestoreReserve is + * 1) A reservation was in place and the folio consumed the reservation. + * hugetlb_restore_reserve is set in the folio. + * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the @@ -2817,9 +2817,8 @@ static long vma_del_reservation(struct hstate *h, * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page) + unsigned long address, struct folio *folio) { - struct folio *folio = page_folio(page); long rc = vma_needs_reservation(h, vma, address); if (folio_test_hugetlb_restore_reserve(folio)) { @@ -5102,7 +5101,7 @@ again: entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - &new_folio->page); + new_folio); folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; @@ -5634,7 +5633,7 @@ out_release_all: * unshare) */ if (new_folio != page_folio(old_page)) - restore_reserve_on_error(h, vma, haddr, &new_folio->page); + restore_reserve_on_error(h, vma, haddr, new_folio); folio_put(new_folio); out_release_old: put_page(old_page); @@ -5846,7 +5845,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, &folio->page); + restore_reserve_on_error(h, vma, haddr, folio); folio_put(folio); goto out; } @@ -5947,7 +5946,7 @@ backout: spin_unlock(ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) - restore_reserve_on_error(h, vma, haddr, &folio->page); + restore_reserve_on_error(h, vma, haddr, folio); folio_unlock(folio); folio_put(folio); @@ -6210,7 +6209,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); /* Allocate a temporary folio to hold the copied @@ -6339,7 +6338,7 @@ out_release_unlock: folio_unlock(folio); out_release_nounlock: if (!folio_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page); + restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } From 9b91c0e277a3dbb165c2e4301be7a231dc2f76f7 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:35 -0800 Subject: [PATCH 445/505] mm/hugetlb: convert hugetlb_add_to_page_cache to take in a folio Every caller of hugetlb_add_to_page_cache() is now passing in &folio->page, change the function to take in a folio directly and clean up the call sites. 
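Together with the restore_reserve_on_error() conversion in the previous patch, a call site can now stay in folio terms from allocation through the error path. Roughly, the fallocate/no_page pattern becomes (simplified sketch; locking and the hugetlb fault mutex handling are omitted)::

	folio = alloc_hugetlb_folio(vma, addr, 0);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	clear_huge_page(&folio->page, addr, pages_per_huge_page(h));
	__folio_mark_uptodate(folio);

	error = hugetlb_add_to_page_cache(folio, mapping, idx);
	if (error) {
		/* also undoes the reserve map update made at allocation time */
		restore_reserve_on_error(h, vma, addr, folio);
		folio_put(folio);
		return error;
	}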
Link: https://lkml.kernel.org/r/20230125170537.96973-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c736947e73da..cfd09f95551b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -871,7 +871,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); __folio_mark_uptodate(folio); - error = hugetlb_add_to_page_cache(&folio->page, mapping, index); + error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 20ceaaea1697..df6dd624ccfe 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -723,7 +723,7 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1f6270c586c0..de1f73e5e200 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5662,10 +5662,9 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return present; } -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; @@ -5677,7 +5676,7 @@ int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, __folio_clear_locked(folio); return err; } - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file @@ -5836,7 +5835,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -6269,7 +6268,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx); + ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; folio_in_pagecache = true; From 371607a3c793d7183b0faecc1fb4aa88fadcf202 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:36 -0800 Subject: [PATCH 446/505] mm/hugetlb: convert hugetlb_wp() to take in a folio Change the pagecache_page argument of hugetlb_wp to pagecache_folio. Replaces a call to find_lock_page() with filemap_lock_folio(). 
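hugetlb_fault() then compares folios, not pages, when deciding whether an extra lock is needed before calling hugetlb_wp(). A sketch of the converted sequence (assuming the filemap_lock_folio() used by this series, which returns NULL when nothing is in the page cache)::

	pagecache_folio = filemap_lock_folio(mapping, idx);
	...
	page = pte_page(entry);
	if (page_folio(page) != pagecache_folio)
		if (!trylock_page(page)) {
			need_wait_lock = 1;
			goto out_ptl;
		}

	if (!huge_pte_write(entry))
		ret = hugetlb_wp(mm, vma, address, ptep, flags,
				 pagecache_folio, ptl);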
Link: https://lkml.kernel.org/r/20230125170537.96973-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reported-by: gerald.schaefer@linux.ibm.com Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de1f73e5e200..3a01a9dbf445 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5472,7 +5472,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags, - struct page *pagecache_page, spinlock_t *ptl) + struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; @@ -5529,7 +5529,7 @@ retry_avoidcopy: * of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && - old_page != pagecache_page) + page_folio(old_page) != pagecache_folio) outside_reserve = 1; get_page(old_page); @@ -5922,7 +5922,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); } spin_unlock(ptl); @@ -5985,7 +5985,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, u32 hash; pgoff_t idx; struct page *page = NULL; - struct page *pagecache_page = NULL; + struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; @@ -6067,7 +6067,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_page = find_lock_page(mapping, idx); + pagecache_folio = filemap_lock_folio(mapping, idx); } ptl = huge_pte_lock(h, mm, ptep); @@ -6087,9 +6087,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, }; spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -6098,11 +6098,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * hugetlb_wp() requires page locks of pte_page(entry) and - * pagecache_page, so here we need take the former one - * when page != pagecache_page or !pagecache_page. + * pagecache_folio, so here we need take the former one + * when page != pagecache_folio or !pagecache_folio. 
*/ page = pte_page(entry); - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) if (!trylock_page(page)) { need_wait_lock = 1; goto out_ptl; @@ -6113,7 +6113,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_page, ptl); + pagecache_folio, ptl); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { entry = huge_pte_mkdirty(entry); @@ -6124,15 +6124,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, haddr, ptep); out_put_page: - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) unlock_page(page); put_page(page); out_ptl: spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } out_mutex: hugetlb_vma_unlock_read(vma); From 192a50220342f82826812d0032a82fe441e924e2 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 25 Jan 2023 09:05:37 -0800 Subject: [PATCH 447/505] Documentation/mm: update hugetlbfs documentation to mention alloc_hugetlb_folio Link: https://lkml.kernel.org/r/20230125170537.96973-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Gerald Schaefer Cc: John Hubbard Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- Documentation/mm/hugetlbfs_reserv.rst | 21 ++++++++++--------- .../zh_CN/mm/hugetlbfs_reserv.rst | 14 ++++++------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst index f143954e0d05..611728c49bff 100644 --- a/Documentation/mm/hugetlbfs_reserv.rst +++ b/Documentation/mm/hugetlbfs_reserv.rst @@ -181,14 +181,14 @@ Consuming Reservations/Allocating a Huge Page Reservations are consumed when huge pages associated with the reservations are allocated and instantiated in the corresponding mapping. The allocation -is performed within the routine alloc_huge_page():: +is performed within the routine alloc_hugetlb_folio():: - struct page *alloc_huge_page(struct vm_area_struct *vma, + struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -alloc_huge_page is passed a VMA pointer and a virtual address, so it can +alloc_hugetlb_folio is passed a VMA pointer and a virtual address, so it can consult the reservation map to determine if a reservation exists. In addition, -alloc_huge_page takes the argument avoid_reserve which indicates reserves +alloc_hugetlb_folio takes the argument avoid_reserve which indicates reserves should not be used even if it appears they have been set aside for the specified address. The avoid_reserve argument is most often used in the case of Copy on Write and Page Migration where additional copies of an existing @@ -208,7 +208,8 @@ a reservation for the allocation. After determining whether a reservation exists and can be used for the allocation, the routine dequeue_huge_page_vma() is called. This routine takes two arguments related to reservations: -- avoid_reserve, this is the same value/argument passed to alloc_huge_page() +- avoid_reserve, this is the same value/argument passed to + alloc_hugetlb_folio(). - chg, even though this argument is of type long only the values 0 or 1 are passed to dequeue_huge_page_vma. 
If the value is 0, it indicates a reservation exists (see the section "Memory Policy and Reservations" for @@ -233,9 +234,9 @@ the scope reservations. Even if a surplus page is allocated, the same reservation based adjustments as above will be made: SetPagePrivate(page) and resv_huge_pages--. -After obtaining a new huge page, (page)->private is set to the value of -the subpool associated with the page if it exists. This will be used for -subpool accounting when the page is freed. +After obtaining a new hugetlb folio, (folio)->_hugetlb_subpool is set to the +value of the subpool associated with the page if it exists. This will be used +for subpool accounting when the folio is freed. The routine vma_commit_reservation() is then called to adjust the reserve map based on the consumption of the reservation. In general, this involves @@ -246,8 +247,8 @@ was no reservation in a shared mapping or this was a private mapping a new entry must be created. It is possible that the reserve map could have been changed between the call -to vma_needs_reservation() at the beginning of alloc_huge_page() and the -call to vma_commit_reservation() after the page was allocated. This would +to vma_needs_reservation() at the beginning of alloc_hugetlb_folio() and the +call to vma_commit_reservation() after the folio was allocated. This would be possible if hugetlb_reserve_pages was called for the same page in a shared mapping. In such cases, the reservation count and subpool free page count will be off by one. This rare condition can be identified by comparing the diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst index 752e5696cd47..826a50c47389 100644 --- a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -142,14 +142,14 @@ HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 消耗预留/分配一个巨页 =========================== -当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() +当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_hugetlb_folio() 中进行的:: - struct page *alloc_huge_page(struct vm_area_struct *vma, + struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 -此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 +alloc_hugetlb_folio被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 +此外,alloc_hugetlb_folio需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额 外拷贝被分配。 @@ -162,7 +162,7 @@ vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留 确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关 的参数: -- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。 +- avoid_reserve,这是传递给alloc_hugetlb_folio()的同一个值/参数。 - chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0, 则表明存在预留(关于可能的问题,请参见 “预留和内存策略” 一节)。如果值 为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。 @@ -179,7 +179,7 @@ free_huge_pages的值被递减。如果有一个与该页相关的预留,将 的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整: SetPagePrivate(page) 和 resv_huge_pages--. 
-在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 +在获得一个新的巨页后,(folio)->_hugetlb_subpool被设置为与该页面相关的子池的值,如果它存在的话。当页 面被释放时,这将被用于子池的计数。 然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 @@ -199,7 +199,7 @@ SetPagePrivate(page)和resv_huge_pages-。 已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建 一个新的条目。 -在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用 +在alloc_hugetlb_folio()开始调用vma_needs_reservation()和页面分配后调用 vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共 享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。 这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来 From fa4e3f5ffa5e6e22f751d289c9afa502dda30b8d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:28 -0800 Subject: [PATCH 448/505] mm: add folio_estimated_sharers() Patch series "Convert various mempolicy.c functions to use folios", v4. This patch series converts migrate_page_add() and queue_pages_required() to migrate_folio_add() and queue_page_required(). It also converts the callers of the functions to use folios as well, and introduces a helper function to estimate the number of sharers of a folio. This patch (of 6): folio_estimated_sharers() takes in a folio and returns the precise number of times the first subpage of the folio is mapped. This function aims to provide an estimate for the number of sharers of a folio. This is necessary for folio conversions where we care about the number of processes that share a folio, but don't necessarily want to check every single page within that folio. This is in contrast to folio_mapcount() which calculates the total number of the times a folio and all its subpages are mapped. Link: https://lkml.kernel.org/r/20230130201833.27042-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230130201833.27042-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Yin Fengwei Acked-by: David Hildenbrand Cc: Jane Chu Signed-off-by: Andrew Morton --- include/linux/mm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9454b7eb055b..89c118ad4a44 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1916,6 +1916,24 @@ static inline size_t folio_size(struct folio *folio) return PAGE_SIZE << folio_order(folio); } +/** + * folio_estimated_sharers - Estimate the number of sharers of a folio. + * @folio: The folio. + * + * folio_estimated_sharers() aims to serve as a function to efficiently + * estimate the number of processes sharing a folio. This is done by + * looking at the precise mapcount of the first subpage in the folio, and + * assuming the other subpages are the same. This may not be true for large + * folios. If you want exact mapcounts for exact calculations, look at + * page_mapcount() or folio_total_mapcount(). + * + * Return: The estimated number of processes sharing a folio. + */ +static inline int folio_estimated_sharers(struct folio *folio) +{ + return page_mapcount(folio_page(folio, 0)); +} + #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE static inline int arch_make_page_accessible(struct page *page) { From de1f5055523e9a035b38533f25a56df03d45034a Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:29 -0800 Subject: [PATCH 449/505] mm/mempolicy: convert queue_pages_pmd() to queue_folios_pmd() The function now operates on a folio instead of the page associated with a pmd. 
This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). Link: https://lkml.kernel.org/r/20230130201833.27042-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7686f40c9750..fc754dbcbbcd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -442,21 +442,21 @@ static inline bool queue_pages_required(struct page *page, } /* - * queue_pages_pmd() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pmd() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. huge zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an - * existing page was already on a node that does not follow the + * existing folio was already on a node that does not follow the * policy. */ -static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, +static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) __releases(ptl) { int ret = 0; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags; @@ -464,19 +464,19 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = -EIO; goto unlock; } - page = pmd_page(*pmd); - if (is_huge_zero_page(page)) { + folio = pfn_folio(pmd_pfn(*pmd)); + if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; goto unlock; } - if (!queue_pages_required(page, qp)) + if (!queue_pages_required(&folio->page, qp)) goto unlock; flags = qp->flags; - /* go to thp migration */ + /* go to folio migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || - migrate_page_add(page, qp->pagelist, flags)) { + migrate_page_add(&folio->page, qp->pagelist, flags)) { ret = 1; goto unlock; } @@ -512,7 +512,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) - return queue_pages_pmd(pmd, ptl, addr, end, walk); + return queue_folios_pmd(pmd, ptl, addr, end, walk); if (pmd_trans_unstable(pmd)) return 0; From 3dae02bbd07f40e37bbfec2d77119628db461eaa Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:30 -0800 Subject: [PATCH 450/505] mm/mempolicy: convert queue_pages_pte_range() to queue_folios_pte_range() This function now operates on folios associated with ptes instead of pages. This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). 
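Once the later patches in this series also rename queue_pages_required() and migrate_page_add(), the heart of the pte walk reads roughly as follows (sketch; the MPOL_MF_STRICT/unmovable bookkeeping is left out)::

	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		folio = vm_normal_folio(vma, addr, *pte);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/* zero pages are filtered out, reserved folios may remain */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_folio_add(folio, qp->pagelist, flags);
	}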
Link: https://lkml.kernel.org/r/20230130201833.27042-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc754dbcbbcd..b0805bb87655 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -491,19 +491,19 @@ unlock: * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. * - * queue_pages_pte_range() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pte_range() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. - * -EIO - only MPOL_MF_STRICT was specified and an existing page was already + * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already * on a node that does not follow the policy. */ -static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, +static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; bool has_unmovable = false; @@ -521,16 +521,16 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, *pte); + if (!folio || folio_is_zone_device(folio)) continue; /* - * vm_normal_page() filters out zero pages, but there might - * still be PageReserved pages to skip, perhaps in a VDSO. + * vm_normal_folio() filters out zero pages, but there might + * still be reserved folios to skip, perhaps in a VDSO. */ - if (PageReserved(page)) + if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(page, qp)) + if (!queue_pages_required(&folio->page, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ @@ -544,7 +544,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ - if (migrate_page_add(page, qp->pagelist, flags)) + if (migrate_page_add(&folio->page, qp->pagelist, flags)) has_unmovable = true; } else break; @@ -704,7 +704,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_pages_hugetlb, - .pmd_entry = queue_pages_pte_range, + .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, }; From 0a2c1e8183163a31fe8c9838f3108aacf9c05c4a Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:31 -0800 Subject: [PATCH 451/505] mm/mempolicy: convert queue_pages_hugetlb() to queue_folios_hugetlb() This change is in preparation for the conversion of queue_pages_required() to queue_folio_required() and migrate_page_add() to migrate_folio_add(). 
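This is also where folio_estimated_sharers(), added earlier in the series, replaces page_mapcount(): it only reads the mapcount of the first subpage, so the shared-mapping check stays O(1) for an arbitrarily large hugetlb folio. A caller can gate migration on it along these lines (illustrative sketch; worth_migrating() is a made-up helper)::

	static bool worth_migrating(struct folio *folio, unsigned long flags)
	{
		if (flags & MPOL_MF_MOVE_ALL)
			return true;
		/* cheap estimate: only the first subpage's mapcount is read */
		return folio_estimated_sharers(folio) == 1;
	}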
Link: https://lkml.kernel.org/r/20230130201833.27042-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b0805bb87655..668392493500 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -558,7 +558,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, return addr != end ? -EIO : 0; } -static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, +static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -566,7 +566,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = (qp->flags & MPOL_MF_VALID); - struct page *page; + struct folio *folio; spinlock_t *ptl; pte_t entry; @@ -574,13 +574,13 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, entry = huge_ptep_get(pte); if (!pte_present(entry)) goto unlock; - page = pte_page(entry); - if (!queue_pages_required(page, qp)) + folio = pfn_folio(pte_pfn(entry)); + if (!queue_pages_required(&folio->page, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { /* - * STRICT alone means only detecting misplaced page and no + * STRICT alone means only detecting misplaced folio and no * need to further check other vma. */ ret = -EIO; @@ -591,21 +591,28 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* * Must be STRICT with MOVE*, otherwise .test_walk() have * stopped walking current vma. - * Detecting misplaced page but allow migrating pages which + * Detecting misplaced folio but allow migrating folios which * have been queued. */ ret = 1; goto unlock; } - /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ + /* + * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it + * is shared it is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. + */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page_folio(page), qp->pagelist) && + if (isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* - * Failed to isolate page but allow migrating pages + * Failed to isolate folio but allow migrating pages * which have been queued. */ ret = 1; @@ -703,7 +710,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, } static const struct mm_walk_ops queue_pages_walk_ops = { - .hugetlb_entry = queue_pages_hugetlb, + .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, }; From d451b89dcd183da725eda84dfb8a46c0b32a4234 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:32 -0800 Subject: [PATCH 452/505] mm/mempolicy: convert queue_pages_required() to queue_folio_required() Replace queue_pages_required() with queue_folio_required(). queue_folio_required() does the same as queue_pages_required(), except takes in a folio instead of a page. 
Link: https://lkml.kernel.org/r/20230130201833.27042-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Jane Chu Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mempolicy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 668392493500..6a68dbce3b70 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -427,15 +427,15 @@ struct queue_pages { }; /* - * Check if the page's nid is in qp->nmask. + * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ -static inline bool queue_pages_required(struct page *page, +static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); @@ -469,7 +469,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, walk->action = ACTION_CONTINUE; goto unlock; } - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) goto unlock; flags = qp->flags; @@ -530,7 +530,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, */ if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ @@ -575,7 +575,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, if (!pte_present(entry)) goto unlock; folio = pfn_folio(pte_pfn(entry)); - if (!queue_pages_required(&folio->page, qp)) + if (!queue_folio_required(folio, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { From 4a64981dfee9119aa2c1f243b48f34cbbd67779c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 12:18:33 -0800 Subject: [PATCH 453/505] mm/mempolicy: convert migrate_page_add() to migrate_folio_add() Replace migrate_page_add() with migrate_folio_add(). migrate_folio_add() does the same a migrate_page_add() but takes in a folio instead of a page. This removes a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20230130201833.27042-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Yin Fengwei Cc: David Hildenbrand Cc: Jane Chu Signed-off-by: Andrew Morton --- mm/mempolicy.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6a68dbce3b70..0919c7a719d4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -414,7 +414,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { }, }; -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); struct queue_pages { @@ -476,7 +476,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, /* go to folio migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || - migrate_page_add(&folio->page, qp->pagelist, flags)) { + migrate_folio_add(folio, qp->pagelist, flags)) { ret = 1; goto unlock; } @@ -544,7 +544,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, * temporary off LRU pages in the range. Still * need migrate other LRU pages. 
*/ - if (migrate_page_add(&folio->page, qp->pagelist, flags)) + if (migrate_folio_add(folio, qp->pagelist, flags)) has_unmovable = true; } else break; @@ -1021,27 +1021,28 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } #ifdef CONFIG_MIGRATION -/* - * page migration, thp tail pages can be passed. - */ -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - struct page *head = compound_head(page); /* - * Avoid migrating a page that is shared with others. + * We try to migrate only unshared folios. If it is shared it + * is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_lru(head), - thp_nr_pages(head)); + if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if (!folio_isolate_lru(folio)) { + list_add_tail(&folio->lru, foliolist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); } else if (flags & MPOL_MF_STRICT) { /* - * Non-movable page may reach here. And, there may be - * temporary off LRU pages or non-LRU movable pages. - * Treat them as unmovable pages since they can't be + * Non-movable folio may reach here. And, there may be + * temporary off LRU folios or non-LRU movable folios. + * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. It * should return -EIO for this case too. */ @@ -1235,7 +1236,7 @@ static struct page *new_page(struct page *page, unsigned long start) } #else -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return -EIO; From 3c1ea2c729ef8ef07bcb80d01ab2ead45b3406dd Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:49 -0800 Subject: [PATCH 454/505] mm: add folio_get_nontail_page() Patch series "Convert a couple migrate functions to use folios", v2. This patchset introduces folio_movable_ops() and converts 3 functions in mm/migrate.c to use folios. It also introduces folio_get_nontail_page() for folio conversions which may want to distinguish between head and tail pages. This patch (of 4): folio_get_nontail_page() returns the folio associated with a head page. This is necessary for folio conversions where the behavior of that function differs between head pages and tail pages. 
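Later patches in this series use it at the top of isolate_movable_page(), where head and tail pages must not be treated alike. The resulting pattern is roughly (sketch)::

	struct folio *folio = folio_get_nontail_page(page);

	if (!folio)	/* tail page, or no reference could be taken */
		return -EBUSY;

	/* ... operate on the folio: folio_trylock(), folio_test_movable(), ... */

	folio_put(folio);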
Link: https://lkml.kernel.org/r/20230130214352.40538-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230130214352.40538-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 89c118ad4a44..2992a2d55aee 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -892,6 +892,13 @@ static inline bool get_page_unless_zero(struct page *page) return page_ref_add_unless(page, 1, 0); } +static inline struct folio *folio_get_nontail_page(struct page *page) +{ + if (unlikely(!get_page_unless_zero(page))) + return NULL; + return (struct folio *)page; +} + extern int page_is_ram(unsigned long pfn); enum { From da707a6d184a8a6ef0b756c3ba49888fec223793 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:50 -0800 Subject: [PATCH 455/505] mm/migrate: add folio_movable_ops() folio_movable_ops() does the same as page_movable_ops() except uses folios instead of pages. This function will help make folio conversions in migrate.c more readable. Link: https://lkml.kernel.org/r/20230130214352.40538-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/migrate.h | 9 +++++++++ mm/migrate.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3ef77f52a4f0..bdff950a8bb4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -122,6 +122,15 @@ static inline bool folio_test_movable(struct folio *folio) return PageMovable(&folio->page); } +static inline +const struct movable_operations *folio_movable_ops(struct folio *folio) +{ + VM_BUG_ON(!__folio_test_movable(folio)); + + return (const struct movable_operations *) + ((unsigned long)folio->mapping - PAGE_MAPPING_MOVABLE); +} + static inline const struct movable_operations *page_movable_ops(struct page *page) { diff --git a/mm/migrate.c b/mm/migrate.c index c09872cf41b7..d2b1167329b9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -990,7 +990,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, goto out; } - mops = page_movable_ops(&src->page); + mops = folio_movable_ops(src); rc = mops->migrate_page(&dst->page, &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && !folio_test_isolated(src)); From 19979497c02a365ed9d8276b5f4cc36557a13ced Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:51 -0800 Subject: [PATCH 456/505] mm/migrate: convert isolate_movable_page() to use folios Removes 6 calls to compound_head() and prepares the function to take in a folio instead of page argument. 
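With folio_movable_ops() from the previous patch, the isolation step can dispatch through the folio directly; the converted sequence looks roughly like (sketch)::

	mops = folio_movable_ops(folio);
	VM_BUG_ON_FOLIO(!mops, folio);

	if (!mops->isolate_page(&folio->page, mode))
		goto out_no_isolated;

	/* drivers must not use PG_isolated themselves */
	WARN_ON_ONCE(folio_test_isolated(folio));
	folio_set_isolated(folio);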
Link: https://lkml.kernel.org/r/20230130214352.40538-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d2b1167329b9..3cdb76e44ef5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -60,6 +60,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) { + struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; /* @@ -71,11 +72,11 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * the put_page() at the end of this block will take care of * release this page, thus avoiding a nasty leakage. */ - if (unlikely(!get_page_unless_zero(page))) + if (!folio) goto out; - if (unlikely(PageSlab(page))) - goto out_putpage; + if (unlikely(folio_test_slab(folio))) + goto out_putfolio; /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ smp_rmb(); /* @@ -83,12 +84,12 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * we use non-atomic bitops on newly allocated page flags so * unconditionally grabbing the lock ruins page's owner side. */ - if (unlikely(!__PageMovable(page))) - goto out_putpage; + if (unlikely(!__folio_test_movable(folio))) + goto out_putfolio; /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ smp_rmb(); - if (unlikely(PageSlab(page))) - goto out_putpage; + if (unlikely(folio_test_slab(folio))) + goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent @@ -101,29 +102,29 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * lets be sure we have the page lock * before proceeding with the movable page isolation steps. */ - if (unlikely(!trylock_page(page))) - goto out_putpage; + if (unlikely(!folio_trylock(folio))) + goto out_putfolio; - if (!PageMovable(page) || PageIsolated(page)) + if (!folio_test_movable(folio) || folio_test_isolated(folio)) goto out_no_isolated; - mops = page_movable_ops(page); - VM_BUG_ON_PAGE(!mops, page); + mops = folio_movable_ops(folio); + VM_BUG_ON_FOLIO(!mops, folio); - if (!mops->isolate_page(page, mode)) + if (!mops->isolate_page(&folio->page, mode)) goto out_no_isolated; /* Driver shouldn't use PG_isolated bit of page->flags */ - WARN_ON_ONCE(PageIsolated(page)); - SetPageIsolated(page); - unlock_page(page); + WARN_ON_ONCE(folio_test_isolated(folio)); + folio_set_isolated(folio); + folio_unlock(folio); return 0; out_no_isolated: - unlock_page(page); -out_putpage: - put_page(page); + folio_unlock(folio); +out_putfolio: + folio_put(folio); out: return -EBUSY; } From 280d724ac20f9cc463d4ab8e2269f598476b070f Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 30 Jan 2023 13:43:52 -0800 Subject: [PATCH 457/505] mm/migrate: convert putback_movable_pages() to use folios Removes 6 calls to compound_head(), and replaces putback_movable_page() with putback_movable_folio() as well. 
Link: https://lkml.kernel.org/r/20230130214352.40538-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 3cdb76e44ef5..5b40b9040ba6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -129,12 +129,12 @@ out: return -EBUSY; } -static void putback_movable_page(struct page *page) +static void putback_movable_folio(struct folio *folio) { - const struct movable_operations *mops = page_movable_ops(page); + const struct movable_operations *mops = folio_movable_ops(folio); - mops->putback_page(page); - ClearPageIsolated(page); + mops->putback_page(&folio->page); + folio_clear_isolated(folio); } /* @@ -147,33 +147,33 @@ static void putback_movable_page(struct page *page) */ void putback_movable_pages(struct list_head *l) { - struct page *page; - struct page *page2; + struct folio *folio; + struct folio *folio2; - list_for_each_entry_safe(page, page2, l, lru) { - if (unlikely(PageHuge(page))) { - folio_putback_active_hugetlb(page_folio(page)); + list_for_each_entry_safe(folio, folio2, l, lru) { + if (unlikely(folio_test_hugetlb(folio))) { + folio_putback_active_hugetlb(folio); continue; } - list_del(&page->lru); + list_del(&folio->lru); /* - * We isolated non-lru movable page so here we can use - * __PageMovable because LRU page's mapping cannot have + * We isolated non-lru movable folio so here we can use + * __PageMovable because LRU folio's mapping cannot have * PAGE_MAPPING_MOVABLE. */ - if (unlikely(__PageMovable(page))) { - VM_BUG_ON_PAGE(!PageIsolated(page), page); - lock_page(page); - if (PageMovable(page)) - putback_movable_page(page); + if (unlikely(__folio_test_movable(folio))) { + VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); + folio_lock(folio); + if (folio_test_movable(folio)) + putback_movable_folio(folio); else - ClearPageIsolated(page); - unlock_page(page); - put_page(page); + folio_clear_isolated(folio); + folio_unlock(folio); + folio_put(folio); } else { - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_lru(page), -thp_nr_pages(page)); - putback_lru_page(page); + node_stat_mod_folio(folio, NR_ISOLATED_ANON + + folio_is_file_lru(folio), -folio_nr_pages(folio)); + folio_putback_lru(folio); } } } From 5445fcbc4cda770cd07f49624704fabcc284d563 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Feb 2023 19:20:07 +0000 Subject: [PATCH 458/505] Docs/admin-guide/mm/damon/usage: add DAMON debugfs interface deprecation notice Patch series "mm/damon: deprecate DAMON debugfs interface". DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And v6.1.y has been announced to be an LTS[1]. Though the announcement was there for a while, some people might not have noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, keep the code and documents with warning messages and contacts to ask helps for the deprecation. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c This patch (of 3): DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And, v6.1.y has announced to be an LTS[1]. Though the announcement was there for a while, some people might not noticed that so far. 
Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, note DAMON debugfs interface as deprecated, and contacts to ask helps on the document. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c Link: https://lkml.kernel.org/r/20230209192009.7885-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230209192009.7885-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 9237d6a25897..9b823fec974d 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -25,10 +25,12 @@ DAMON provides below interfaces for different users. interface provides only simple :ref:`statistics ` for the monitoring results. For detailed monitoring results, DAMON provides a :ref:`tracepoint `. -- *debugfs interface.* +- *debugfs interface. (DEPRECATED!)* :ref:`This ` is almost identical to :ref:`sysfs interface - `. This will be removed after next LTS kernel is released, - so users should move to the :ref:`sysfs interface `. + `. This is deprecated, so users should move to the + :ref:`sysfs interface `. If you depend on this and cannot + move, please report your usecase to damon@lists.linux.dev and + linux-mm@kvack.org. - *Kernel Space Programming Interface.* :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by @@ -487,13 +489,17 @@ the files as above. Above is only for an example. .. _debugfs_interface: -debugfs Interface -================= +debugfs Interface (DEPRECATED!) +=============================== .. note:: - DAMON debugfs interface will be removed after next LTS kernel is released, so - users should move to the :ref:`sysfs interface `. + THIS IS DEPRECATED! + + DAMON debugfs interface is deprecated, so users should move to the + :ref:`sysfs interface `. If you depend on this and cannot + move, please report your usecase to damon@lists.linux.dev and + linux-mm@kvack.org. DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and From 61e88a2f66580d7488bbf7454423c81886d2e8cd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Feb 2023 19:20:08 +0000 Subject: [PATCH 459/505] mm/damon/Kconfig: add DAMON debugfs interface deprecation notice DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And, v6.1.y has announced to be an LTS[1]. Though the announcement was there for a while, some people might not noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, note DAMON debugfs interface as deprecated, and contacts to ask helps on the Kconfig. 
[1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c Link: https://lkml.kernel.org/r/20230209192009.7885-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 7821fcb3f258..436c6b4cb5ec 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -60,7 +60,7 @@ config DAMON_SYSFS the interface for arbitrary data access monitoring. config DAMON_DBGFS - bool "DAMON debugfs interface" + bool "DAMON debugfs interface (DEPRECATED!)" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS help This builds the debugfs interface for DAMON. The user space admins @@ -68,8 +68,9 @@ config DAMON_DBGFS If unsure, say N. - This will be removed after >5.15.y LTS kernel is released, so users - should move to the sysfs interface (DAMON_SYSFS). + This is deprecated, so users should move to the sysfs interface + (DAMON_SYSFS). If you depend on this and cannot move, please report + your usecase to damon@lists.linux.dev and linux-mm@kvack.org. config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS From 620932cd285208ef3009ac338b1eeed13ccd1753 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Feb 2023 19:20:09 +0000 Subject: [PATCH 460/505] mm/damon/dbgfs: print DAMON debugfs interface deprecation message DAMON debugfs interface has announced to be deprecated after >v5.15 LTS kernel is released. And, v6.1.y has announced to be an LTS[1]. Though the announcement was there for a while, some people might not noticed that so far. Also, some users could depend on it and have problems at movng to the alternative (DAMON sysfs interface). For such cases, warn DAMON debugfs interface deprecation with contacts to ask helps when any DAMON debugfs interface file is opened. [1] https://git.kernel.org/pub/scm/docs/kernel/website.git/commit/?id=332e9121320bc7461b2d3a79665caf153e51732c [sj@kernel.org: split DAMON debugfs file open warning message, per Randy] Link: https://lkml.kernel.org/r/20230209192009.7885-4-sj@kernel.org Link: https://lkml.kernel.org/r/20230210044838.63723-4-sj@kernel.org Link: https://lkml.kernel.org/r/20230209192009.7885-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index b3f454a5c682..124f0f8c97b7 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -20,6 +20,14 @@ static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; static DEFINE_MUTEX(damon_dbgfs_lock); +static void damon_dbgfs_warn_deprecation(void) +{ + pr_warn_once("DAMON debugfs interface is deprecated, " + "so users should move to DAMON_SYSFS. If you cannot, " + "please report your usecase to damon@lists.linux.dev and " + "linux-mm@kvack.org.\n"); +} + /* * Returns non-empty string on success, negative error code otherwise. 
*/ @@ -711,6 +719,8 @@ out: static int damon_dbgfs_open(struct inode *inode, struct file *file) { + damon_dbgfs_warn_deprecation(); + file->private_data = inode->i_private; return nonseekable_open(inode, file); @@ -1039,15 +1049,24 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return ret; } +static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) +{ + damon_dbgfs_warn_deprecation(); + return nonseekable_open(inode, file); +} + static const struct file_operations mk_contexts_fops = { + .open = damon_dbgfs_static_file_open, .write = dbgfs_mk_context_write, }; static const struct file_operations rm_contexts_fops = { + .open = damon_dbgfs_static_file_open, .write = dbgfs_rm_context_write, }; static const struct file_operations monitor_on_fops = { + .open = damon_dbgfs_static_file_open, .read = dbgfs_monitor_on_read, .write = dbgfs_monitor_on_write, }; From 6bdfc60cf0f9771d592a006dcd2cf6e40e1ccd79 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Fri, 10 Feb 2023 21:33:16 +0100 Subject: [PATCH 461/505] mm: fix typo in __vm_enough_memory warning Link: https://lkml.kernel.org/r/20230210203316.5613-1-jwilk@jwilk.net Signed-off-by: Jakub Wilk Acked-by: Mike Rapoport (IBM) Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index cec9327b27b4..b8ed9dbc7fd5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -967,7 +967,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: - pr_warn_ratelimited("%s: pid: %d, comm: %s, no enough memory for the allocation\n", + pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n", __func__, current->pid, current->comm); vm_unacct_memory(pages); From 15ef6a982f40a2b53b057dad24f00c3fb43e7e70 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:49 +0100 Subject: [PATCH 462/505] lib/stackdepot: put functions in logical order Patch series "lib/stackdepot: fixes and clean-ups", v2. A set of fixes, comments, and clean-ups I came up with while reading the stack depot code. This patch (of 18): Put stack depot functions' declarations and definitions in a more logical order: 1. Functions that save stack traces into stack depot. 2. Functions that fetch and print stack traces. 3. stack_depot_get_extra_bits that operates on stack depot handles and does not interact with the stack depot storage. No functional changes. Link: https://lkml.kernel.org/r/cover.1676063693.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/daca1319b665d826b94c596b992a8d8117846147.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Evgenii Stepanov Cc: Marco Elver Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 15 +- lib/stackdepot.c | 314 ++++++++++++++++++------------------- 2 files changed, 165 insertions(+), 164 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 9ca7798d7a31..1296a6eeaec0 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -14,17 +14,13 @@ #include typedef u32 depot_stack_handle_t; + /* * Number of bits in the handle that stack depot doesn't use. Users may store * information in them. 
*/ #define STACK_DEPOT_EXTRA_BITS 5 -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - unsigned int extra_bits, - gfp_t gfp_flags, bool can_alloc); - /* * Every user of stack depot has to call stack_depot_init() during its own init * when it's decided that it will be calling stack_depot_save() later. This is @@ -59,17 +55,22 @@ static inline void stack_depot_want_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + unsigned int extra_bits, + gfp_t gfp_flags, bool can_alloc); + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); +void stack_depot_print(depot_stack_handle_t stack); int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); -void stack_depot_print(depot_stack_handle_t stack); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 79e894cf8406..4bfaf3bce619 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -79,84 +79,6 @@ static int next_slab_inited; static size_t depot_offset; static DEFINE_RAW_SPINLOCK(depot_lock); -unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) -{ - union handle_parts parts = { .handle = handle }; - - return parts.extra; -} -EXPORT_SYMBOL(stack_depot_get_extra_bits); - -static bool init_stack_slab(void **prealloc) -{ - if (!*prealloc) - return false; - /* - * This smp_load_acquire() pairs with smp_store_release() to - * |next_slab_inited| below and in depot_alloc_stack(). - */ - if (smp_load_acquire(&next_slab_inited)) - return true; - if (stack_slabs[depot_index] == NULL) { - stack_slabs[depot_index] = *prealloc; - *prealloc = NULL; - } else { - /* If this is the last depot slab, do not touch the next one. */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { - stack_slabs[depot_index + 1] = *prealloc; - *prealloc = NULL; - } - /* - * This smp_store_release pairs with smp_load_acquire() from - * |next_slab_inited| above and in stack_depot_save(). - */ - smp_store_release(&next_slab_inited, 1); - } - return true; -} - -/* Allocation of a new stack in raw storage */ -static struct stack_record * -depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) -{ - struct stack_record *stack; - size_t required_size = struct_size(stack, entries, size); - - required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); - - if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { - WARN_ONCE(1, "Stack depot reached limit capacity"); - return NULL; - } - depot_index++; - depot_offset = 0; - /* - * smp_store_release() here pairs with smp_load_acquire() from - * |next_slab_inited| in stack_depot_save() and - * init_stack_slab(). 
- */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) - smp_store_release(&next_slab_inited, 0); - } - init_stack_slab(prealloc); - if (stack_slabs[depot_index] == NULL) - return NULL; - - stack = stack_slabs[depot_index] + depot_offset; - - stack->hash = hash; - stack->size = size; - stack->handle.slabindex = depot_index; - stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; - stack->handle.valid = 1; - stack->handle.extra = 0; - memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); - depot_offset += required_size; - - return stack; -} - /* one hash table bucket entry per 16kB of memory */ #define STACK_HASH_SCALE 14 /* limited between 4k and 1M buckets */ @@ -270,6 +192,76 @@ int stack_depot_init(void) } EXPORT_SYMBOL_GPL(stack_depot_init); +static bool init_stack_slab(void **prealloc) +{ + if (!*prealloc) + return false; + /* + * This smp_load_acquire() pairs with smp_store_release() to + * |next_slab_inited| below and in depot_alloc_stack(). + */ + if (smp_load_acquire(&next_slab_inited)) + return true; + if (stack_slabs[depot_index] == NULL) { + stack_slabs[depot_index] = *prealloc; + *prealloc = NULL; + } else { + /* If this is the last depot slab, do not touch the next one. */ + if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { + stack_slabs[depot_index + 1] = *prealloc; + *prealloc = NULL; + } + /* + * This smp_store_release pairs with smp_load_acquire() from + * |next_slab_inited| above and in stack_depot_save(). + */ + smp_store_release(&next_slab_inited, 1); + } + return true; +} + +/* Allocation of a new stack in raw storage */ +static struct stack_record * +depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) +{ + struct stack_record *stack; + size_t required_size = struct_size(stack, entries, size); + + required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); + + if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { + if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { + WARN_ONCE(1, "Stack depot reached limit capacity"); + return NULL; + } + depot_index++; + depot_offset = 0; + /* + * smp_store_release() here pairs with smp_load_acquire() from + * |next_slab_inited| in stack_depot_save() and + * init_stack_slab(). + */ + if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) + smp_store_release(&next_slab_inited, 0); + } + init_stack_slab(prealloc); + if (stack_slabs[depot_index] == NULL) + return NULL; + + stack = stack_slabs[depot_index] + depot_offset; + + stack->hash = hash; + stack->size = size; + stack->handle.slabindex = depot_index; + stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; + stack->handle.valid = 1; + stack->handle.extra = 0; + memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); + depot_offset += required_size; + + return stack; +} + /* Calculate hash for a stack */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { @@ -309,85 +301,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, return NULL; } -/** - * stack_depot_snprint - print stack entries from a depot into a buffer - * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @buf: Pointer to the print buffer - * - * @size: Size of the print buffer - * - * @spaces: Number of leading spaces to print - * - * Return: Number of bytes printed. 
- */ -int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, - int spaces) -{ - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(handle, &entries); - return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries, - spaces) : 0; -} -EXPORT_SYMBOL_GPL(stack_depot_snprint); - -/** - * stack_depot_print - print stack entries from a depot - * - * @stack: Stack depot handle which was returned from - * stack_depot_save(). - * - */ -void stack_depot_print(depot_stack_handle_t stack) -{ - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(stack, &entries); - if (nr_entries > 0) - stack_trace_print(entries, nr_entries, 0); -} -EXPORT_SYMBOL_GPL(stack_depot_print); - -/** - * stack_depot_fetch - Fetch stack entries from a depot - * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @entries: Pointer to store the entries address - * - * Return: The number of trace entries for this depot. - */ -unsigned int stack_depot_fetch(depot_stack_handle_t handle, - unsigned long **entries) -{ - union handle_parts parts = { .handle = handle }; - void *slab; - size_t offset = parts.offset << STACK_ALLOC_ALIGN; - struct stack_record *stack; - - *entries = NULL; - if (!handle) - return 0; - - if (parts.slabindex > depot_index) { - WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", - parts.slabindex, depot_index, handle); - return 0; - } - slab = stack_slabs[parts.slabindex]; - if (!slab) - return 0; - stack = slab + offset; - - *entries = stack->entries; - return stack->size; -} -EXPORT_SYMBOL_GPL(stack_depot_fetch); - /** * __stack_depot_save - Save a stack trace from an array * @@ -533,3 +446,90 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); + +/** + * stack_depot_fetch - Fetch stack entries from a depot + * + * @handle: Stack depot handle which was returned from + * stack_depot_save(). + * @entries: Pointer to store the entries address + * + * Return: The number of trace entries for this depot. + */ +unsigned int stack_depot_fetch(depot_stack_handle_t handle, + unsigned long **entries) +{ + union handle_parts parts = { .handle = handle }; + void *slab; + size_t offset = parts.offset << STACK_ALLOC_ALIGN; + struct stack_record *stack; + + *entries = NULL; + if (!handle) + return 0; + + if (parts.slabindex > depot_index) { + WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", + parts.slabindex, depot_index, handle); + return 0; + } + slab = stack_slabs[parts.slabindex]; + if (!slab) + return 0; + stack = slab + offset; + + *entries = stack->entries; + return stack->size; +} +EXPORT_SYMBOL_GPL(stack_depot_fetch); + +/** + * stack_depot_print - print stack entries from a depot + * + * @stack: Stack depot handle which was returned from + * stack_depot_save(). + * + */ +void stack_depot_print(depot_stack_handle_t stack) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(stack, &entries); + if (nr_entries > 0) + stack_trace_print(entries, nr_entries, 0); +} +EXPORT_SYMBOL_GPL(stack_depot_print); + +/** + * stack_depot_snprint - print stack entries from a depot into a buffer + * + * @handle: Stack depot handle which was returned from + * stack_depot_save(). 
+ * @buf: Pointer to the print buffer + * + * @size: Size of the print buffer + * + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed. + */ +int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, + int spaces) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(handle, &entries); + return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries, + spaces) : 0; +} +EXPORT_SYMBOL_GPL(stack_depot_snprint); + +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) +{ + union handle_parts parts = { .handle = handle }; + + return parts.extra; +} +EXPORT_SYMBOL(stack_depot_get_extra_bits); From 4a6b5314d6bd9093dcc3c0c8e185af7df9a0fe34 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:50 +0100 Subject: [PATCH 463/505] lib/stackdepot: use pr_fmt to define message format Use pr_fmt to define the format for printing stack depot messages instead of duplicating the "Stack Depot" prefix in each message. Link: https://lkml.kernel.org/r/3d09db0171a0e92ff3eb0ee74de74558bc9b56c4.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4bfaf3bce619..83787e46a3ab 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -19,6 +19,8 @@ * Based on code by Dmitry Chernenkov. */ +#define pr_fmt(fmt) "stackdepot: " fmt + #include #include #include @@ -98,7 +100,7 @@ static int __init is_stack_depot_disabled(char *str) ret = kstrtobool(str, &stack_depot_disable); if (!ret && stack_depot_disable) { - pr_info("Stack Depot is disabled\n"); + pr_info("disabled\n"); stack_table = NULL; } return 0; @@ -142,7 +144,7 @@ int __init stack_depot_early_init(void) 1UL << STACK_HASH_ORDER_MAX); if (!stack_table) { - pr_err("Stack Depot hash table allocation failed, disabling\n"); + pr_err("hash table allocation failed, disabling\n"); stack_depot_disable = true; return -ENOMEM; } @@ -177,11 +179,11 @@ int stack_depot_init(void) if (entries > 1UL << STACK_HASH_ORDER_MAX) entries = 1UL << STACK_HASH_ORDER_MAX; - pr_info("Stack Depot allocating hash table of %lu entries with kvcalloc\n", + pr_info("allocating hash table of %lu entries with kvcalloc\n", entries); stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { - pr_err("Stack Depot hash table allocation failed, disabling\n"); + pr_err("hash table allocation failed, disabling\n"); stack_depot_disable = true; ret = -ENOMEM; } From 1c0310add78e7e47e3357c24369b61453a5a72eb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:51 +0100 Subject: [PATCH 464/505] lib/stackdepot, mm: rename stack_depot_want_early_init Rename stack_depot_want_early_init to stack_depot_request_early_init. The old name is confusing, as it hints at returning some kind of intention of stack depot. The new name reflects that this function requests an action from stack depot instead. No functional changes. 
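As a usage sketch (not part of this patch; "example_feature" and the surrounding names are invented), a typical caller of the renamed function is an early_param() handler that decides from a boot parameter that stack depot will be needed and requests the early allocation via the declaration in include/linux/stackdepot.h, mirroring what mm/page_owner.c and mm/kmemleak.c do in the hunks below:

	static bool example_feature_enabled __initdata;

	static int __init early_example_feature_param(char *buf)
	{
		int ret = kstrtobool(buf, &example_feature_enabled);

		/* Flag that the early hash table allocation is wanted. */
		if (!ret && example_feature_enabled)
			stack_depot_request_early_init();

		return ret;
	}
	early_param("example_feature", early_example_feature_param);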
[akpm@linux-foundation.org: update mm/kmemleak.c] Link: https://lkml.kernel.org/r/359f31bf67429a06e630b4395816a967214ef753.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 14 +++++++------- lib/stackdepot.c | 10 +++++----- mm/kmemleak.c | 2 +- mm/page_owner.c | 2 +- mm/slub.c | 4 ++-- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 1296a6eeaec0..c4e3abc16b16 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -31,26 +31,26 @@ typedef u32 depot_stack_handle_t; * enabled as part of mm_init(), for subsystems where it's known at compile time * that stack depot will be used. * - * Another alternative is to call stack_depot_want_early_init(), when the + * Another alternative is to call stack_depot_request_early_init(), when the * decision to use stack depot is taken e.g. when evaluating kernel boot * parameters, which precedes the enablement point in mm_init(). * - * stack_depot_init() and stack_depot_want_early_init() can be called regardless - * of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print - * functions should only be called from code that makes sure CONFIG_STACKDEPOT - * is enabled. + * stack_depot_init() and stack_depot_request_early_init() can be called + * regardless of CONFIG_STACKDEPOT and are no-op when disabled. The actual + * save/fetch/print functions should only be called from code that makes sure + * CONFIG_STACKDEPOT is enabled. */ #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); -void __init stack_depot_want_early_init(void); +void __init stack_depot_request_early_init(void); /* This is supposed to be called only from mm_init() */ int __init stack_depot_early_init(void); #else static inline int stack_depot_init(void) { return 0; } -static inline void stack_depot_want_early_init(void) { } +static inline void stack_depot_request_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 83787e46a3ab..136706efe339 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -71,7 +71,7 @@ struct stack_record { unsigned long entries[]; /* Variable-sized array of entries. */ }; -static bool __stack_depot_want_early_init __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); +static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; @@ -107,12 +107,12 @@ static int __init is_stack_depot_disabled(char *str) } early_param("stack_depot_disable", is_stack_depot_disabled); -void __init stack_depot_want_early_init(void) +void __init stack_depot_request_early_init(void) { - /* Too late to request early init now */ + /* Too late to request early init now. 
*/ WARN_ON(__stack_depot_early_init_passed); - __stack_depot_want_early_init = true; + __stack_depot_early_init_requested = true; } int __init stack_depot_early_init(void) @@ -128,7 +128,7 @@ int __init stack_depot_early_init(void) if (kasan_enabled() && !stack_hash_order) stack_hash_order = STACK_HASH_ORDER_MAX; - if (!__stack_depot_want_early_init || stack_depot_disable) + if (!__stack_depot_early_init_requested || stack_depot_disable) return 0; if (stack_hash_order) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d9b242cfdb1c..a2d34226e3c8 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -2071,7 +2071,7 @@ static int __init kmemleak_boot_config(char *str) kmemleak_disable(); else if (strcmp(str, "on") == 0) { kmemleak_skip_disable = 1; - stack_depot_want_early_init(); + stack_depot_request_early_init(); } else return -EINVAL; diff --git a/mm/page_owner.c b/mm/page_owner.c index 80dc8f4050fa..220cdeddc295 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -48,7 +48,7 @@ static int __init early_page_owner_param(char *buf) int ret = kstrtobool(buf, &page_owner_enabled); if (page_owner_enabled) - stack_depot_want_early_init(); + stack_depot_request_early_init(); return ret; } diff --git a/mm/slub.c b/mm/slub.c index 67020074ecb4..f8dba33e4d15 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1592,7 +1592,7 @@ static int __init setup_slub_debug(char *str) } else { slab_list_specified = true; if (flags & SLAB_STORE_USER) - stack_depot_want_early_init(); + stack_depot_request_early_init(); } } @@ -1611,7 +1611,7 @@ static int __init setup_slub_debug(char *str) out: slub_debug = global_flags; if (slub_debug & SLAB_STORE_USER) - stack_depot_want_early_init(); + stack_depot_request_early_init(); if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else From 735df3c3a3493cbedfa86739eec6e2ee37fe95f8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:52 +0100 Subject: [PATCH 465/505] lib/stackdepot: rename stack_depot_disable Rename stack_depot_disable to stack_depot_disabled to make its name look similar to the names of other stack depot flags. Also put stack_depot_disabled's definition together with the other flags. Also rename is_stack_depot_disabled to disable_stack_depot: this name looks more conventional for a function that processes a boot parameter. No functional changes. Link: https://lkml.kernel.org/r/d78a07d222e689926e5ead229e4a2e3d87dc9aa7.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 136706efe339..202e07c4f02d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -71,6 +71,7 @@ struct stack_record { unsigned long entries[]; /* Variable-sized array of entries. 
*/ }; +static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; @@ -91,21 +92,20 @@ static DEFINE_RAW_SPINLOCK(depot_lock); static unsigned int stack_hash_order; static unsigned int stack_hash_mask; -static bool stack_depot_disable; static struct stack_record **stack_table; -static int __init is_stack_depot_disabled(char *str) +static int __init disable_stack_depot(char *str) { int ret; - ret = kstrtobool(str, &stack_depot_disable); - if (!ret && stack_depot_disable) { + ret = kstrtobool(str, &stack_depot_disabled); + if (!ret && stack_depot_disabled) { pr_info("disabled\n"); stack_table = NULL; } return 0; } -early_param("stack_depot_disable", is_stack_depot_disabled); +early_param("stack_depot_disable", disable_stack_depot); void __init stack_depot_request_early_init(void) { @@ -128,7 +128,7 @@ int __init stack_depot_early_init(void) if (kasan_enabled() && !stack_hash_order) stack_hash_order = STACK_HASH_ORDER_MAX; - if (!__stack_depot_early_init_requested || stack_depot_disable) + if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; if (stack_hash_order) @@ -145,7 +145,7 @@ int __init stack_depot_early_init(void) if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); - stack_depot_disable = true; + stack_depot_disabled = true; return -ENOMEM; } @@ -158,7 +158,7 @@ int stack_depot_init(void) int ret = 0; mutex_lock(&stack_depot_init_mutex); - if (!stack_depot_disable && !stack_table) { + if (!stack_depot_disabled && !stack_table) { unsigned long entries; int scale = STACK_HASH_SCALE; @@ -184,7 +184,7 @@ int stack_depot_init(void) stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); - stack_depot_disable = true; + stack_depot_disabled = true; ret = -ENOMEM; } stack_hash_mask = entries - 1; @@ -353,7 +353,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, */ nr_entries = filter_irq_stacks(entries, nr_entries); - if (unlikely(nr_entries == 0) || stack_depot_disable) + if (unlikely(nr_entries == 0) || stack_depot_disabled) goto fast_exit; hash = hash_stack(entries, nr_entries); From df225c877d898992740d26250727a16db65c370d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:53 +0100 Subject: [PATCH 466/505] lib/stackdepot: annotate init and early init functions Add comments to stack_depot_early_init and stack_depot_init to explain certain parts of their implementation. Also add a pr_info message to stack_depot_early_init similar to the one in stack_depot_init. Also move the scale variable in stack_depot_init to the scope where it is being used. Link: https://lkml.kernel.org/r/d17fbfbd4d73f38686c5e3d4824a6d62047213a1.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 202e07c4f02d..9fab711e4826 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -115,24 +115,34 @@ void __init stack_depot_request_early_init(void) __stack_depot_early_init_requested = true; } +/* Allocates a hash table via memblock. Can only be used during early boot. 
*/ int __init stack_depot_early_init(void) { unsigned long entries = 0; - /* This is supposed to be called only once, from mm_init() */ + /* This function must be called only once, from mm_init(). */ if (WARN_ON(__stack_depot_early_init_passed)) return 0; - __stack_depot_early_init_passed = true; + /* + * If KASAN is enabled, use the maximum order: KASAN is frequently used + * in fuzzing scenarios, which leads to a large number of different + * stack traces being stored in stack depot. + */ if (kasan_enabled() && !stack_hash_order) stack_hash_order = STACK_HASH_ORDER_MAX; if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; + /* + * If stack_hash_order is not set, leave entries as 0 to rely on the + * automatic calculations performed by alloc_large_system_hash. + */ if (stack_hash_order) - entries = 1UL << stack_hash_order; + entries = 1UL << stack_hash_order; + pr_info("allocating hash table via alloc_large_system_hash\n"); stack_table = alloc_large_system_hash("stackdepot", sizeof(struct stack_record *), entries, @@ -142,7 +152,6 @@ int __init stack_depot_early_init(void) &stack_hash_mask, 1UL << STACK_HASH_ORDER_MIN, 1UL << STACK_HASH_ORDER_MAX); - if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); stack_depot_disabled = true; @@ -152,6 +161,7 @@ int __init stack_depot_early_init(void) return 0; } +/* Allocates a hash table via kvcalloc. Can be used after boot. */ int stack_depot_init(void) { static DEFINE_MUTEX(stack_depot_init_mutex); @@ -160,11 +170,16 @@ int stack_depot_init(void) mutex_lock(&stack_depot_init_mutex); if (!stack_depot_disabled && !stack_table) { unsigned long entries; - int scale = STACK_HASH_SCALE; + /* + * Similarly to stack_depot_early_init, use stack_hash_order + * if assigned, and rely on automatic scaling otherwise. + */ if (stack_hash_order) { entries = 1UL << stack_hash_order; } else { + int scale = STACK_HASH_SCALE; + entries = nr_free_buffer_pages(); entries = roundup_pow_of_two(entries); @@ -179,7 +194,7 @@ int stack_depot_init(void) if (entries > 1UL << STACK_HASH_ORDER_MAX) entries = 1UL << STACK_HASH_ORDER_MAX; - pr_info("allocating hash table of %lu entries with kvcalloc\n", + pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); if (!stack_table) { From c60324fbf05d9b1dd3231f5373d4bf31cc23db07 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:54 +0100 Subject: [PATCH 467/505] lib/stackdepot: lower the indentation in stack_depot_init stack_depot_init does most things inside an if check. Move them out and use a goto statement instead. No functional changes. 
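For illustration only (names are invented), the shape of the transformation is the usual guard-clause-plus-goto pattern: bail out at the top, keep the main path at a single indentation level, and funnel every exit through one unlock label:

	static DEFINE_MUTEX(example_init_lock);
	static void **example_table;

	static int example_init(void)
	{
		int ret = 0;

		mutex_lock(&example_init_lock);
		/* Guard clause: nothing to do if already initialized. */
		if (example_table)
			goto out_unlock;

		example_table = kvcalloc(1024, sizeof(void *), GFP_KERNEL);
		if (!example_table)
			ret = -ENOMEM;

	out_unlock:
		mutex_unlock(&example_init_lock);
		return ret;
	}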
Link: https://lkml.kernel.org/r/8e382f1f0c352e4b2ad47326fec7782af961fe8e.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 66 +++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 9fab711e4826..3c713f70b0a3 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -165,46 +165,50 @@ int __init stack_depot_early_init(void) int stack_depot_init(void) { static DEFINE_MUTEX(stack_depot_init_mutex); + unsigned long entries; int ret = 0; mutex_lock(&stack_depot_init_mutex); - if (!stack_depot_disabled && !stack_table) { - unsigned long entries; - /* - * Similarly to stack_depot_early_init, use stack_hash_order - * if assigned, and rely on automatic scaling otherwise. - */ - if (stack_hash_order) { - entries = 1UL << stack_hash_order; - } else { - int scale = STACK_HASH_SCALE; + if (stack_depot_disabled || stack_table) + goto out_unlock; - entries = nr_free_buffer_pages(); - entries = roundup_pow_of_two(entries); + /* + * Similarly to stack_depot_early_init, use stack_hash_order + * if assigned, and rely on automatic scaling otherwise. + */ + if (stack_hash_order) { + entries = 1UL << stack_hash_order; + } else { + int scale = STACK_HASH_SCALE; - if (scale > PAGE_SHIFT) - entries >>= (scale - PAGE_SHIFT); - else - entries <<= (PAGE_SHIFT - scale); - } + entries = nr_free_buffer_pages(); + entries = roundup_pow_of_two(entries); - if (entries < 1UL << STACK_HASH_ORDER_MIN) - entries = 1UL << STACK_HASH_ORDER_MIN; - if (entries > 1UL << STACK_HASH_ORDER_MAX) - entries = 1UL << STACK_HASH_ORDER_MAX; - - pr_info("allocating hash table of %lu entries via kvcalloc\n", - entries); - stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); - if (!stack_table) { - pr_err("hash table allocation failed, disabling\n"); - stack_depot_disabled = true; - ret = -ENOMEM; - } - stack_hash_mask = entries - 1; + if (scale > PAGE_SHIFT) + entries >>= (scale - PAGE_SHIFT); + else + entries <<= (PAGE_SHIFT - scale); } + + if (entries < 1UL << STACK_HASH_ORDER_MIN) + entries = 1UL << STACK_HASH_ORDER_MIN; + if (entries > 1UL << STACK_HASH_ORDER_MAX) + entries = 1UL << STACK_HASH_ORDER_MAX; + + pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); + stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); + if (!stack_table) { + pr_err("hash table allocation failed, disabling\n"); + stack_depot_disabled = true; + ret = -ENOMEM; + goto out_unlock; + } + stack_hash_mask = entries - 1; + +out_unlock: mutex_unlock(&stack_depot_init_mutex); + return ret; } EXPORT_SYMBOL_GPL(stack_depot_init); From 0d249ac0e07680960929a2d4f7b32be505c8c7a1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:55 +0100 Subject: [PATCH 468/505] lib/stackdepot: reorder and annotate global variables Group stack depot global variables by their purpose: 1. Hash table-related variables, 2. Slab-related variables, and add comments. Also clean up comments for hash table-related constants. 
Link: https://lkml.kernel.org/r/5606a6c70659065a25bee59cd10e57fc60bb4110.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 3c713f70b0a3..de1afe3fb24d 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -75,24 +75,31 @@ static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; -static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; - -static int depot_index; -static int next_slab_inited; -static size_t depot_offset; -static DEFINE_RAW_SPINLOCK(depot_lock); - -/* one hash table bucket entry per 16kB of memory */ +/* Use one hash table bucket per 16 KB of memory. */ #define STACK_HASH_SCALE 14 -/* limited between 4k and 1M buckets */ +/* Limit the number of buckets between 4K and 1M. */ #define STACK_HASH_ORDER_MIN 12 #define STACK_HASH_ORDER_MAX 20 +/* Initial seed for jhash2. */ #define STACK_HASH_SEED 0x9747b28c +/* Hash table of pointers to stored stack traces. */ +static struct stack_record **stack_table; +/* Fixed order of the number of table buckets. Used when KASAN is enabled. */ static unsigned int stack_hash_order; +/* Hash mask for indexing the table. */ static unsigned int stack_hash_mask; -static struct stack_record **stack_table; +/* Array of memory regions that store stack traces. */ +static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; +/* Currently used slab in stack_slabs. */ +static int depot_index; +/* Offset to the unused space in the currently used slab. */ +static size_t depot_offset; +/* Lock that protects the variables above. */ +static DEFINE_RAW_SPINLOCK(depot_lock); +/* Whether the next slab is initialized. */ +static int next_slab_inited; static int __init disable_stack_depot(char *str) { From 4c2e9a679468a5bef2100504914481c6ddf0d9bd Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:56 +0100 Subject: [PATCH 469/505] lib/stackdepot: rename hash table constants and variables Give more meaningful names to hash table-related constants and variables: 1. Rename STACK_HASH_SCALE to STACK_HASH_TABLE_SCALE to point out that it is related to scaling the hash table. 2. Rename STACK_HASH_ORDER_MIN/MAX to STACK_BUCKET_NUMBER_ORDER_MIN/MAX to point out that it is related to the number of hash table buckets. 3. Rename stack_hash_order to stack_bucket_number_order for the same reason as #2. No functional changes. Link: https://lkml.kernel.org/r/f166dd6f3cb2378aea78600714393dd568c33ee9.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index de1afe3fb24d..d1ab53197353 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -76,17 +76,17 @@ static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_ST static bool __stack_depot_early_init_passed __initdata; /* Use one hash table bucket per 16 KB of memory. */ -#define STACK_HASH_SCALE 14 +#define STACK_HASH_TABLE_SCALE 14 /* Limit the number of buckets between 4K and 1M. 
*/ -#define STACK_HASH_ORDER_MIN 12 -#define STACK_HASH_ORDER_MAX 20 +#define STACK_BUCKET_NUMBER_ORDER_MIN 12 +#define STACK_BUCKET_NUMBER_ORDER_MAX 20 /* Initial seed for jhash2. */ #define STACK_HASH_SEED 0x9747b28c /* Hash table of pointers to stored stack traces. */ static struct stack_record **stack_table; /* Fixed order of the number of table buckets. Used when KASAN is enabled. */ -static unsigned int stack_hash_order; +static unsigned int stack_bucket_number_order; /* Hash mask for indexing the table. */ static unsigned int stack_hash_mask; @@ -137,28 +137,28 @@ int __init stack_depot_early_init(void) * in fuzzing scenarios, which leads to a large number of different * stack traces being stored in stack depot. */ - if (kasan_enabled() && !stack_hash_order) - stack_hash_order = STACK_HASH_ORDER_MAX; + if (kasan_enabled() && !stack_bucket_number_order) + stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX; if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; /* - * If stack_hash_order is not set, leave entries as 0 to rely on the - * automatic calculations performed by alloc_large_system_hash. + * If stack_bucket_number_order is not set, leave entries as 0 to rely + * on the automatic calculations performed by alloc_large_system_hash. */ - if (stack_hash_order) - entries = 1UL << stack_hash_order; + if (stack_bucket_number_order) + entries = 1UL << stack_bucket_number_order; pr_info("allocating hash table via alloc_large_system_hash\n"); stack_table = alloc_large_system_hash("stackdepot", sizeof(struct stack_record *), entries, - STACK_HASH_SCALE, + STACK_HASH_TABLE_SCALE, HASH_EARLY | HASH_ZERO, NULL, &stack_hash_mask, - 1UL << STACK_HASH_ORDER_MIN, - 1UL << STACK_HASH_ORDER_MAX); + 1UL << STACK_BUCKET_NUMBER_ORDER_MIN, + 1UL << STACK_BUCKET_NUMBER_ORDER_MAX); if (!stack_table) { pr_err("hash table allocation failed, disabling\n"); stack_depot_disabled = true; @@ -181,13 +181,13 @@ int stack_depot_init(void) goto out_unlock; /* - * Similarly to stack_depot_early_init, use stack_hash_order + * Similarly to stack_depot_early_init, use stack_bucket_number_order * if assigned, and rely on automatic scaling otherwise. */ - if (stack_hash_order) { - entries = 1UL << stack_hash_order; + if (stack_bucket_number_order) { + entries = 1UL << stack_bucket_number_order; } else { - int scale = STACK_HASH_SCALE; + int scale = STACK_HASH_TABLE_SCALE; entries = nr_free_buffer_pages(); entries = roundup_pow_of_two(entries); @@ -198,10 +198,10 @@ int stack_depot_init(void) entries <<= (PAGE_SHIFT - scale); } - if (entries < 1UL << STACK_HASH_ORDER_MIN) - entries = 1UL << STACK_HASH_ORDER_MIN; - if (entries > 1UL << STACK_HASH_ORDER_MAX) - entries = 1UL << STACK_HASH_ORDER_MAX; + if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN) + entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN; + if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX) + entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX; pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); From 961c949b012f009c51ce209ded801e34d0a75306 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:57 +0100 Subject: [PATCH 470/505] lib/stackdepot: rename slab to pool Use "pool" instead of "slab" for naming memory regions stack depot uses to store stack traces. Using "slab" is confusing, as stack depot pools have nothing to do with the slab allocator. 
Also give better names to pool-related global variables: change "depot_" prefix to "pool_" to point out that these variables are related to stack depot pools. Also rename the slabindex (poolindex) field in handle_parts to pool_index to align its name with the pool_index global variable. No functional changes. Link: https://lkml.kernel.org/r/923c507edb350c3b6ef85860f36be489dfc0ad21.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Vlastimil Babka Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 106 +++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index d1ab53197353..522e36cf449f 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -39,7 +39,7 @@ #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) #define STACK_ALLOC_NULL_PROTECTION_BITS 1 -#define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */ +#define STACK_ALLOC_ORDER 2 /* Pool size order for stack depot, 4 pages */ #define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER)) #define STACK_ALLOC_ALIGN 4 #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ @@ -47,16 +47,16 @@ #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ STACK_ALLOC_NULL_PROTECTION_BITS - \ STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) -#define STACK_ALLOC_SLABS_CAP 8192 -#define STACK_ALLOC_MAX_SLABS \ - (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ - (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) +#define STACK_ALLOC_POOLS_CAP 8192 +#define STACK_ALLOC_MAX_POOLS \ + (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_POOLS_CAP) ? \ + (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_POOLS_CAP) /* The compact structure to store the reference to stacks. */ union handle_parts { depot_stack_handle_t handle; struct { - u32 slabindex : STACK_ALLOC_INDEX_BITS; + u32 pool_index : STACK_ALLOC_INDEX_BITS; u32 offset : STACK_ALLOC_OFFSET_BITS; u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; u32 extra : STACK_DEPOT_EXTRA_BITS; @@ -91,15 +91,15 @@ static unsigned int stack_bucket_number_order; static unsigned int stack_hash_mask; /* Array of memory regions that store stack traces. */ -static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; -/* Currently used slab in stack_slabs. */ -static int depot_index; -/* Offset to the unused space in the currently used slab. */ -static size_t depot_offset; +static void *stack_pools[STACK_ALLOC_MAX_POOLS]; +/* Currently used pool in stack_pools. */ +static int pool_index; +/* Offset to the unused space in the currently used pool. */ +static size_t pool_offset; /* Lock that protects the variables above. */ -static DEFINE_RAW_SPINLOCK(depot_lock); -/* Whether the next slab is initialized. */ -static int next_slab_inited; +static DEFINE_RAW_SPINLOCK(pool_lock); +/* Whether the next pool is initialized. */ +static int next_pool_inited; static int __init disable_stack_depot(char *str) { @@ -220,30 +220,30 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); -static bool init_stack_slab(void **prealloc) +static bool init_stack_pool(void **prealloc) { if (!*prealloc) return false; /* * This smp_load_acquire() pairs with smp_store_release() to - * |next_slab_inited| below and in depot_alloc_stack(). + * |next_pool_inited| below and in depot_alloc_stack(). 
*/ - if (smp_load_acquire(&next_slab_inited)) + if (smp_load_acquire(&next_pool_inited)) return true; - if (stack_slabs[depot_index] == NULL) { - stack_slabs[depot_index] = *prealloc; + if (stack_pools[pool_index] == NULL) { + stack_pools[pool_index] = *prealloc; *prealloc = NULL; } else { - /* If this is the last depot slab, do not touch the next one. */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { - stack_slabs[depot_index + 1] = *prealloc; + /* If this is the last depot pool, do not touch the next one. */ + if (pool_index + 1 < STACK_ALLOC_MAX_POOLS) { + stack_pools[pool_index + 1] = *prealloc; *prealloc = NULL; } /* * This smp_store_release pairs with smp_load_acquire() from - * |next_slab_inited| above and in stack_depot_save(). + * |next_pool_inited| above and in stack_depot_save(). */ - smp_store_release(&next_slab_inited, 1); + smp_store_release(&next_pool_inited, 1); } return true; } @@ -257,35 +257,35 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); - if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { + if (unlikely(pool_offset + required_size > STACK_ALLOC_SIZE)) { + if (unlikely(pool_index + 1 >= STACK_ALLOC_MAX_POOLS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); return NULL; } - depot_index++; - depot_offset = 0; + pool_index++; + pool_offset = 0; /* * smp_store_release() here pairs with smp_load_acquire() from - * |next_slab_inited| in stack_depot_save() and - * init_stack_slab(). + * |next_pool_inited| in stack_depot_save() and + * init_stack_pool(). */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) - smp_store_release(&next_slab_inited, 0); + if (pool_index + 1 < STACK_ALLOC_MAX_POOLS) + smp_store_release(&next_pool_inited, 0); } - init_stack_slab(prealloc); - if (stack_slabs[depot_index] == NULL) + init_stack_pool(prealloc); + if (stack_pools[pool_index] == NULL) return NULL; - stack = stack_slabs[depot_index] + depot_offset; + stack = stack_pools[pool_index] + pool_offset; stack->hash = hash; stack->size = size; - stack->handle.slabindex = depot_index; - stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; + stack->handle.pool_index = pool_index; + stack->handle.offset = pool_offset >> STACK_ALLOC_ALIGN; stack->handle.valid = 1; stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); - depot_offset += required_size; + pool_offset += required_size; return stack; } @@ -336,10 +336,10 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, * @nr_entries: Size of the storage array * @extra_bits: Flags to store in unused bits of depot_stack_handle_t * @alloc_flags: Allocation gfp flags - * @can_alloc: Allocate stack slabs (increased chance of failure if false) + * @can_alloc: Allocate stack pools (increased chance of failure if false) * * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, is allowed to replenish the stack slab pool in case no space is left + * %true, is allowed to replenish the stack pool in case no space is left * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids * any allocations and will fail if no space is left to store the stack trace. * @@ -396,14 +396,14 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, goto exit; /* - * Check if the current or the next stack slab need to be initialized. 
+ * Check if the current or the next stack pool need to be initialized. * If so, allocate the memory - we won't be able to do that under the * lock. * * The smp_load_acquire() here pairs with smp_store_release() to - * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). + * |next_pool_inited| in depot_alloc_stack() and init_stack_pool(). */ - if (unlikely(can_alloc && !smp_load_acquire(&next_slab_inited))) { + if (unlikely(can_alloc && !smp_load_acquire(&next_pool_inited))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic @@ -417,7 +417,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, prealloc = page_address(page); } - raw_spin_lock_irqsave(&depot_lock, flags); + raw_spin_lock_irqsave(&pool_lock, flags); found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { @@ -437,10 +437,10 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * We didn't need to store this stack trace, but let's keep * the preallocated memory for the future. */ - WARN_ON(!init_stack_slab(&prealloc)); + WARN_ON(!init_stack_pool(&prealloc)); } - raw_spin_unlock_irqrestore(&depot_lock, flags); + raw_spin_unlock_irqrestore(&pool_lock, flags); exit: if (prealloc) { /* Nobody used this memory, ok to free it. */ @@ -488,7 +488,7 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { union handle_parts parts = { .handle = handle }; - void *slab; + void *pool; size_t offset = parts.offset << STACK_ALLOC_ALIGN; struct stack_record *stack; @@ -496,15 +496,15 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle) return 0; - if (parts.slabindex > depot_index) { - WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", - parts.slabindex, depot_index, handle); + if (parts.pool_index > pool_index) { + WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", + parts.pool_index, pool_index, handle); return 0; } - slab = stack_slabs[parts.slabindex]; - if (!slab) + pool = stack_pools[parts.pool_index]; + if (!pool) return 0; - stack = slab + offset; + stack = pool + offset; *entries = stack->entries; return stack->size; From 424cafee4a9c66435d8b86e7edbc794c116b52a5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:58 +0100 Subject: [PATCH 471/505] lib/stackdepot: rename handle and pool constants Change the "STACK_ALLOC_" prefix to "DEPOT_" for the constants that define the number of bits in stack depot handles and the maximum number of pools. The old prefix is unclear and makes wonder about how these constants are related to stack allocations. The new prefix is also shorter. Also simplify the comment for DEPOT_POOL_ORDER. No functional changes. 
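For reference, with these names the constants still partition the whole 32-bit handle. Below is a standalone arithmetic sketch of that partition; it assumes 4 KB pages (PAGE_SHIFT == 12) and the 5 extra bits reserved by STACK_DEPOT_EXTRA_BITS, and it is illustrative user-space C rather than kernel code:

#include <stdio.h>

/* Assumed configuration: 4 KB pages. */
#define PAGE_SHIFT		12

/* Local mirrors of the renamed constants, used for the arithmetic only. */
#define DEPOT_HANDLE_BITS	32	/* depot_stack_handle_t is u32 */
#define DEPOT_VALID_BITS	1
#define DEPOT_POOL_ORDER	2	/* pool = 4 pages = 16 KB */
#define DEPOT_STACK_ALIGN	4	/* stack records aligned to 16 bytes */
#define STACK_DEPOT_EXTRA_BITS	5
#define DEPOT_OFFSET_BITS	(DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
#define DEPOT_POOL_INDEX_BITS	(DEPOT_HANDLE_BITS - DEPOT_VALID_BITS - \
				 DEPOT_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS)

int main(void)
{
	/* 1 + 10 + 16 + 5 == 32: the handle has no unused bits left over. */
	printf("valid=%d offset=%d pool_index=%d extra=%d total=%d\n",
	       DEPOT_VALID_BITS, DEPOT_OFFSET_BITS, DEPOT_POOL_INDEX_BITS,
	       STACK_DEPOT_EXTRA_BITS,
	       DEPOT_VALID_BITS + DEPOT_OFFSET_BITS +
	       DEPOT_POOL_INDEX_BITS + STACK_DEPOT_EXTRA_BITS);
	/*
	 * 16 pool-index bits could address 65536 pools, so DEPOT_MAX_POOLS
	 * ends up being the DEPOT_POOLS_CAP limit of 8192, not the index limit.
	 */
	return 0;
}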
Link: https://lkml.kernel.org/r/84fcceb0acc261a356a0ad4bdfab9ff04bea2445.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 54 +++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 522e36cf449f..97bba462ee13 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -36,30 +36,28 @@ #include #include -#define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) +#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) -#define STACK_ALLOC_NULL_PROTECTION_BITS 1 -#define STACK_ALLOC_ORDER 2 /* Pool size order for stack depot, 4 pages */ -#define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER)) -#define STACK_ALLOC_ALIGN 4 -#define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ - STACK_ALLOC_ALIGN) -#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ - STACK_ALLOC_NULL_PROTECTION_BITS - \ - STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) -#define STACK_ALLOC_POOLS_CAP 8192 -#define STACK_ALLOC_MAX_POOLS \ - (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_POOLS_CAP) ? \ - (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_POOLS_CAP) +#define DEPOT_VALID_BITS 1 +#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */ +#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER)) +#define DEPOT_STACK_ALIGN 4 +#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) +#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_VALID_BITS - \ + DEPOT_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) +#define DEPOT_POOLS_CAP 8192 +#define DEPOT_MAX_POOLS \ + (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ + (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) /* The compact structure to store the reference to stacks. */ union handle_parts { depot_stack_handle_t handle; struct { - u32 pool_index : STACK_ALLOC_INDEX_BITS; - u32 offset : STACK_ALLOC_OFFSET_BITS; - u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; - u32 extra : STACK_DEPOT_EXTRA_BITS; + u32 pool_index : DEPOT_POOL_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 valid : DEPOT_VALID_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; @@ -91,7 +89,7 @@ static unsigned int stack_bucket_number_order; static unsigned int stack_hash_mask; /* Array of memory regions that store stack traces. */ -static void *stack_pools[STACK_ALLOC_MAX_POOLS]; +static void *stack_pools[DEPOT_MAX_POOLS]; /* Currently used pool in stack_pools. */ static int pool_index; /* Offset to the unused space in the currently used pool. */ @@ -235,7 +233,7 @@ static bool init_stack_pool(void **prealloc) *prealloc = NULL; } else { /* If this is the last depot pool, do not touch the next one. 
*/ - if (pool_index + 1 < STACK_ALLOC_MAX_POOLS) { + if (pool_index + 1 < DEPOT_MAX_POOLS) { stack_pools[pool_index + 1] = *prealloc; *prealloc = NULL; } @@ -255,10 +253,10 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) struct stack_record *stack; size_t required_size = struct_size(stack, entries, size); - required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); + required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN); - if (unlikely(pool_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(pool_index + 1 >= STACK_ALLOC_MAX_POOLS)) { + if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) { + if (unlikely(pool_index + 1 >= DEPOT_MAX_POOLS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); return NULL; } @@ -269,7 +267,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) * |next_pool_inited| in stack_depot_save() and * init_stack_pool(). */ - if (pool_index + 1 < STACK_ALLOC_MAX_POOLS) + if (pool_index + 1 < DEPOT_MAX_POOLS) smp_store_release(&next_pool_inited, 0); } init_stack_pool(prealloc); @@ -281,7 +279,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->hash = hash; stack->size = size; stack->handle.pool_index = pool_index; - stack->handle.offset = pool_offset >> STACK_ALLOC_ALIGN; + stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; stack->handle.valid = 1; stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); @@ -412,7 +410,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, alloc_flags &= ~GFP_ZONEMASK; alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); alloc_flags |= __GFP_NOWARN; - page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); + page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER); if (page) prealloc = page_address(page); } @@ -444,7 +442,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, exit: if (prealloc) { /* Nobody used this memory, ok to free it. */ - free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); + free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); } if (found) retval.handle = found->handle.handle; @@ -489,7 +487,7 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, { union handle_parts parts = { .handle = handle }; void *pool; - size_t offset = parts.offset << STACK_ALLOC_ALIGN; + size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; *entries = NULL; From cb788e84a4cfe47941cded45b5ca81a917fbb1a6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:15:59 +0100 Subject: [PATCH 472/505] lib/stackdepot: rename init_stack_pool Rename init_stack_pool to depot_init_pool to align the name with depot_alloc_stack. No functional changes. 
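The comments these renames keep touching describe an acquire/release handshake on next_pool_inited between depot_init_pool()/depot_alloc_stack() and stack_depot_save(). The fragment below is a minimal sketch of that pattern only, with hypothetical names and kernel context assumed (smp_store_release()/smp_load_acquire() from <asm/barrier.h>); it is not the stack depot code itself:

/* Hypothetical stand-ins for next_pool_inited and the preallocated pool. */
static void *next_buf;
static int next_buf_ready;

/* Writer: publish the payload, then set the flag with release semantics. */
static void publish_buf(void *buf)
{
	next_buf = buf;
	smp_store_release(&next_buf_ready, 1);	/* pairs with the acquire below */
}

/*
 * Reader: if the acquire load observes the flag as set, the payload store
 * above is guaranteed to be visible as well, so next_buf can be used safely.
 */
static void *try_consume_buf(void)
{
	if (!smp_load_acquire(&next_buf_ready))
		return NULL;
	return next_buf;
}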
Link: https://lkml.kernel.org/r/23106a3e291d8df0aba33c0e2fe86dc596286479.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 97bba462ee13..7f5f08bb6c3a 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -218,7 +218,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); -static bool init_stack_pool(void **prealloc) +static bool depot_init_pool(void **prealloc) { if (!*prealloc) return false; @@ -265,12 +265,12 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) /* * smp_store_release() here pairs with smp_load_acquire() from * |next_pool_inited| in stack_depot_save() and - * init_stack_pool(). + * depot_init_pool(). */ if (pool_index + 1 < DEPOT_MAX_POOLS) smp_store_release(&next_pool_inited, 0); } - init_stack_pool(prealloc); + depot_init_pool(prealloc); if (stack_pools[pool_index] == NULL) return NULL; @@ -399,7 +399,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * lock. * * The smp_load_acquire() here pairs with smp_store_release() to - * |next_pool_inited| in depot_alloc_stack() and init_stack_pool(). + * |next_pool_inited| in depot_alloc_stack() and depot_init_pool(). */ if (unlikely(can_alloc && !smp_load_acquire(&next_pool_inited))) { /* @@ -435,7 +435,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * We didn't need to store this stack trace, but let's keep * the preallocated memory for the future. */ - WARN_ON(!init_stack_pool(&prealloc)); + WARN_ON(!depot_init_pool(&prealloc)); } raw_spin_unlock_irqrestore(&pool_lock, flags); From 514d5c557b8b590a80f0569af5ae5f4d455ecef2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:00 +0100 Subject: [PATCH 473/505] lib/stacktrace: drop impossible WARN_ON for depot_init_pool depot_init_pool has two call sites: 1. In depot_alloc_stack with a potentially NULL prealloc. 2. In __stack_depot_save with a non-NULL prealloc. At the same time depot_init_pool can only return false when prealloc is NULL. As the second call site makes sure that prealloc is not NULL, the WARN_ON there can never trigger. Thus, drop the WARN_ON and also move the prealloc check from depot_init_pool to its first call site. Also change the return type of depot_init_pool to void as it now always returns true. Link: https://lkml.kernel.org/r/ce149f9bdcbc80a92549b54da67eafb27f846b7b.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 7f5f08bb6c3a..d4d988276b91 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -218,16 +218,14 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); -static bool depot_init_pool(void **prealloc) +static void depot_init_pool(void **prealloc) { - if (!*prealloc) - return false; /* * This smp_load_acquire() pairs with smp_store_release() to * |next_pool_inited| below and in depot_alloc_stack(). 
*/ if (smp_load_acquire(&next_pool_inited)) - return true; + return; if (stack_pools[pool_index] == NULL) { stack_pools[pool_index] = *prealloc; *prealloc = NULL; @@ -243,7 +241,6 @@ static bool depot_init_pool(void **prealloc) */ smp_store_release(&next_pool_inited, 1); } - return true; } /* Allocation of a new stack in raw storage */ @@ -270,7 +267,8 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) if (pool_index + 1 < DEPOT_MAX_POOLS) smp_store_release(&next_pool_inited, 0); } - depot_init_pool(prealloc); + if (*prealloc) + depot_init_pool(prealloc); if (stack_pools[pool_index] == NULL) return NULL; @@ -435,7 +433,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * We didn't need to store this stack trace, but let's keep * the preallocated memory for the future. */ - WARN_ON(!depot_init_pool(&prealloc)); + depot_init_pool(&prealloc); } raw_spin_unlock_irqrestore(&pool_lock, flags); From cd0fc64e76844758b78d0fd376ae3ca4fd802049 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:01 +0100 Subject: [PATCH 474/505] lib/stackdepot: annotate depot_init_pool and depot_alloc_stack Clean up the exisiting comments and add new ones to depot_init_pool and depot_alloc_stack. As a part of the clean-up, remove mentions of which variable is accessed by smp_store_release and smp_load_acquire: it is clear as is from the code. Link: https://lkml.kernel.org/r/f80b02951364e6b40deda965b4003de0cd1a532d.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index d4d988276b91..c4bc198c3d93 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -218,32 +218,39 @@ out_unlock: } EXPORT_SYMBOL_GPL(stack_depot_init); +/* Uses preallocated memory to initialize a new stack depot pool. */ static void depot_init_pool(void **prealloc) { /* - * This smp_load_acquire() pairs with smp_store_release() to - * |next_pool_inited| below and in depot_alloc_stack(). + * smp_load_acquire() here pairs with smp_store_release() below and + * in depot_alloc_stack(). */ if (smp_load_acquire(&next_pool_inited)) return; + + /* Check if the current pool is not yet allocated. */ if (stack_pools[pool_index] == NULL) { + /* Use the preallocated memory for the current pool. */ stack_pools[pool_index] = *prealloc; *prealloc = NULL; } else { - /* If this is the last depot pool, do not touch the next one. */ + /* + * Otherwise, use the preallocated memory for the next pool + * as long as we do not exceed the maximum number of pools. + */ if (pool_index + 1 < DEPOT_MAX_POOLS) { stack_pools[pool_index + 1] = *prealloc; *prealloc = NULL; } /* - * This smp_store_release pairs with smp_load_acquire() from - * |next_pool_inited| above and in stack_depot_save(). + * This smp_store_release pairs with smp_load_acquire() above + * and in stack_depot_save(). */ smp_store_release(&next_pool_inited, 1); } } -/* Allocation of a new stack in raw storage */ +/* Allocates a new stack in a stack depot pool. */ static struct stack_record * depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { @@ -252,28 +259,35 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN); + /* Check if there is not enough space in the current pool. 
*/ if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) { + /* Bail out if we reached the pool limit. */ if (unlikely(pool_index + 1 >= DEPOT_MAX_POOLS)) { WARN_ONCE(1, "Stack depot reached limit capacity"); return NULL; } + + /* Move on to the next pool. */ pool_index++; pool_offset = 0; /* - * smp_store_release() here pairs with smp_load_acquire() from - * |next_pool_inited| in stack_depot_save() and - * depot_init_pool(). + * smp_store_release() here pairs with smp_load_acquire() in + * stack_depot_save() and depot_init_pool(). */ if (pool_index + 1 < DEPOT_MAX_POOLS) smp_store_release(&next_pool_inited, 0); } + + /* Assign the preallocated memory to a pool if required. */ if (*prealloc) depot_init_pool(prealloc); + + /* Check if we have a pool to save the stack trace. */ if (stack_pools[pool_index] == NULL) return NULL; + /* Save the stack trace. */ stack = stack_pools[pool_index] + pool_offset; - stack->hash = hash; stack->size = size; stack->handle.pool_index = pool_index; From d11a5621f3252120dfc7cef7600a90bd8e605caf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:02 +0100 Subject: [PATCH 475/505] lib/stackdepot: rename next_pool_inited to next_pool_required Stack depot uses next_pool_inited to mark that either the next pool is initialized or the limit on the number of pools is reached. However, the flag name only reflects the former part of its purpose, which is confusing. Rename next_pool_inited to next_pool_required and invert its value. Also annotate usages of next_pool_required with comments. Link: https://lkml.kernel.org/r/484fd2695dff7a9bdc437a32f8a6ee228535aa02.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index c4bc198c3d93..4df162a84bfe 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -96,8 +96,14 @@ static int pool_index; static size_t pool_offset; /* Lock that protects the variables above. */ static DEFINE_RAW_SPINLOCK(pool_lock); -/* Whether the next pool is initialized. */ -static int next_pool_inited; +/* + * Stack depot tries to keep an extra pool allocated even before it runs out + * of space in the currently used pool. + * This flag marks that this next extra pool needs to be allocated and + * initialized. It has the value 0 when either the next pool is not yet + * initialized or the limit on the number of pools is reached. + */ +static int next_pool_required = 1; static int __init disable_stack_depot(char *str) { @@ -222,10 +228,12 @@ EXPORT_SYMBOL_GPL(stack_depot_init); static void depot_init_pool(void **prealloc) { /* + * If the next pool is already initialized or the maximum number of + * pools is reached, do not use the preallocated memory. * smp_load_acquire() here pairs with smp_store_release() below and * in depot_alloc_stack(). */ - if (smp_load_acquire(&next_pool_inited)) + if (!smp_load_acquire(&next_pool_required)) return; /* Check if the current pool is not yet allocated. */ @@ -243,10 +251,13 @@ static void depot_init_pool(void **prealloc) *prealloc = NULL; } /* + * At this point, either the next pool is initialized or the + * maximum number of pools is reached. In either case, take + * note that initializing another pool is not required. * This smp_store_release pairs with smp_load_acquire() above * and in stack_depot_save(). 
*/ - smp_store_release(&next_pool_inited, 1); + smp_store_release(&next_pool_required, 0); } } @@ -271,11 +282,13 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) pool_index++; pool_offset = 0; /* + * If the maximum number of pools is not reached, take note + * that the next pool needs to initialized. * smp_store_release() here pairs with smp_load_acquire() in * stack_depot_save() and depot_init_pool(). */ if (pool_index + 1 < DEPOT_MAX_POOLS) - smp_store_release(&next_pool_inited, 0); + smp_store_release(&next_pool_required, 1); } /* Assign the preallocated memory to a pool if required. */ @@ -406,14 +419,13 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, goto exit; /* - * Check if the current or the next stack pool need to be initialized. - * If so, allocate the memory - we won't be able to do that under the - * lock. + * Check if another stack pool needs to be initialized. If so, allocate + * the memory now - we won't be able to do that under the lock. * * The smp_load_acquire() here pairs with smp_store_release() to * |next_pool_inited| in depot_alloc_stack() and depot_init_pool(). */ - if (unlikely(can_alloc && !smp_load_acquire(&next_pool_inited))) { + if (unlikely(can_alloc && smp_load_acquire(&next_pool_required))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic From 36aa1e6779c3c6f8e0d4552544214f5cffe3c287 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:03 +0100 Subject: [PATCH 476/505] lib/stacktrace, kasan, kmsan: rework extra_bits interface The current implementation of the extra_bits interface is confusing: passing extra_bits to __stack_depot_save makes it seem that the extra bits are somehow stored in stack depot. In reality, they are only embedded into a stack depot handle and are not used within stack depot. Drop the extra_bits argument from __stack_depot_save and instead provide a new stack_depot_set_extra_bits function (similar to the exsiting stack_depot_get_extra_bits) that saves extra bits into a stack depot handle. Update the callers of __stack_depot_save to use the new interace. This change also fixes a minor issue in the old code: __stack_depot_save does not return NULL if saving stack trace fails and extra_bits is used. 
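A short sketch of how a caller might use the reworked interface: the extra bits are now folded into the handle after a successful save instead of being passed to __stack_depot_save(). The helpers below are hypothetical caller code (kernel context assumed); the stack depot and stacktrace calls themselves are the ones introduced or kept by this patch:

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

#define MY_TAG 0x3	/* hypothetical tag, must fit into STACK_DEPOT_EXTRA_BITS */

static depot_stack_handle_t save_tagged_stack(gfp_t flags)
{
	unsigned long entries[16];
	unsigned int nr_entries;
	depot_stack_handle_t handle;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		return 0;	/* saving failed; do not tag an empty handle */

	return stack_depot_set_extra_bits(handle, MY_TAG);
}

static void report_tagged_stack(depot_stack_handle_t handle)
{
	unsigned long *entries;
	unsigned int nr = stack_depot_fetch(handle, &entries);
	unsigned int tag = stack_depot_get_extra_bits(handle);

	pr_info("tag=%u, %u frames\n", tag, nr);
	stack_trace_print(entries, nr, 0);
}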
Link: https://lkml.kernel.org/r/317123b5c05e2f82854fc55d8b285e0869d3cb77.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 4 +++- lib/stackdepot.c | 42 ++++++++++++++++++++++++++++++-------- mm/kasan/common.c | 2 +- mm/kmsan/core.c | 10 ++++++--- 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index c4e3abc16b16..267f4b2634ee 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -57,7 +57,6 @@ static inline int stack_depot_early_init(void) { return 0; } depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, - unsigned int extra_bits, gfp_t gfp_flags, bool can_alloc); depot_stack_handle_t stack_depot_save(unsigned long *entries, @@ -71,6 +70,9 @@ void stack_depot_print(depot_stack_handle_t stack); int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); +depot_stack_handle_t __must_check stack_depot_set_extra_bits( + depot_stack_handle_t handle, unsigned int extra_bits); + unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4df162a84bfe..8c6e4e9cb535 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -357,7 +357,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, * * @entries: Pointer to storage array * @nr_entries: Size of the storage array - * @extra_bits: Flags to store in unused bits of depot_stack_handle_t * @alloc_flags: Allocation gfp flags * @can_alloc: Allocate stack pools (increased chance of failure if false) * @@ -369,10 +368,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, * If the stack trace in @entries is from an interrupt, only the portion up to * interrupt entry is saved. * - * Additional opaque flags can be passed in @extra_bits, stored in the unused - * bits of the stack handle, and retrieved using stack_depot_get_extra_bits() - * without calling stack_depot_fetch(). - * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. 
Currently * this is the case from contexts where neither %GFP_ATOMIC nor @@ -382,7 +377,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, - unsigned int extra_bits, gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; @@ -471,8 +465,6 @@ exit: if (found) retval.handle = found->handle.handle; fast_exit: - retval.extra = extra_bits; - return retval.handle; } EXPORT_SYMBOL_GPL(__stack_depot_save); @@ -493,7 +485,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); + return __stack_depot_save(entries, nr_entries, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); @@ -576,6 +568,38 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, } EXPORT_SYMBOL_GPL(stack_depot_snprint); +/** + * stack_depot_set_extra_bits - Set extra bits in a stack depot handle + * + * @handle: Stack depot handle returned from stack_depot_save() + * @extra_bits: Value to set the extra bits + * + * Return: Stack depot handle with extra bits set + * + * Stack depot handles have a few unused bits, which can be used for storing + * user-specific information. These bits are transparent to the stack depot. + */ +depot_stack_handle_t __must_check stack_depot_set_extra_bits( + depot_stack_handle_t handle, unsigned int extra_bits) +{ + union handle_parts parts = { .handle = handle }; + + /* Don't set extra bits on empty handles. */ + if (!handle) + return 0; + + parts.extra = extra_bits; + return parts.handle; +} +EXPORT_SYMBOL(stack_depot_set_extra_bits); + +/** + * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle + * + * @handle: Stack depot handle with extra bits saved + * + * Return: Extra bits retrieved from the stack depot handle + */ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) { union handle_parts parts = { .handle = handle }; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 1e7336ae3786..b376a5d055e5 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -43,7 +43,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc); + return __stack_depot_save(entries, nr_entries, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 112dce135c7f..f710257d6867 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -69,13 +69,15 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, { unsigned long entries[KMSAN_STACK_DEPTH]; unsigned int nr_entries; + depot_stack_handle_t handle; nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */ flags &= ~__GFP_DIRECT_RECLAIM; - return __stack_depot_save(entries, nr_entries, extra, flags, true); + handle = __stack_depot_save(entries, nr_entries, flags, true); + return stack_depot_set_extra_bits(handle, extra); } /* Copy the metadata following the memmove() behavior. 
*/ @@ -215,6 +217,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) u32 extra_bits; int depth; bool uaf; + depot_stack_handle_t handle; if (!id) return id; @@ -250,8 +253,9 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) * positives when __stack_depot_save() passes it to instrumented code. */ kmsan_internal_unpoison_memory(entries, sizeof(entries), false); - return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits, - GFP_ATOMIC, true); + handle = __stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC, + true); + return stack_depot_set_extra_bits(handle, extra_bits); } void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, From beb3c23c69a91a10f247e93ffef1fcd0209d93e4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:04 +0100 Subject: [PATCH 477/505] lib/stackdepot: annotate racy pool_index accesses Accesses to pool_index are protected by pool_lock everywhere except in a sanity check in stack_depot_fetch. The read access there can race with the write access in depot_alloc_stack. Use WRITE/READ_ONCE() to annotate the racy accesses. As the sanity check is only used to print a warning in case of a violation of the stack depot interface usage, it does not make a lot of sense to use proper synchronization. [andreyknvl@google.com: s/pool_index/pool_index_cached/ in stack_depot_fetch()] Link: https://lkml.kernel.org/r/95cf53f0da2c112aa2cc54456cbcd6975c3ff343.1676129911.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/359ac9c13cd0869c56740fb2029f505e41593830.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 8c6e4e9cb535..b625090890e1 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -278,8 +278,12 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return NULL; } - /* Move on to the next pool. */ - pool_index++; + /* + * Move on to the next pool. + * WRITE_ONCE pairs with potential concurrent read in + * stack_depot_fetch(). + */ + WRITE_ONCE(pool_index, pool_index + 1); pool_offset = 0; /* * If the maximum number of pools is not reached, take note @@ -502,6 +506,11 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { union handle_parts parts = { .handle = handle }; + /* + * READ_ONCE pairs with potential concurrent write in + * depot_alloc_stack. + */ + int pool_index_cached = READ_ONCE(pool_index); void *pool; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; @@ -510,9 +519,9 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, if (!handle) return 0; - if (parts.pool_index > pool_index) { + if (parts.pool_index > pool_index_cached) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", - parts.pool_index, pool_index, handle); + parts.pool_index, pool_index_cached, handle); return 0; } pool = stack_pools[parts.pool_index]; From b232b9995a6dbaafe19d07d81acc039bc84bd569 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:05 +0100 Subject: [PATCH 478/505] lib/stackdepot: various comments clean-ups Clean up comments in include/linux/stackdepot.h and lib/stackdepot.c: 1. Rework the initialization comment in stackdepot.h. 2. Rework the header comment in stackdepot.c. 3. Various clean-ups for other comments. 
Also adjust whitespaces for find_stack and depot_alloc_stack call sites. No functional changes. Link: https://lkml.kernel.org/r/5836231b7954355e2311fc9b5870f697ea8e1f7d.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 36 +++++------ lib/stackdepot.c | 120 ++++++++++++++++++------------------- 2 files changed, 78 insertions(+), 78 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 267f4b2634ee..afdf8ee7b597 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -1,11 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * A generic stack depot implementation + * Stack depot - a stack trace storage that avoids duplication. * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * - * Based on code by Dmitry Chernenkov. + * Based on the code by Dmitry Chernenkov. */ #ifndef _LINUX_STACKDEPOT_H @@ -17,35 +17,37 @@ typedef u32 depot_stack_handle_t; /* * Number of bits in the handle that stack depot doesn't use. Users may store - * information in them. + * information in them via stack_depot_set/get_extra_bits. */ #define STACK_DEPOT_EXTRA_BITS 5 /* - * Every user of stack depot has to call stack_depot_init() during its own init - * when it's decided that it will be calling stack_depot_save() later. This is - * recommended for e.g. modules initialized later in the boot process, when - * slab_is_available() is true. + * Using stack depot requires its initialization, which can be done in 3 ways: * - * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot - * enabled as part of mm_init(), for subsystems where it's known at compile time - * that stack depot will be used. + * 1. Selecting CONFIG_STACKDEPOT_ALWAYS_INIT. This option is suitable in + * scenarios where it's known at compile time that stack depot will be used. + * Enabling this config makes the kernel initialize stack depot in mm_init(). * - * Another alternative is to call stack_depot_request_early_init(), when the - * decision to use stack depot is taken e.g. when evaluating kernel boot - * parameters, which precedes the enablement point in mm_init(). + * 2. Calling stack_depot_request_early_init() during early boot, before + * stack_depot_early_init() in mm_init() completes. For example, this can + * be done when evaluating kernel boot parameters. + * + * 3. Calling stack_depot_init(). Possible after boot is complete. This option + * is recommended for modules initialized later in the boot process, after + * mm_init() completes. * * stack_depot_init() and stack_depot_request_early_init() can be called - * regardless of CONFIG_STACKDEPOT and are no-op when disabled. The actual - * save/fetch/print functions should only be called from code that makes sure - * CONFIG_STACKDEPOT is enabled. + * regardless of whether CONFIG_STACKDEPOT is enabled and are no-op when this + * config is disabled. The save/fetch/print stack depot functions can only be + * called from the code that makes sure CONFIG_STACKDEPOT is enabled _and_ + * initializes stack depot via one of the ways listed above. */ #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); void __init stack_depot_request_early_init(void); -/* This is supposed to be called only from mm_init() */ +/* Must be only called from mm_init(). 
*/ int __init stack_depot_early_init(void); #else static inline int stack_depot_init(void) { return 0; } diff --git a/lib/stackdepot.c b/lib/stackdepot.c index b625090890e1..aaa5d2f1abc2 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -1,22 +1,26 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Generic stack depot for storing stack traces. + * Stack depot - a stack trace storage that avoids duplication. * - * Some debugging tools need to save stack traces of certain events which can - * be later presented to the user. For example, KASAN needs to safe alloc and - * free stacks for each object, but storing two stack traces per object - * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for - * that). + * Stack depot is intended to be used by subsystems that need to store and + * later retrieve many potentially duplicated stack traces without wasting + * memory. * - * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc - * and free stacks repeat a lot, we save about 100x space. - * Stacks are never removed from depot, so we store them contiguously one after - * another in a contiguous memory allocation. + * For example, KASAN needs to save allocation and free stack traces for each + * object. Storing two stack traces per object requires a lot of memory (e.g. + * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free + * stack traces often repeat, using stack depot allows to save about 100x space. + * + * Internally, stack depot maintains a hash table of unique stacktraces. The + * stack traces themselves are stored contiguously one after another in a set + * of separate page allocations. + * + * Stack traces are never removed from stack depot. * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * - * Based on code by Dmitry Chernenkov. + * Based on the code by Dmitry Chernenkov. */ #define pr_fmt(fmt) "stackdepot: " fmt @@ -50,7 +54,7 @@ (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) -/* The compact structure to store the reference to stacks. */ +/* Compact structure that stores a reference to a stack. */ union handle_parts { depot_stack_handle_t handle; struct { @@ -62,11 +66,11 @@ union handle_parts { }; struct stack_record { - struct stack_record *next; /* Link in the hashtable */ - u32 hash; /* Hash in the hastable */ - u32 size; /* Number of frames in the stack */ + struct stack_record *next; /* Link in the hash table */ + u32 hash; /* Hash in the hash table */ + u32 size; /* Number of stored frames */ union handle_parts handle; - unsigned long entries[]; /* Variable-sized array of entries. */ + unsigned long entries[]; /* Variable-sized array of frames */ }; static bool stack_depot_disabled; @@ -317,7 +321,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) return stack; } -/* Calculate hash for a stack */ +/* Calculates the hash for a stack. */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { return jhash2((u32 *)entries, @@ -325,9 +329,9 @@ static inline u32 hash_stack(unsigned long *entries, unsigned int size) STACK_HASH_SEED); } -/* Use our own, non-instrumented version of memcmp(). - * - * We actually don't care about the order, just the equality. +/* + * Non-instrumented version of memcmp(). + * Does not check the lexicographical order, only the equality. 
*/ static inline int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, @@ -340,7 +344,7 @@ int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, return 0; } -/* Find a stack that is equal to the one stored in entries in the hash */ +/* Finds a stack in a bucket of the hash table. */ static inline struct stack_record *find_stack(struct stack_record *bucket, unsigned long *entries, int size, u32 hash) @@ -357,27 +361,27 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, } /** - * __stack_depot_save - Save a stack trace from an array + * __stack_depot_save - Save a stack trace to stack depot * - * @entries: Pointer to storage array - * @nr_entries: Size of the storage array - * @alloc_flags: Allocation gfp flags + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags * @can_alloc: Allocate stack pools (increased chance of failure if false) * * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, is allowed to replenish the stack pool in case no space is left + * %true, stack depot can replenish the stack pools in case no space is left * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and will fail if no space is left to store the stack trace. + * any allocations and fails if no space is left to store the stack trace. * - * If the stack trace in @entries is from an interrupt, only the portion up to - * interrupt entry is saved. + * If the provided stack trace comes from the interrupt context, only the part + * up to the interrupt entry is saved. * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently - * this is the case from contexts where neither %GFP_ATOMIC nor + * this is the case for contexts where neither %GFP_ATOMIC nor * %GFP_NOWAIT can be used (NMI, raw_spin_lock). * - * Return: The handle of the stack struct stored in depot, 0 on failure. + * Return: Handle of the stack struct stored in depot, 0 on failure */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, @@ -392,11 +396,11 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, /* * If this stack trace is from an interrupt, including anything before - * interrupt entry usually leads to unbounded stackdepot growth. + * interrupt entry usually leads to unbounded stack depot growth. * - * Because use of filter_irq_stacks() is a requirement to ensure - * stackdepot can efficiently deduplicate interrupt stacks, always - * filter_irq_stacks() to simplify all callers' use of stackdepot. + * Since use of filter_irq_stacks() is a requirement to ensure stack + * depot can efficiently deduplicate interrupt stacks, always + * filter_irq_stacks() to simplify all callers' use of stack depot. */ nr_entries = filter_irq_stacks(entries, nr_entries); @@ -411,8 +415,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * The smp_load_acquire() here pairs with smp_store_release() to * |bucket| below. 
*/ - found = find_stack(smp_load_acquire(bucket), entries, - nr_entries, hash); + found = find_stack(smp_load_acquire(bucket), entries, nr_entries, hash); if (found) goto exit; @@ -441,7 +444,8 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { - struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc); + struct stack_record *new = + depot_alloc_stack(entries, nr_entries, hash, &prealloc); if (new) { new->next = *bucket; @@ -454,8 +458,8 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, } } else if (prealloc) { /* - * We didn't need to store this stack trace, but let's keep - * the preallocated memory for the future. + * Stack depot already contains this stack trace, but let's + * keep the preallocated memory for the future. */ depot_init_pool(&prealloc); } @@ -463,7 +467,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, raw_spin_unlock_irqrestore(&pool_lock, flags); exit: if (prealloc) { - /* Nobody used this memory, ok to free it. */ + /* Stack depot didn't use this memory, free it. */ free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); } if (found) @@ -474,16 +478,16 @@ fast_exit: EXPORT_SYMBOL_GPL(__stack_depot_save); /** - * stack_depot_save - Save a stack trace from an array + * stack_depot_save - Save a stack trace to stack depot * - * @entries: Pointer to storage array - * @nr_entries: Size of the storage array - * @alloc_flags: Allocation gfp flags + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags * * Context: Contexts where allocations via alloc_pages() are allowed. * See __stack_depot_save() for more details. * - * Return: The handle of the stack struct stored in depot, 0 on failure. + * Return: Handle of the stack trace stored in depot, 0 on failure */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, @@ -494,13 +498,12 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, EXPORT_SYMBOL_GPL(stack_depot_save); /** - * stack_depot_fetch - Fetch stack entries from a depot + * stack_depot_fetch - Fetch a stack trace from stack depot * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @entries: Pointer to store the entries address + * @handle: Stack depot handle returned from stack_depot_save() + * @entries: Pointer to store the address of the stack trace * - * Return: The number of trace entries for this depot. + * Return: Number of frames for the fetched stack */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) @@ -535,11 +538,9 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, EXPORT_SYMBOL_GPL(stack_depot_fetch); /** - * stack_depot_print - print stack entries from a depot - * - * @stack: Stack depot handle which was returned from - * stack_depot_save(). + * stack_depot_print - Print a stack trace from stack depot * + * @stack: Stack depot handle returned from stack_depot_save() */ void stack_depot_print(depot_stack_handle_t stack) { @@ -553,17 +554,14 @@ void stack_depot_print(depot_stack_handle_t stack) EXPORT_SYMBOL_GPL(stack_depot_print); /** - * stack_depot_snprint - print stack entries from a depot into a buffer + * stack_depot_snprint - Print a stack trace from stack depot into a buffer * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). 
+ * @handle: Stack depot handle returned from stack_depot_save() * @buf: Pointer to the print buffer - * * @size: Size of the print buffer - * * @spaces: Number of leading spaces to print * - * Return: Number of bytes printed. + * Return: Number of bytes printed */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces) From 0621d160f1003a8aedd3628133568ecffdd724f7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 10 Feb 2023 22:16:06 +0100 Subject: [PATCH 479/505] lib/stackdepot: move documentation comments to stackdepot.h Move all interface- and usage-related documentation comments to include/linux/stackdepot.h. It makes sense to have them in the header where they are available to the interface users. [akpm@linux-foundation.org: grammar fix, per Alexander] Link: https://lkml.kernel.org/r/fbfee41495b306dd8881f9b1c1b80999c885e82f.1676063693.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 87 ++++++++++++++++++++++++++++++++++++++ lib/stackdepot.c | 87 -------------------------------------- 2 files changed, 87 insertions(+), 87 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index afdf8ee7b597..e58306783d8e 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -2,6 +2,17 @@ /* * Stack depot - a stack trace storage that avoids duplication. * + * Stack depot is intended to be used by subsystems that need to store and + * later retrieve many potentially duplicated stack traces without wasting + * memory. + * + * For example, KASAN needs to save allocation and free stack traces for each + * object. Storing two stack traces per object requires a lot of memory (e.g. + * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free + * stack traces often repeat, using stack depot allows to save about 100x space. + * + * Stack traces are never removed from the stack depot. + * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * @@ -57,24 +68,100 @@ static inline void stack_depot_request_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif +/** + * __stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * @can_alloc: Allocate stack pools (increased chance of failure if false) + * + * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is + * %true, stack depot can replenish the stack pools in case no space is left + * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids + * any allocations and fails if no space is left to store the stack trace. + * + * If the provided stack trace comes from the interrupt context, only the part + * up to the interrupt entry is saved. + * + * Context: Any context, but setting @can_alloc to %false is required if + * alloc_pages() cannot be used from the current context. Currently + * this is the case for contexts where neither %GFP_ATOMIC nor + * %GFP_NOWAIT can be used (NMI, raw_spin_lock). 
+ * + * Return: Handle of the stack struct stored in depot, 0 on failure + */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags, bool can_alloc); +/** + * stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * + * Context: Contexts where allocations via alloc_pages() are allowed. + * See __stack_depot_save() for more details. + * + * Return: Handle of the stack trace stored in depot, 0 on failure + */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); +/** + * stack_depot_fetch - Fetch a stack trace from stack depot + * + * @handle: Stack depot handle returned from stack_depot_save() + * @entries: Pointer to store the address of the stack trace + * + * Return: Number of frames for the fetched stack + */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +/** + * stack_depot_print - Print a stack trace from stack depot + * + * @stack: Stack depot handle returned from stack_depot_save() + */ void stack_depot_print(depot_stack_handle_t stack); +/** + * stack_depot_snprint - Print a stack trace from stack depot into a buffer + * + * @handle: Stack depot handle returned from stack_depot_save() + * @buf: Pointer to the print buffer + * @size: Size of the print buffer + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed + */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); +/** + * stack_depot_set_extra_bits - Set extra bits in a stack depot handle + * + * @handle: Stack depot handle returned from stack_depot_save() + * @extra_bits: Value to set the extra bits + * + * Return: Stack depot handle with extra bits set + * + * Stack depot handles have a few unused bits, which can be used for storing + * user-specific information. These bits are transparent to the stack depot. + */ depot_stack_handle_t __must_check stack_depot_set_extra_bits( depot_stack_handle_t handle, unsigned int extra_bits); +/** + * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle + * + * @handle: Stack depot handle with extra bits saved + * + * Return: Extra bits retrieved from the stack depot handle + */ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index aaa5d2f1abc2..036da8e295d1 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -2,21 +2,10 @@ /* * Stack depot - a stack trace storage that avoids duplication. * - * Stack depot is intended to be used by subsystems that need to store and - * later retrieve many potentially duplicated stack traces without wasting - * memory. - * - * For example, KASAN needs to save allocation and free stack traces for each - * object. Storing two stack traces per object requires a lot of memory (e.g. - * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free - * stack traces often repeat, using stack depot allows to save about 100x space. - * * Internally, stack depot maintains a hash table of unique stacktraces. The * stack traces themselves are stored contiguously one after another in a set * of separate page allocations. * - * Stack traces are never removed from stack depot. - * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. 
* @@ -360,29 +349,6 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, return NULL; } -/** - * __stack_depot_save - Save a stack trace to stack depot - * - * @entries: Pointer to the stack trace - * @nr_entries: Number of frames in the stack - * @alloc_flags: Allocation GFP flags - * @can_alloc: Allocate stack pools (increased chance of failure if false) - * - * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, stack depot can replenish the stack pools in case no space is left - * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and fails if no space is left to store the stack trace. - * - * If the provided stack trace comes from the interrupt context, only the part - * up to the interrupt entry is saved. - * - * Context: Any context, but setting @can_alloc to %false is required if - * alloc_pages() cannot be used from the current context. Currently - * this is the case for contexts where neither %GFP_ATOMIC nor - * %GFP_NOWAIT can be used (NMI, raw_spin_lock). - * - * Return: Handle of the stack struct stored in depot, 0 on failure - */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags, bool can_alloc) @@ -477,18 +443,6 @@ fast_exit: } EXPORT_SYMBOL_GPL(__stack_depot_save); -/** - * stack_depot_save - Save a stack trace to stack depot - * - * @entries: Pointer to the stack trace - * @nr_entries: Number of frames in the stack - * @alloc_flags: Allocation GFP flags - * - * Context: Contexts where allocations via alloc_pages() are allowed. - * See __stack_depot_save() for more details. - * - * Return: Handle of the stack trace stored in depot, 0 on failure - */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) @@ -497,14 +451,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_depot_save); -/** - * stack_depot_fetch - Fetch a stack trace from stack depot - * - * @handle: Stack depot handle returned from stack_depot_save() - * @entries: Pointer to store the address of the stack trace - * - * Return: Number of frames for the fetched stack - */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { @@ -537,11 +483,6 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, } EXPORT_SYMBOL_GPL(stack_depot_fetch); -/** - * stack_depot_print - Print a stack trace from stack depot - * - * @stack: Stack depot handle returned from stack_depot_save() - */ void stack_depot_print(depot_stack_handle_t stack) { unsigned long *entries; @@ -553,16 +494,6 @@ void stack_depot_print(depot_stack_handle_t stack) } EXPORT_SYMBOL_GPL(stack_depot_print); -/** - * stack_depot_snprint - Print a stack trace from stack depot into a buffer - * - * @handle: Stack depot handle returned from stack_depot_save() - * @buf: Pointer to the print buffer - * @size: Size of the print buffer - * @spaces: Number of leading spaces to print - * - * Return: Number of bytes printed - */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces) { @@ -575,17 +506,6 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, } EXPORT_SYMBOL_GPL(stack_depot_snprint); -/** - * stack_depot_set_extra_bits - Set extra bits in a stack depot handle - * - * @handle: Stack depot handle returned from stack_depot_save() - * @extra_bits: Value to set the extra bits - * - * Return: Stack depot handle 
with extra bits set - * - * Stack depot handles have a few unused bits, which can be used for storing - * user-specific information. These bits are transparent to the stack depot. - */ depot_stack_handle_t __must_check stack_depot_set_extra_bits( depot_stack_handle_t handle, unsigned int extra_bits) { @@ -600,13 +520,6 @@ depot_stack_handle_t __must_check stack_depot_set_extra_bits( } EXPORT_SYMBOL(stack_depot_set_extra_bits); -/** - * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle - * - * @handle: Stack depot handle with extra bits saved - * - * Return: Extra bits retrieved from the stack depot handle - */ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) { union handle_parts parts = { .handle = handle }; From 5b855937096aea7f81e73ad6d40d433c9dd49577 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:36 +0800 Subject: [PATCH 480/505] migrate_pages: organize stats with struct migrate_pages_stats Patch series "migrate_pages(): batch TLB flushing", v5. Now, migrate_pages() migrates folios one by one, like the fake code as follows, for each folio unmap flush TLB copy restore map If multiple folios are passed to migrate_pages(), there are opportunities to batch the TLB flushing and copying. That is, we can change the code to something as follows, for each folio unmap for each folio flush TLB for each folio copy for each folio restore map The total number of TLB flushing IPI can be reduced considerably. And we may use some hardware accelerator such as DSA to accelerate the folio copying. So in this patch, we refactor the migrate_pages() implementation and implement the TLB flushing batching. Base on this, hardware accelerated folio copying can be implemented. If too many folios are passed to migrate_pages(), in the naive batched implementation, we may unmap too many folios at the same time. The possibility for a task to wait for the migrated folios to be mapped again increases. So the latency may be hurt. To deal with this issue, the max number of folios be unmapped in batch is restricted to no more than HPAGE_PMD_NR in the unit of page. That is, the influence is at the same level of THP migration. We use the following test to measure the performance impact of the patchset, On a 2-socket Intel server, - Run pmbench memory accessing benchmark - Run `migratepages` to migrate pages of pmbench between node 0 and node 1 back and forth. With the patch, the TLB flushing IPI reduces 99.1% during the test and the number of pages migrated successfully per second increases 291.7%. Xin Hao helped to test the patchset on an ARM64 server with 128 cores, 2 NUMA nodes. Test results show that the page migration performance increases up to 78%. This patch (of 9): Define struct migrate_pages_stats to organize the various statistics in migrate_pages(). This makes it easier to collect and consume the statistics in multiple functions. This will be needed in the following patches in the series. 
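To make the loop restructuring above concrete, here is a purely illustrative user-space analogy rather than kernel code: unmap/copy/restore are empty stand-ins, and flush_tlb() only counts how many flush operations would be issued. Coalescing a whole batch's deferred flushes into essentially one flush is the effect the series aims for, which is what the second variant models:

#include <stdio.h>

static int nr_flushes;

static void unmap(int folio)	{ (void)folio; }
static void copy(int folio)	{ (void)folio; }
static void restore(int folio)	{ (void)folio; }
static void flush_tlb(void)	{ nr_flushes++; }	/* stands in for TLB-flush IPIs */

/* Current behaviour: one flush per migrated folio. */
static void migrate_one_by_one(int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		unmap(i);
		flush_tlb();
		copy(i);
		restore(i);
	}
}

/* Batched behaviour: unmap the whole batch, flush once, then copy and restore. */
static void migrate_batched(int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		unmap(i);
	flush_tlb();
	for (i = 0; i < nr; i++)
		copy(i);
	for (i = 0; i < nr; i++)
		restore(i);
}

int main(void)
{
	nr_flushes = 0;
	migrate_one_by_one(512);
	printf("one by one: %d flushes\n", nr_flushes);	/* 512 */

	nr_flushes = 0;
	migrate_batched(512);
	printf("batched:    %d flushes\n", nr_flushes);	/* 1 */
	return 0;
}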
Link: https://lkml.kernel.org/r/20230213123444.155149-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20230213123444.155149-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Alistair Popple Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Xin Hao Cc: Yang Shi Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 60 +++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 5b40b9040ba6..1a9cfcf857d2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1414,6 +1414,16 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f return rc; } +struct migrate_pages_stats { + int nr_succeeded; /* Normal and large folios migrated successfully, in + units of base pages */ + int nr_failed_pages; /* Normal and large folios failed to be migrated, in + units of base pages. Untried folios aren't counted */ + int nr_thp_succeeded; /* THP migrated successfully */ + int nr_thp_failed; /* THP failed to be migrated */ + int nr_thp_split; /* THP split before migrating */ +}; + /* * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration @@ -1448,13 +1458,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int large_retry = 1; int thp_retry = 1; int nr_failed = 0; - int nr_failed_pages = 0; int nr_retry_pages = 0; - int nr_succeeded = 0; - int nr_thp_succeeded = 0; int nr_large_failed = 0; - int nr_thp_failed = 0; - int nr_thp_split = 0; int pass = 0; bool is_large = false; bool is_thp = false; @@ -1464,9 +1469,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, LIST_HEAD(split_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); bool no_split_folio_counting = false; + struct migrate_pages_stats stats; trace_mm_migrate_pages_start(mode, reason); + memset(&stats, 0, sizeof(stats)); split_folio_migration: for (pass = 0; pass < 10 && (retry || large_retry); pass++) { retry = 0; @@ -1520,9 +1527,9 @@ split_folio_migration: /* Large folio migration is unsupported */ if (is_large) { nr_large_failed++; - nr_thp_failed += is_thp; + stats.nr_thp_failed += is_thp; if (!try_split_folio(folio, &split_folios)) { - nr_thp_split += is_thp; + stats.nr_thp_split += is_thp; break; } /* Hugetlb migration is unsupported */ @@ -1530,7 +1537,7 @@ split_folio_migration: nr_failed++; } - nr_failed_pages += nr_pages; + stats.nr_failed_pages += nr_pages; list_move_tail(&folio->lru, &ret_folios); break; case -ENOMEM: @@ -1540,13 +1547,13 @@ split_folio_migration: */ if (is_large) { nr_large_failed++; - nr_thp_failed += is_thp; + stats.nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (!nosplit) { int ret = try_split_folio(folio, &split_folios); if (!ret) { - nr_thp_split += is_thp; + stats.nr_thp_split += is_thp; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { @@ -1564,7 +1571,7 @@ split_folio_migration: nr_failed++; } - nr_failed_pages += nr_pages + nr_retry_pages; + stats.nr_failed_pages += nr_pages + nr_retry_pages; /* * There might be some split folios of fail-to-migrate large * folios left in split_folios list. 
Move them back to migration @@ -1574,7 +1581,7 @@ split_folio_migration: list_splice_init(&split_folios, from); /* nr_failed isn't updated for not used */ nr_large_failed += large_retry; - nr_thp_failed += thp_retry; + stats.nr_thp_failed += thp_retry; goto out; case -EAGAIN: if (is_large) { @@ -1586,8 +1593,8 @@ split_folio_migration: nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: - nr_succeeded += nr_pages; - nr_thp_succeeded += is_thp; + stats.nr_succeeded += nr_pages; + stats.nr_thp_succeeded += is_thp; break; default: /* @@ -1598,20 +1605,20 @@ split_folio_migration: */ if (is_large) { nr_large_failed++; - nr_thp_failed += is_thp; + stats.nr_thp_failed += is_thp; } else if (!no_split_folio_counting) { nr_failed++; } - nr_failed_pages += nr_pages; + stats.nr_failed_pages += nr_pages; break; } } } nr_failed += retry; nr_large_failed += large_retry; - nr_thp_failed += thp_retry; - nr_failed_pages += nr_retry_pages; + stats.nr_thp_failed += thp_retry; + stats.nr_failed_pages += nr_retry_pages; /* * Try to migrate split folios of fail-to-migrate large folios, no * nr_failed counting in this round, since all split folios of a @@ -1644,16 +1651,17 @@ out: if (list_empty(from)) rc = 0; - count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); - count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); - count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); - count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); - trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, - nr_thp_failed, nr_thp_split, mode, reason); + count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); + count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); + count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded); + count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed); + count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); + trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, + stats.nr_thp_succeeded, stats.nr_thp_failed, + stats.nr_thp_split, mode, reason); if (ret_succeeded) - *ret_succeeded = nr_succeeded; + *ret_succeeded = stats.nr_succeeded; return rc; } From e5bfff8b10e496378da4b7863479dd6fb907d4ea Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:37 +0800 Subject: [PATCH 481/505] migrate_pages: separate hugetlb folios migration This is a preparation patch to batch the folio unmapping and moving for the non-hugetlb folios. Based on that we can batch the TLB shootdown during the folio migration and make it possible to use some hardware accelerator for the folio copying. In this patch the hugetlb folios and non-hugetlb folios migration is separated in migrate_pages() to make it easy to change the non-hugetlb folios migration implementation. 
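To make the retry convention of the new migrate_hugetlbs() helper easier to follow, here is a minimal userspace sketch of the same control flow, using hypothetical names and a plain array instead of the kernel's list_head: -EAGAIN results stay pending and are retried for a bounded number of passes, -ENOMEM aborts the whole run immediately, any other error marks the item as permanently failed, and the return value is the number of items not migrated or a negative errno. This only illustrates the described contract; it is not the kernel code.

#include <errno.h>
#include <stdio.h>

#define NR_MAX_RETRY 10

enum item_state { PENDING, DONE, FAILED };

/* Hypothetical per-item migration attempt used only for this sketch. */
static int try_migrate_one(int id, int pass)
{
        if (id == 3 && pass < 2)
                return -EAGAIN;         /* transient: succeeds on a later pass */
        if (id == 5)
                return -EBUSY;          /* permanent failure */
        return 0;                       /* success */
}

/*
 * Returns the number of items that could not be migrated, or a negative
 * errno on a fatal condition, mirroring the convention described for
 * migrate_hugetlbs().
 */
static int migrate_list(enum item_state *state, int n)
{
        int nr_failed = 0, retry = 1, pass, i;

        for (pass = 0; pass < NR_MAX_RETRY && retry; pass++) {
                retry = 0;
                for (i = 0; i < n; i++) {
                        int rc;

                        if (state[i] != PENDING)
                                continue;       /* already done or failed */

                        rc = try_migrate_one(i, pass);
                        switch (rc) {
                        case -ENOMEM:
                                return -ENOMEM; /* don't bother with the rest */
                        case -EAGAIN:
                                retry++;        /* stays pending for next pass */
                                break;
                        case 0:
                                state[i] = DONE;
                                break;
                        default:
                                state[i] = FAILED;
                                nr_failed++;
                                break;
                        }
                }
        }
        /* Items still pending after the last pass count as failed. */
        return nr_failed + retry;
}

int main(void)
{
        enum item_state state[8];
        int i;

        for (i = 0; i < 8; i++)
                state[i] = PENDING;
        printf("not migrated: %d\n", migrate_list(state, 8));
        return 0;
}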
Link: https://lkml.kernel.org/r/20230213123444.155149-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Reviewed-by: Xin Hao Cc: Zi Yan Cc: Yang Shi Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 141 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 119 insertions(+), 22 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1a9cfcf857d2..586a32bdaa71 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1414,6 +1414,8 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f return rc; } +#define NR_MAX_MIGRATE_PAGES_RETRY 10 + struct migrate_pages_stats { int nr_succeeded; /* Normal and large folios migrated successfully, in units of base pages */ @@ -1424,6 +1426,95 @@ struct migrate_pages_stats { int nr_thp_split; /* THP split before migrating */ }; +/* + * Returns the number of hugetlb folios that were not migrated, or an error code + * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable + * any more because the list has become empty or no retryable hugetlb folios + * exist any more. It is caller's responsibility to call putback_movable_pages() + * only if ret != 0. + */ +static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason, + struct migrate_pages_stats *stats, + struct list_head *ret_folios) +{ + int retry = 1; + int nr_failed = 0; + int nr_retry_pages = 0; + int pass = 0; + struct folio *folio, *folio2; + int rc, nr_pages; + + for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) { + retry = 0; + nr_retry_pages = 0; + + list_for_each_entry_safe(folio, folio2, from, lru) { + if (!folio_test_hugetlb(folio)) + continue; + + nr_pages = folio_nr_pages(folio); + + cond_resched(); + + rc = unmap_and_move_huge_page(get_new_page, + put_new_page, private, + &folio->page, pass > 2, mode, + reason, ret_folios); + /* + * The rules are: + * Success: hugetlb folio will be put back + * -EAGAIN: stay on the from list + * -ENOMEM: stay on the from list + * -ENOSYS: stay on the from list + * Other errno: put on ret_folios list + */ + switch(rc) { + case -ENOSYS: + /* Hugetlb migration is unsupported */ + nr_failed++; + stats->nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, ret_folios); + break; + case -ENOMEM: + /* + * When memory is low, don't bother to try to migrate + * other folios, just exit. + */ + stats->nr_failed_pages += nr_pages + nr_retry_pages; + return -ENOMEM; + case -EAGAIN: + retry++; + nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: + stats->nr_succeeded += nr_pages; + break; + default: + /* + * Permanent failure (-EBUSY, etc.): + * unlike -EAGAIN case, the failed folio is + * removed from migration folio list and not + * retried in the next outer loop. + */ + nr_failed++; + stats->nr_failed_pages += nr_pages; + break; + } + } + } + /* + * nr_failed is number of hugetlb folios failed to be migrated. After + * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb + * folios as failed. 
+ */ + nr_failed += retry; + stats->nr_failed_pages += nr_retry_pages; + + return nr_failed; +} + /* * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration @@ -1440,10 +1531,10 @@ struct migrate_pages_stats { * @ret_succeeded: Set to the number of folios migrated successfully if * the caller passes a non-NULL pointer. * - * The function returns after 10 attempts or if no folios are movable any more - * because the list has become empty or no retryable folios exist any more. - * It is caller's responsibility to call putback_movable_pages() to return folios - * to the LRU or free list only if ret != 0. + * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios + * are movable any more because the list has become empty or no retryable folios + * exist any more. It is caller's responsibility to call putback_movable_pages() + * only if ret != 0. * * Returns the number of {normal folio, large folio, hugetlb} that were not * migrated, or an error code. The number of large folio splits will be @@ -1457,7 +1548,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int retry = 1; int large_retry = 1; int thp_retry = 1; - int nr_failed = 0; + int nr_failed; int nr_retry_pages = 0; int nr_large_failed = 0; int pass = 0; @@ -1474,38 +1565,45 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, trace_mm_migrate_pages_start(mode, reason); memset(&stats, 0, sizeof(stats)); + rc = migrate_hugetlbs(from, get_new_page, put_new_page, private, mode, reason, + &stats, &ret_folios); + if (rc < 0) + goto out; + nr_failed = rc; + split_folio_migration: - for (pass = 0; pass < 10 && (retry || large_retry); pass++) { + for (pass = 0; + pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); + pass++) { retry = 0; large_retry = 0; thp_retry = 0; nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { + /* Retried hugetlb folios will be kept in list */ + if (folio_test_hugetlb(folio)) { + list_move_tail(&folio->lru, &ret_folios); + continue; + } + /* * Large folio statistics is based on the source large * folio. Capture required information that might get * lost during migration. */ - is_large = folio_test_large(folio) && !folio_test_hugetlb(folio); + is_large = folio_test_large(folio); is_thp = is_large && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); + cond_resched(); - if (folio_test_hugetlb(folio)) - rc = unmap_and_move_huge_page(get_new_page, - put_new_page, private, - &folio->page, pass > 2, mode, - reason, - &ret_folios); - else - rc = unmap_and_move(get_new_page, put_new_page, - private, folio, pass > 2, mode, - reason, &ret_folios); + rc = unmap_and_move(get_new_page, put_new_page, + private, folio, pass > 2, mode, + reason, &ret_folios); /* * The rules are: - * Success: non hugetlb folio will be freed, hugetlb - * folio will be put back + * Success: folio will be freed * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * -ENOSYS: stay on the from list @@ -1532,7 +1630,6 @@ split_folio_migration: stats.nr_thp_split += is_thp; break; } - /* Hugetlb migration is unsupported */ } else if (!no_split_folio_counting) { nr_failed++; } @@ -1626,8 +1723,8 @@ split_folio_migration: */ if (!list_empty(&split_folios)) { /* - * Move non-migrated folios (after 10 retries) to ret_folios - * to avoid migrating them again. + * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY + * retries) to ret_folios to avoid migrating them again. 
*/ list_splice_init(from, &ret_folios); list_splice_init(&split_folios, from); From 42012e0436d44aeb2e68f11a28ddd0ad3f38b61f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:38 +0800 Subject: [PATCH 482/505] migrate_pages: restrict number of pages to migrate in batch This is a preparation patch to batch the folio unmapping and moving for non-hugetlb folios. If we had batched the folio unmapping, all folios to be migrated would be unmapped before copying the contents and flags of the folios. If the folios that were passed to migrate_pages() were too many in unit of pages, the execution of the processes would be stopped for too long time, thus too long latency. For example, migrate_pages() syscall will call migrate_pages() with all folios of a process. To avoid this possible issue, in this patch, we restrict the number of pages to be migrated to be no more than HPAGE_PMD_NR. That is, the influence is at the same level of THP migration. Link: https://lkml.kernel.org/r/20230213123444.155149-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Cc: Zi Yan Cc: Yang Shi Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 400 ++++++++++++++++++++++++++++----------------------- 1 file changed, 219 insertions(+), 181 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 586a32bdaa71..d436f35fa145 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1414,6 +1414,11 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f return rc; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR +#else +#define NR_MAX_BATCHED_MIGRATION 512 +#endif #define NR_MAX_MIGRATE_PAGES_RETRY 10 struct migrate_pages_stats { @@ -1515,6 +1520,186 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, return nr_failed; } +static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason, struct list_head *ret_folios, + struct migrate_pages_stats *stats) +{ + int retry = 1; + int large_retry = 1; + int thp_retry = 1; + int nr_failed = 0; + int nr_retry_pages = 0; + int nr_large_failed = 0; + int pass = 0; + bool is_large = false; + bool is_thp = false; + struct folio *folio, *folio2; + int rc, nr_pages; + LIST_HEAD(split_folios); + bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_split_folio_counting = false; + +split_folio_migration: + for (pass = 0; + pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); + pass++) { + retry = 0; + large_retry = 0; + thp_retry = 0; + nr_retry_pages = 0; + + list_for_each_entry_safe(folio, folio2, from, lru) { + /* + * Large folio statistics is based on the source large + * folio. Capture required information that might get + * lost during migration. 
+ */ + is_large = folio_test_large(folio); + is_thp = is_large && folio_test_pmd_mappable(folio); + nr_pages = folio_nr_pages(folio); + + cond_resched(); + + rc = unmap_and_move(get_new_page, put_new_page, + private, folio, pass > 2, mode, + reason, ret_folios); + /* + * The rules are: + * Success: folio will be freed + * -EAGAIN: stay on the from list + * -ENOMEM: stay on the from list + * -ENOSYS: stay on the from list + * Other errno: put on ret_folios list + */ + switch(rc) { + /* + * Large folio migration might be unsupported or + * the allocation could've failed so we should retry + * on the same folio with the large folio split + * to normal folios. + * + * Split folios are put in split_folios, and + * we will migrate them after the rest of the + * list is processed. + */ + case -ENOSYS: + /* Large folio migration is unsupported */ + if (is_large) { + nr_large_failed++; + stats->nr_thp_failed += is_thp; + if (!try_split_folio(folio, &split_folios)) { + stats->nr_thp_split += is_thp; + break; + } + } else if (!no_split_folio_counting) { + nr_failed++; + } + + stats->nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, ret_folios); + break; + case -ENOMEM: + /* + * When memory is low, don't bother to try to migrate + * other folios, just exit. + */ + if (is_large) { + nr_large_failed++; + stats->nr_thp_failed += is_thp; + /* Large folio NUMA faulting doesn't split to retry. */ + if (!nosplit) { + int ret = try_split_folio(folio, &split_folios); + + if (!ret) { + stats->nr_thp_split += is_thp; + break; + } else if (reason == MR_LONGTERM_PIN && + ret == -EAGAIN) { + /* + * Try again to split large folio to + * mitigate the failure of longterm pinning. + */ + large_retry++; + thp_retry += is_thp; + nr_retry_pages += nr_pages; + break; + } + } + } else if (!no_split_folio_counting) { + nr_failed++; + } + + stats->nr_failed_pages += nr_pages + nr_retry_pages; + /* + * There might be some split folios of fail-to-migrate large + * folios left in split_folios list. Move them to ret_folios + * list so that they could be put back to the right list by + * the caller otherwise the folio refcnt will be leaked. + */ + list_splice_init(&split_folios, ret_folios); + /* nr_failed isn't updated for not used */ + nr_large_failed += large_retry; + stats->nr_thp_failed += thp_retry; + goto out; + case -EAGAIN: + if (is_large) { + large_retry++; + thp_retry += is_thp; + } else if (!no_split_folio_counting) { + retry++; + } + nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + break; + default: + /* + * Permanent failure (-EBUSY, etc.): + * unlike -EAGAIN case, the failed folio is + * removed from migration folio list and not + * retried in the next outer loop. + */ + if (is_large) { + nr_large_failed++; + stats->nr_thp_failed += is_thp; + } else if (!no_split_folio_counting) { + nr_failed++; + } + + stats->nr_failed_pages += nr_pages; + break; + } + } + } + nr_failed += retry; + nr_large_failed += large_retry; + stats->nr_thp_failed += thp_retry; + stats->nr_failed_pages += nr_retry_pages; + /* + * Try to migrate split folios of fail-to-migrate large folios, no + * nr_failed counting in this round, since all split folios of a + * large folio is counted as 1 failure in the first round. + */ + if (!list_empty(&split_folios)) { + /* + * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY + * retries) to ret_folios to avoid migrating them again. 
+ */ + list_splice_init(from, ret_folios); + list_splice_init(&split_folios, from); + no_split_folio_counting = true; + retry = 1; + goto split_folio_migration; + } + + rc = nr_failed + nr_large_failed; +out: + return rc; +} + /* * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration @@ -1545,195 +1730,48 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { - int retry = 1; - int large_retry = 1; - int thp_retry = 1; - int nr_failed; - int nr_retry_pages = 0; - int nr_large_failed = 0; - int pass = 0; - bool is_large = false; - bool is_thp = false; + int rc, rc_gather; + int nr_pages; struct folio *folio, *folio2; - int rc, nr_pages; + LIST_HEAD(folios); LIST_HEAD(ret_folios); - LIST_HEAD(split_folios); - bool nosplit = (reason == MR_NUMA_MISPLACED); - bool no_split_folio_counting = false; struct migrate_pages_stats stats; trace_mm_migrate_pages_start(mode, reason); memset(&stats, 0, sizeof(stats)); - rc = migrate_hugetlbs(from, get_new_page, put_new_page, private, mode, reason, - &stats, &ret_folios); - if (rc < 0) + + rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private, + mode, reason, &stats, &ret_folios); + if (rc_gather < 0) goto out; - nr_failed = rc; - -split_folio_migration: - for (pass = 0; - pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); - pass++) { - retry = 0; - large_retry = 0; - thp_retry = 0; - nr_retry_pages = 0; - - list_for_each_entry_safe(folio, folio2, from, lru) { - /* Retried hugetlb folios will be kept in list */ - if (folio_test_hugetlb(folio)) { - list_move_tail(&folio->lru, &ret_folios); - continue; - } - - /* - * Large folio statistics is based on the source large - * folio. Capture required information that might get - * lost during migration. - */ - is_large = folio_test_large(folio); - is_thp = is_large && folio_test_pmd_mappable(folio); - nr_pages = folio_nr_pages(folio); - - cond_resched(); - - rc = unmap_and_move(get_new_page, put_new_page, - private, folio, pass > 2, mode, - reason, &ret_folios); - /* - * The rules are: - * Success: folio will be freed - * -EAGAIN: stay on the from list - * -ENOMEM: stay on the from list - * -ENOSYS: stay on the from list - * Other errno: put on ret_folios list then splice to - * from list - */ - switch(rc) { - /* - * Large folio migration might be unsupported or - * the allocation could've failed so we should retry - * on the same folio with the large folio split - * to normal folios. - * - * Split folios are put in split_folios, and - * we will migrate them after the rest of the - * list is processed. - */ - case -ENOSYS: - /* Large folio migration is unsupported */ - if (is_large) { - nr_large_failed++; - stats.nr_thp_failed += is_thp; - if (!try_split_folio(folio, &split_folios)) { - stats.nr_thp_split += is_thp; - break; - } - } else if (!no_split_folio_counting) { - nr_failed++; - } - - stats.nr_failed_pages += nr_pages; - list_move_tail(&folio->lru, &ret_folios); - break; - case -ENOMEM: - /* - * When memory is low, don't bother to try to migrate - * other folios, just exit. - */ - if (is_large) { - nr_large_failed++; - stats.nr_thp_failed += is_thp; - /* Large folio NUMA faulting doesn't split to retry. 
*/ - if (!nosplit) { - int ret = try_split_folio(folio, &split_folios); - - if (!ret) { - stats.nr_thp_split += is_thp; - break; - } else if (reason == MR_LONGTERM_PIN && - ret == -EAGAIN) { - /* - * Try again to split large folio to - * mitigate the failure of longterm pinning. - */ - large_retry++; - thp_retry += is_thp; - nr_retry_pages += nr_pages; - break; - } - } - } else if (!no_split_folio_counting) { - nr_failed++; - } - - stats.nr_failed_pages += nr_pages + nr_retry_pages; - /* - * There might be some split folios of fail-to-migrate large - * folios left in split_folios list. Move them back to migration - * list so that they could be put back to the right list by - * the caller otherwise the folio refcnt will be leaked. - */ - list_splice_init(&split_folios, from); - /* nr_failed isn't updated for not used */ - nr_large_failed += large_retry; - stats.nr_thp_failed += thp_retry; - goto out; - case -EAGAIN: - if (is_large) { - large_retry++; - thp_retry += is_thp; - } else if (!no_split_folio_counting) { - retry++; - } - nr_retry_pages += nr_pages; - break; - case MIGRATEPAGE_SUCCESS: - stats.nr_succeeded += nr_pages; - stats.nr_thp_succeeded += is_thp; - break; - default: - /* - * Permanent failure (-EBUSY, etc.): - * unlike -EAGAIN case, the failed folio is - * removed from migration folio list and not - * retried in the next outer loop. - */ - if (is_large) { - nr_large_failed++; - stats.nr_thp_failed += is_thp; - } else if (!no_split_folio_counting) { - nr_failed++; - } - - stats.nr_failed_pages += nr_pages; - break; - } +again: + nr_pages = 0; + list_for_each_entry_safe(folio, folio2, from, lru) { + /* Retried hugetlb folios will be kept in list */ + if (folio_test_hugetlb(folio)) { + list_move_tail(&folio->lru, &ret_folios); + continue; } - } - nr_failed += retry; - nr_large_failed += large_retry; - stats.nr_thp_failed += thp_retry; - stats.nr_failed_pages += nr_retry_pages; - /* - * Try to migrate split folios of fail-to-migrate large folios, no - * nr_failed counting in this round, since all split folios of a - * large folio is counted as 1 failure in the first round. - */ - if (!list_empty(&split_folios)) { - /* - * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY - * retries) to ret_folios to avoid migrating them again. - */ - list_splice_init(from, &ret_folios); - list_splice_init(&split_folios, from); - no_split_folio_counting = true; - retry = 1; - goto split_folio_migration; - } - rc = nr_failed + nr_large_failed; + nr_pages += folio_nr_pages(folio); + if (nr_pages > NR_MAX_BATCHED_MIGRATION) + break; + } + if (nr_pages > NR_MAX_BATCHED_MIGRATION) + list_cut_before(&folios, from, &folio->lru); + else + list_splice_init(from, &folios); + rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, + mode, reason, &ret_folios, &stats); + list_splice_tail_init(&folios, &ret_folios); + if (rc < 0) { + rc_gather = rc; + goto out; + } + rc_gather += rc; + if (!list_empty(from)) + goto again; out: /* * Put the permanent failure folio back to migration list, they @@ -1746,7 +1784,7 @@ out: * are migrated successfully. 
*/ if (list_empty(from)) - rc = 0; + rc_gather = 0; count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); @@ -1760,7 +1798,7 @@ out: if (ret_succeeded) *ret_succeeded = stats.nr_succeeded; - return rc; + return rc_gather; } struct page *alloc_migration_target(struct page *page, unsigned long private) From 64c8902ed4418317cd416c566f896bd4a92b2efc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:39 +0800 Subject: [PATCH 483/505] migrate_pages: split unmap_and_move() to _unmap() and _move() This is a preparation patch to batch the folio unmapping and moving. In this patch, unmap_and_move() is split to migrate_folio_unmap() and migrate_folio_move(). So, we can batch _unmap() and _move() in different loops later. To pass some information between unmap and move, the original unused dst->mapping and dst->private are used. Link: https://lkml.kernel.org/r/20230213123444.155149-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Reviewed-by: Xin Hao Cc: Zi Yan Cc: Yang Shi Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 + mm/migrate.c | 169 ++++++++++++++++++++++++++++++---------- 2 files changed, 129 insertions(+), 41 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index bdff950a8bb4..c88b96b48be7 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -18,6 +18,7 @@ struct migration_target_control; * - zero on page migration success; */ #define MIGRATEPAGE_SUCCESS 0 +#define MIGRATEPAGE_UNMAP 1 /** * struct movable_operations - Driver page migration diff --git a/mm/migrate.c b/mm/migrate.c index d436f35fa145..5fd18a7cce62 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1027,11 +1027,53 @@ out: return rc; } -static int __unmap_and_move(struct folio *src, struct folio *dst, +/* + * To record some information during migration, we use some unused + * fields (mapping and private) of struct folio of the newly allocated + * destination folio. This is safe because nobody is using them + * except us. + */ +static void __migrate_folio_record(struct folio *dst, + unsigned long page_was_mapped, + struct anon_vma *anon_vma) +{ + dst->mapping = (void *)anon_vma; + dst->private = (void *)page_was_mapped; +} + +static void __migrate_folio_extract(struct folio *dst, + int *page_was_mappedp, + struct anon_vma **anon_vmap) +{ + *anon_vmap = (void *)dst->mapping; + *page_was_mappedp = (unsigned long)dst->private; + dst->mapping = NULL; + dst->private = NULL; +} + +/* Cleanup src folio upon migration success */ +static void migrate_folio_done(struct folio *src, + enum migrate_reason reason) +{ + /* + * Compaction can migrate also non-LRU pages which are + * not accounted to NR_ISOLATED_*. They can be recognized + * as __PageMovable + */ + if (likely(!__folio_test_movable(src))) + mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + + folio_is_file_lru(src), -folio_nr_pages(src)); + + if (reason != MR_MEMORY_FAILURE) + /* We release the page in page_handle_poison. 
*/ + folio_put(src); +} + +static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force, enum migrate_mode mode) { int rc = -EAGAIN; - bool page_was_mapped = false; + int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(&src->page); @@ -1107,8 +1149,8 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, goto out_unlock; if (unlikely(!is_lru)) { - rc = move_to_new_folio(dst, src, mode); - goto out_unlock_both; + __migrate_folio_record(dst, page_was_mapped, anon_vma); + return MIGRATEPAGE_UNMAP; } /* @@ -1133,11 +1175,42 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); try_to_migrate(src, 0); - page_was_mapped = true; + page_was_mapped = 1; } - if (!folio_mapped(src)) - rc = move_to_new_folio(dst, src, mode); + if (!folio_mapped(src)) { + __migrate_folio_record(dst, page_was_mapped, anon_vma); + return MIGRATEPAGE_UNMAP; + } + + if (page_was_mapped) + remove_migration_ptes(src, src, false); + +out_unlock_both: + folio_unlock(dst); +out_unlock: + /* Drop an anon_vma reference if we took one */ + if (anon_vma) + put_anon_vma(anon_vma); + folio_unlock(src); +out: + + return rc; +} + +static int __migrate_folio_move(struct folio *src, struct folio *dst, + enum migrate_mode mode) +{ + int rc; + int page_was_mapped = 0; + struct anon_vma *anon_vma = NULL; + bool is_lru = !__PageMovable(&src->page); + + __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); + + rc = move_to_new_folio(dst, src, mode); + if (unlikely(!is_lru)) + goto out_unlock_both; /* * When successful, push dst to LRU immediately: so that if it @@ -1160,12 +1233,10 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, out_unlock_both: folio_unlock(dst); -out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); folio_unlock(src); -out: /* * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased @@ -1177,19 +1248,15 @@ out: return rc; } -/* - * Obtain the lock on folio, remove all ptes and migrate the folio - * to the newly allocated folio in dst. - */ -static int unmap_and_move(new_page_t get_new_page, - free_page_t put_new_page, - unsigned long private, struct folio *src, - int force, enum migrate_mode mode, - enum migrate_reason reason, - struct list_head *ret) +/* Obtain the lock on page, remove all ptes. */ +static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct folio *src, + struct folio **dstp, int force, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) { struct folio *dst; - int rc = MIGRATEPAGE_SUCCESS; + int rc = MIGRATEPAGE_UNMAP; struct page *newpage = NULL; if (!thp_migration_supported() && folio_test_transhuge(src)) @@ -1200,20 +1267,49 @@ static int unmap_and_move(new_page_t get_new_page, folio_clear_active(src); folio_clear_unevictable(src); /* free_pages_prepare() will clear PG_isolated. 
*/ - goto out; + list_del(&src->lru); + migrate_folio_done(src, reason); + return MIGRATEPAGE_SUCCESS; } newpage = get_new_page(&src->page, private); if (!newpage) return -ENOMEM; dst = page_folio(newpage); + *dstp = dst; dst->private = NULL; - rc = __unmap_and_move(src, dst, force, mode); + rc = __migrate_folio_unmap(src, dst, force, mode); + if (rc == MIGRATEPAGE_UNMAP) + return rc; + + /* + * A folio that has not been unmapped will be restored to + * right list unless we want to retry. + */ + if (rc != -EAGAIN) + list_move_tail(&src->lru, ret); + + if (put_new_page) + put_new_page(&dst->page, private); + else + folio_put(dst); + + return rc; +} + +/* Migrate the folio to the newly allocated folio in dst. */ +static int migrate_folio_move(free_page_t put_new_page, unsigned long private, + struct folio *src, struct folio *dst, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) +{ + int rc; + + rc = __migrate_folio_move(src, dst, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(&dst->page, reason); -out: if (rc != -EAGAIN) { /* * A folio that has been migrated has all references @@ -1229,20 +1325,7 @@ out: * we want to retry. */ if (rc == MIGRATEPAGE_SUCCESS) { - /* - * Compaction can migrate also non-LRU folios which are - * not accounted to NR_ISOLATED_*. They can be recognized - * as __folio_test_movable - */ - if (likely(!__folio_test_movable(src))) - mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + - folio_is_file_lru(src), -folio_nr_pages(src)); - - if (reason != MR_MEMORY_FAILURE) - /* - * We release the folio in page_handle_poison. - */ - folio_put(src); + migrate_folio_done(src, reason); } else { if (rc != -EAGAIN) list_add_tail(&src->lru, ret); @@ -1534,7 +1617,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, int pass = 0; bool is_large = false; bool is_thp = false; - struct folio *folio, *folio2; + struct folio *folio, *folio2, *dst = NULL; int rc, nr_pages; LIST_HEAD(split_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); @@ -1561,9 +1644,13 @@ split_folio_migration: cond_resched(); - rc = unmap_and_move(get_new_page, put_new_page, - private, folio, pass > 2, mode, - reason, ret_folios); + rc = migrate_folio_unmap(get_new_page, put_new_page, private, + folio, &dst, pass > 2, mode, + reason, ret_folios); + if (rc == MIGRATEPAGE_UNMAP) + rc = migrate_folio_move(put_new_page, private, + folio, dst, mode, + reason, ret_folios); /* * The rules are: * Success: folio will be freed From 5dfab109d5193e6c224d96cabf90e9cc2c039884 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:40 +0800 Subject: [PATCH 484/505] migrate_pages: batch _unmap and _move In this patch the _unmap and _move stage of the folio migration is batched. That for, previously, it is, for each folio _unmap() _move() Now, it is, for each folio _unmap() for each folio _move() Based on this, we can batch the TLB flushing and use some hardware accelerator to copy folios between batched _unmap and batched _move stages. 
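The shape of that change can be illustrated with a small userspace sketch, with hypothetical names, arrays instead of folio lists, and flush_all() standing in for the expensive TLB shootdown: phase one unmaps every item and records the state the second phase will need (much as the patch stashes page_was_mapped and anon_vma via __migrate_folio_record()), a single flush is issued for the whole batch, and phase two then moves every successfully unmapped item. This is only a sketch of the idea under those assumptions, not the kernel implementation.

#include <stdio.h>

#define NR_ITEMS 8

static int nr_flushes;

/* State produced by the unmap phase and consumed by the move phase,
 * playing the role of __migrate_folio_record()/__migrate_folio_extract(). */
struct pending {
        int id;
        int was_mapped;
};

static int unmap_one(int id, struct pending *p)
{
        p->id = id;
        p->was_mapped = (id % 2 == 0);  /* pretend even items were mapped */
        return 0;
}

static void flush_all(void)
{
        nr_flushes++;                   /* stands in for one batched TLB flush */
}

static void move_one(const struct pending *p)
{
        printf("move %d (was_mapped=%d)\n", p->id, p->was_mapped);
}

int main(void)
{
        struct pending pending[NR_ITEMS];
        int nr_pending = 0;
        int id, i;

        /* Phase 1: unmap everything, remembering what phase 2 needs. */
        for (id = 0; id < NR_ITEMS; id++)
                if (unmap_one(id, &pending[nr_pending]) == 0)
                        nr_pending++;

        /* One expensive operation amortized over the whole batch. */
        flush_all();

        /* Phase 2: move every successfully unmapped item. */
        for (i = 0; i < nr_pending; i++)
                move_one(&pending[i]);

        printf("flushes: %d instead of %d\n", nr_flushes, NR_ITEMS);
        return 0;
}

The point of the two loops is that the per-item cost stays the same while the one genuinely global operation between them is paid once per batch rather than once per item.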
Link: https://lkml.kernel.org/r/20230213123444.155149-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Zi Yan Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 214 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 189 insertions(+), 25 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 5fd18a7cce62..ee3e21f1061c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1051,6 +1051,33 @@ static void __migrate_folio_extract(struct folio *dst, dst->private = NULL; } +/* Restore the source folio to the original state upon failure */ +static void migrate_folio_undo_src(struct folio *src, + int page_was_mapped, + struct anon_vma *anon_vma, + struct list_head *ret) +{ + if (page_was_mapped) + remove_migration_ptes(src, src, false); + /* Drop an anon_vma reference if we took one */ + if (anon_vma) + put_anon_vma(anon_vma); + folio_unlock(src); + list_move_tail(&src->lru, ret); +} + +/* Restore the destination folio to the original state upon failure */ +static void migrate_folio_undo_dst(struct folio *dst, + free_page_t put_new_page, + unsigned long private) +{ + folio_unlock(dst); + if (put_new_page) + put_new_page(&dst->page, private); + else + folio_put(dst); +} + /* Cleanup src folio upon migration success */ static void migrate_folio_done(struct folio *src, enum migrate_reason reason) @@ -1069,8 +1096,8 @@ static void migrate_folio_done(struct folio *src, folio_put(src); } -static int __migrate_folio_unmap(struct folio *src, struct folio *dst, - int force, enum migrate_mode mode) +static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force, + bool avoid_force_lock, enum migrate_mode mode) { int rc = -EAGAIN; int page_was_mapped = 0; @@ -1097,6 +1124,17 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, if (current->flags & PF_MEMALLOC) goto out; + /* + * We have locked some folios and are going to wait to lock + * this folio. To avoid a potential deadlock, let's bail + * out and not do that. The locked folios will be moved and + * unlocked, then we can wait to lock this folio. + */ + if (avoid_force_lock) { + rc = -EDEADLOCK; + goto out; + } + folio_lock(src); } @@ -1205,10 +1243,20 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst, int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(&src->page); + struct list_head *prev; __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); + prev = dst->lru.prev; + list_del(&dst->lru); rc = move_to_new_folio(dst, src, mode); + + if (rc == -EAGAIN) { + list_add(&dst->lru, prev); + __migrate_folio_record(dst, page_was_mapped, anon_vma); + return rc; + } + if (unlikely(!is_lru)) goto out_unlock_both; @@ -1251,7 +1299,7 @@ out_unlock_both: /* Obtain the lock on page, remove all ptes. 
*/ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct folio *src, - struct folio **dstp, int force, + struct folio **dstp, int force, bool avoid_force_lock, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) { @@ -1279,7 +1327,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page *dstp = dst; dst->private = NULL; - rc = __migrate_folio_unmap(src, dst, force, mode); + rc = __migrate_folio_unmap(src, dst, force, avoid_force_lock, mode); if (rc == MIGRATEPAGE_UNMAP) return rc; @@ -1287,7 +1335,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page * A folio that has not been unmapped will be restored to * right list unless we want to retry. */ - if (rc != -EAGAIN) + if (rc != -EAGAIN && rc != -EDEADLOCK) list_move_tail(&src->lru, ret); if (put_new_page) @@ -1326,9 +1374,8 @@ static int migrate_folio_move(free_page_t put_new_page, unsigned long private, */ if (rc == MIGRATEPAGE_SUCCESS) { migrate_folio_done(src, reason); - } else { - if (rc != -EAGAIN) - list_add_tail(&src->lru, ret); + } else if (rc != -EAGAIN) { + list_add_tail(&src->lru, ret); if (put_new_page) put_new_page(&dst->page, private); @@ -1603,12 +1650,16 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, return nr_failed; } +/* + * migrate_pages_batch() first unmaps folios in the from list as many as + * possible, then move the unmapped folios. + */ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, enum migrate_mode mode, int reason, struct list_head *ret_folios, struct migrate_pages_stats *stats) { - int retry = 1; + int retry; int large_retry = 1; int thp_retry = 1; int nr_failed = 0; @@ -1617,13 +1668,19 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, int pass = 0; bool is_large = false; bool is_thp = false; - struct folio *folio, *folio2, *dst = NULL; - int rc, nr_pages; + struct folio *folio, *folio2, *dst = NULL, *dst2; + int rc, rc_saved, nr_pages; LIST_HEAD(split_folios); + LIST_HEAD(unmap_folios); + LIST_HEAD(dst_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); bool no_split_folio_counting = false; + bool avoid_force_lock; -split_folio_migration: +retry: + rc_saved = 0; + avoid_force_lock = false; + retry = 1; for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); pass++) { @@ -1645,16 +1702,15 @@ split_folio_migration: cond_resched(); rc = migrate_folio_unmap(get_new_page, put_new_page, private, - folio, &dst, pass > 2, mode, - reason, ret_folios); - if (rc == MIGRATEPAGE_UNMAP) - rc = migrate_folio_move(put_new_page, private, - folio, dst, mode, - reason, ret_folios); + folio, &dst, pass > 2, avoid_force_lock, + mode, reason, ret_folios); /* * The rules are: * Success: folio will be freed + * Unmap: folio will be put on unmap_folios list, + * dst folio put on dst_folios list * -EAGAIN: stay on the from list + * -EDEADLOCK: stay on the from list * -ENOMEM: stay on the from list * -ENOSYS: stay on the from list * Other errno: put on ret_folios list @@ -1689,7 +1745,7 @@ split_folio_migration: case -ENOMEM: /* * When memory is low, don't bother to try to migrate - * other folios, just exit. + * other folios, move unmapped folios, then exit. 
*/ if (is_large) { nr_large_failed++; @@ -1728,7 +1784,19 @@ split_folio_migration: /* nr_failed isn't updated for not used */ nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; - goto out; + rc_saved = rc; + if (list_empty(&unmap_folios)) + goto out; + else + goto move; + case -EDEADLOCK: + /* + * The folio cannot be locked for potential deadlock. + * Go move (and unlock) all locked folios. Then we can + * try again. + */ + rc_saved = rc; + goto move; case -EAGAIN: if (is_large) { large_retry++; @@ -1742,6 +1810,15 @@ split_folio_migration: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; + case MIGRATEPAGE_UNMAP: + /* + * We have locked some folios, don't force lock + * to avoid deadlock. + */ + avoid_force_lock = true; + list_move_tail(&folio->lru, &unmap_folios); + list_add_tail(&dst->lru, &dst_folios); + break; default: /* * Permanent failure (-EBUSY, etc.): @@ -1765,12 +1842,95 @@ split_folio_migration: nr_large_failed += large_retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; +move: + retry = 1; + for (pass = 0; + pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); + pass++) { + retry = 0; + large_retry = 0; + thp_retry = 0; + nr_retry_pages = 0; + + dst = list_first_entry(&dst_folios, struct folio, lru); + dst2 = list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { + is_large = folio_test_large(folio); + is_thp = is_large && folio_test_pmd_mappable(folio); + nr_pages = folio_nr_pages(folio); + + cond_resched(); + + rc = migrate_folio_move(put_new_page, private, + folio, dst, mode, + reason, ret_folios); + /* + * The rules are: + * Success: folio will be freed + * -EAGAIN: stay on the unmap_folios list + * Other errno: put on ret_folios list + */ + switch(rc) { + case -EAGAIN: + if (is_large) { + large_retry++; + thp_retry += is_thp; + } else if (!no_split_folio_counting) { + retry++; + } + nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + break; + default: + if (is_large) { + nr_large_failed++; + stats->nr_thp_failed += is_thp; + } else if (!no_split_folio_counting) { + nr_failed++; + } + + stats->nr_failed_pages += nr_pages; + break; + } + dst = dst2; + dst2 = list_next_entry(dst, lru); + } + } + nr_failed += retry; + nr_large_failed += large_retry; + stats->nr_thp_failed += thp_retry; + stats->nr_failed_pages += nr_retry_pages; + + if (rc_saved) + rc = rc_saved; + else + rc = nr_failed + nr_large_failed; +out: + /* Cleanup remaining folios */ + dst = list_first_entry(&dst_folios, struct folio, lru); + dst2 = list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { + int page_was_mapped = 0; + struct anon_vma *anon_vma = NULL; + + __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); + migrate_folio_undo_src(folio, page_was_mapped, anon_vma, + ret_folios); + list_del(&dst->lru); + migrate_folio_undo_dst(dst, put_new_page, private); + dst = dst2; + dst2 = list_next_entry(dst, lru); + } + /* * Try to migrate split folios of fail-to-migrate large folios, no * nr_failed counting in this round, since all split folios of a * large folio is counted as 1 failure in the first round. */ - if (!list_empty(&split_folios)) { + if (rc >= 0 && !list_empty(&split_folios)) { /* * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY * retries) to ret_folios to avoid migrating them again. 
@@ -1778,12 +1938,16 @@ split_folio_migration: list_splice_init(from, ret_folios); list_splice_init(&split_folios, from); no_split_folio_counting = true; - retry = 1; - goto split_folio_migration; + goto retry; } - rc = nr_failed + nr_large_failed; -out: + /* + * We have unlocked all locked folios, so we can force lock now, let's + * try again. + */ + if (rc == -EDEADLOCK) + goto retry; + return rc; } From 80562ba0d8378e89fe5836c28ea56c2aab3014e8 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:41 +0800 Subject: [PATCH 485/505] migrate_pages: move migrate_folio_unmap() Just move the position of the functions. There's no any functionality change. This is to make it easier to review the next patch via putting code near its position in the next patch. Link: https://lkml.kernel.org/r/20230213123444.155149-7-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Zi Yan Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 100 +++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index ee3e21f1061c..0c7488ebe248 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1236,6 +1236,56 @@ out: return rc; } +/* Obtain the lock on page, remove all ptes. */ +static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct folio *src, + struct folio **dstp, int force, bool avoid_force_lock, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) +{ + struct folio *dst; + int rc = MIGRATEPAGE_UNMAP; + struct page *newpage = NULL; + + if (!thp_migration_supported() && folio_test_transhuge(src)) + return -ENOSYS; + + if (folio_ref_count(src) == 1) { + /* Folio was freed from under us. So we are done. */ + folio_clear_active(src); + folio_clear_unevictable(src); + /* free_pages_prepare() will clear PG_isolated. */ + list_del(&src->lru); + migrate_folio_done(src, reason); + return MIGRATEPAGE_SUCCESS; + } + + newpage = get_new_page(&src->page, private); + if (!newpage) + return -ENOMEM; + dst = page_folio(newpage); + *dstp = dst; + + dst->private = NULL; + rc = __migrate_folio_unmap(src, dst, force, avoid_force_lock, mode); + if (rc == MIGRATEPAGE_UNMAP) + return rc; + + /* + * A folio that has not been unmapped will be restored to + * right list unless we want to retry. + */ + if (rc != -EAGAIN && rc != -EDEADLOCK) + list_move_tail(&src->lru, ret); + + if (put_new_page) + put_new_page(&dst->page, private); + else + folio_put(dst); + + return rc; +} + static int __migrate_folio_move(struct folio *src, struct folio *dst, enum migrate_mode mode) { @@ -1296,56 +1346,6 @@ out_unlock_both: return rc; } -/* Obtain the lock on page, remove all ptes. */ -static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct folio *src, - struct folio **dstp, int force, bool avoid_force_lock, - enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) -{ - struct folio *dst; - int rc = MIGRATEPAGE_UNMAP; - struct page *newpage = NULL; - - if (!thp_migration_supported() && folio_test_transhuge(src)) - return -ENOSYS; - - if (folio_ref_count(src) == 1) { - /* Folio was freed from under us. So we are done. 
*/ - folio_clear_active(src); - folio_clear_unevictable(src); - /* free_pages_prepare() will clear PG_isolated. */ - list_del(&src->lru); - migrate_folio_done(src, reason); - return MIGRATEPAGE_SUCCESS; - } - - newpage = get_new_page(&src->page, private); - if (!newpage) - return -ENOMEM; - dst = page_folio(newpage); - *dstp = dst; - - dst->private = NULL; - rc = __migrate_folio_unmap(src, dst, force, avoid_force_lock, mode); - if (rc == MIGRATEPAGE_UNMAP) - return rc; - - /* - * A folio that has not been unmapped will be restored to - * right list unless we want to retry. - */ - if (rc != -EAGAIN && rc != -EDEADLOCK) - list_move_tail(&src->lru, ret); - - if (put_new_page) - put_new_page(&dst->page, private); - else - folio_put(dst); - - return rc; -} - /* Migrate the folio to the newly allocated folio in dst. */ static int migrate_folio_move(free_page_t put_new_page, unsigned long private, struct folio *src, struct folio *dst, From ebe75e4751063dce6f61b579b43de86dcf7b7462 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:42 +0800 Subject: [PATCH 486/505] migrate_pages: share more code between _unmap and _move This is a code cleanup patch to reduce the duplicated code between the _unmap and _move stages of migrate_pages(). No functionality change is expected. Link: https://lkml.kernel.org/r/20230213123444.155149-8-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Zi Yan Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 207 +++++++++++++++++++++------------------------------ 1 file changed, 85 insertions(+), 122 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 0c7488ebe248..00713ccb6643 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1055,6 +1055,7 @@ static void __migrate_folio_extract(struct folio *dst, static void migrate_folio_undo_src(struct folio *src, int page_was_mapped, struct anon_vma *anon_vma, + bool locked, struct list_head *ret) { if (page_was_mapped) @@ -1062,16 +1063,20 @@ static void migrate_folio_undo_src(struct folio *src, /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - folio_unlock(src); - list_move_tail(&src->lru, ret); + if (locked) + folio_unlock(src); + if (ret) + list_move_tail(&src->lru, ret); } /* Restore the destination folio to the original state upon failure */ static void migrate_folio_undo_dst(struct folio *dst, + bool locked, free_page_t put_new_page, unsigned long private) { - folio_unlock(dst); + if (locked) + folio_unlock(dst); if (put_new_page) put_new_page(&dst->page, private); else @@ -1096,13 +1101,42 @@ static void migrate_folio_done(struct folio *src, folio_put(src); } -static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force, - bool avoid_force_lock, enum migrate_mode mode) +/* Obtain the lock on page, remove all ptes. 
*/ +static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct folio *src, + struct folio **dstp, int force, bool avoid_force_lock, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) { + struct folio *dst; int rc = -EAGAIN; + struct page *newpage = NULL; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(&src->page); + bool locked = false; + bool dst_locked = false; + + if (!thp_migration_supported() && folio_test_transhuge(src)) + return -ENOSYS; + + if (folio_ref_count(src) == 1) { + /* Folio was freed from under us. So we are done. */ + folio_clear_active(src); + folio_clear_unevictable(src); + /* free_pages_prepare() will clear PG_isolated. */ + list_del(&src->lru); + migrate_folio_done(src, reason); + return MIGRATEPAGE_SUCCESS; + } + + newpage = get_new_page(&src->page, private); + if (!newpage) + return -ENOMEM; + dst = page_folio(newpage); + *dstp = dst; + + dst->private = NULL; if (!folio_trylock(src)) { if (!force || mode == MIGRATE_ASYNC) @@ -1137,6 +1171,7 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force folio_lock(src); } + locked = true; if (folio_test_writeback(src)) { /* @@ -1151,10 +1186,10 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force break; default: rc = -EBUSY; - goto out_unlock; + goto out; } if (!force) - goto out_unlock; + goto out; folio_wait_writeback(src); } @@ -1184,7 +1219,8 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force * This is much like races on refcount of oldpage: just don't BUG(). */ if (unlikely(!folio_trylock(dst))) - goto out_unlock; + goto out; + dst_locked = true; if (unlikely(!is_lru)) { __migrate_folio_record(dst, page_was_mapped, anon_vma); @@ -1206,7 +1242,7 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force if (!src->mapping) { if (folio_test_private(src)) { try_to_free_buffers(src); - goto out_unlock_both; + goto out; } } else if (folio_mapped(src)) { /* Establish migration ptes */ @@ -1221,73 +1257,25 @@ static int __migrate_folio_unmap(struct folio *src, struct folio *dst, int force return MIGRATEPAGE_UNMAP; } - if (page_was_mapped) - remove_migration_ptes(src, src, false); - -out_unlock_both: - folio_unlock(dst); -out_unlock: - /* Drop an anon_vma reference if we took one */ - if (anon_vma) - put_anon_vma(anon_vma); - folio_unlock(src); out: - - return rc; -} - -/* Obtain the lock on page, remove all ptes. */ -static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct folio *src, - struct folio **dstp, int force, bool avoid_force_lock, - enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) -{ - struct folio *dst; - int rc = MIGRATEPAGE_UNMAP; - struct page *newpage = NULL; - - if (!thp_migration_supported() && folio_test_transhuge(src)) - return -ENOSYS; - - if (folio_ref_count(src) == 1) { - /* Folio was freed from under us. So we are done. */ - folio_clear_active(src); - folio_clear_unevictable(src); - /* free_pages_prepare() will clear PG_isolated. 
*/ - list_del(&src->lru); - migrate_folio_done(src, reason); - return MIGRATEPAGE_SUCCESS; - } - - newpage = get_new_page(&src->page, private); - if (!newpage) - return -ENOMEM; - dst = page_folio(newpage); - *dstp = dst; - - dst->private = NULL; - rc = __migrate_folio_unmap(src, dst, force, avoid_force_lock, mode); - if (rc == MIGRATEPAGE_UNMAP) - return rc; - /* * A folio that has not been unmapped will be restored to * right list unless we want to retry. */ - if (rc != -EAGAIN && rc != -EDEADLOCK) - list_move_tail(&src->lru, ret); + if (rc == -EAGAIN || rc == -EDEADLOCK) + ret = NULL; - if (put_new_page) - put_new_page(&dst->page, private); - else - folio_put(dst); + migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret); + migrate_folio_undo_dst(dst, dst_locked, put_new_page, private); return rc; } -static int __migrate_folio_move(struct folio *src, struct folio *dst, - enum migrate_mode mode) +/* Migrate the folio to the newly allocated folio in dst. */ +static int migrate_folio_move(free_page_t put_new_page, unsigned long private, + struct folio *src, struct folio *dst, + enum migrate_mode mode, enum migrate_reason reason, + struct list_head *ret) { int rc; int page_was_mapped = 0; @@ -1300,12 +1288,8 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst, list_del(&dst->lru); rc = move_to_new_folio(dst, src, mode); - - if (rc == -EAGAIN) { - list_add(&dst->lru, prev); - __migrate_folio_record(dst, page_was_mapped, anon_vma); - return rc; - } + if (rc) + goto out; if (unlikely(!is_lru)) goto out_unlock_both; @@ -1319,70 +1303,49 @@ static int __migrate_folio_move(struct folio *src, struct folio *dst, * unsuccessful, and other cases when a page has been temporarily * isolated from the unevictable LRU: but this case is the easiest. */ - if (rc == MIGRATEPAGE_SUCCESS) { - folio_add_lru(dst); - if (page_was_mapped) - lru_add_drain(); - } + folio_add_lru(dst); + if (page_was_mapped) + lru_add_drain(); if (page_was_mapped) - remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false); + remove_migration_ptes(src, dst, false); out_unlock_both: folio_unlock(dst); - /* Drop an anon_vma reference if we took one */ - if (anon_vma) - put_anon_vma(anon_vma); - folio_unlock(src); + set_page_owner_migrate_reason(&dst->page, reason); /* * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased * refcounter. */ - if (rc == MIGRATEPAGE_SUCCESS) - folio_put(dst); - - return rc; -} - -/* Migrate the folio to the newly allocated folio in dst. */ -static int migrate_folio_move(free_page_t put_new_page, unsigned long private, - struct folio *src, struct folio *dst, - enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) -{ - int rc; - - rc = __migrate_folio_move(src, dst, mode); - if (rc == MIGRATEPAGE_SUCCESS) - set_page_owner_migrate_reason(&dst->page, reason); - - if (rc != -EAGAIN) { - /* - * A folio that has been migrated has all references - * removed and will be freed. A folio that has not been - * migrated will have kept its references and be restored. - */ - list_del(&src->lru); - } + folio_put(dst); /* - * If migration is successful, releases reference grabbed during - * isolation. Otherwise, restore the folio to right list unless - * we want to retry. + * A folio that has been migrated has all references removed + * and will be freed. 
*/ - if (rc == MIGRATEPAGE_SUCCESS) { - migrate_folio_done(src, reason); - } else if (rc != -EAGAIN) { - list_add_tail(&src->lru, ret); + list_del(&src->lru); + /* Drop an anon_vma reference if we took one */ + if (anon_vma) + put_anon_vma(anon_vma); + folio_unlock(src); + migrate_folio_done(src, reason); - if (put_new_page) - put_new_page(&dst->page, private); - else - folio_put(dst); + return rc; +out: + /* + * A folio that has not been migrated will be restored to + * right list unless we want to retry. + */ + if (rc == -EAGAIN) { + list_add(&dst->lru, prev); + __migrate_folio_record(dst, page_was_mapped, anon_vma); + return rc; } + migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret); + migrate_folio_undo_dst(dst, true, put_new_page, private); + return rc; } @@ -1918,9 +1881,9 @@ out: __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); migrate_folio_undo_src(folio, page_was_mapped, anon_vma, - ret_folios); + true, ret_folios); list_del(&dst->lru); - migrate_folio_undo_dst(dst, put_new_page, private); + migrate_folio_undo_dst(dst, true, put_new_page, private); dst = dst2; dst2 = list_next_entry(dst, lru); } From 7e12beb8ca2ac98b2ec42e0ea4b76cdc93b58654 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:43 +0800 Subject: [PATCH 487/505] migrate_pages: batch flushing TLB The TLB flushing will cost quite some CPU cycles during the folio migration in some situations. For example, when migrate a folio of a process with multiple active threads that run on multiple CPUs. After batching the _unmap and _move in migrate_pages(), the TLB flushing can be batched easily with the existing TLB flush batching mechanism. This patch implements that. We use the following test case to test the patch. On a 2-socket Intel server, - Run pmbench memory accessing benchmark - Run `migratepages` to migrate pages of pmbench between node 0 and node 1 back and forth. With the patch, the TLB flushing IPI reduces 99.1% during the test and the number of pages migrated successfully per second increases 291.7%. Haoxin helped to test the patchset on an ARM64 server with 128 cores, 2 NUMA nodes. Test results show that the page migration performance increases up to 78%. NOTE: TLB flushing is batched only for normal folios, not for THP folios. Because the overhead of TLB flushing for THP folios is much lower than that for normal folios (about 1/512 on x86 platform). 
Link: https://lkml.kernel.org/r/20230213123444.155149-9-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Xin Hao Reviewed-by: Zi Yan Reviewed-by: Xin Hao Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Alistair Popple Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 5 ++++- mm/rmap.c | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 00713ccb6643..2fa420e4f68c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1248,7 +1248,7 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page /* Establish migration ptes */ VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); - try_to_migrate(src, 0); + try_to_migrate(src, TTU_BATCH_FLUSH); page_was_mapped = 1; } @@ -1806,6 +1806,9 @@ retry: stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; move: + /* Flush TLBs for all unmapped folios */ + try_to_unmap_flush(); + retry = 1; for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); diff --git a/mm/rmap.c b/mm/rmap.c index 8287f2cc327d..15ae24585fc4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1952,7 +1952,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, } else { flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); /* Nuke the page table entry. */ - pteval = ptep_clear_flush(vma, address, pvmw.pte); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially + * a remote CPU could still be writing to the folio. + * If the entry was previously clean then the + * architecture must guarantee that a clear->dirty + * transition on a cached TLB entry is written through + * and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pvmw.pte); + + set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); + } else { + pteval = ptep_clear_flush(vma, address, pvmw.pte); + } } /* Set the dirty flag on the folio now the pte is gone. */ @@ -2124,10 +2138,10 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) /* * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and - * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags. + * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags. */ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | - TTU_SYNC))) + TTU_SYNC | TTU_BATCH_FLUSH))) return; if (folio_is_zone_device(folio) && From 6f7d760e86fa84862d749e36ebd29abf31f4f883 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 13 Feb 2023 20:34:44 +0800 Subject: [PATCH 488/505] migrate_pages: move THP/hugetlb migration support check to simplify code This is a code cleanup patch, no functionality change is expected. After the change, the line number reduces especially in the long migrate_pages_batch(). 
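As a condensed, illustrative sketch of where the hoisted check ends up (paraphrased from the migrate_hugetlbs() hunk below; the surrounding loop, counters and retry logic are simplified):

	list_for_each_entry_safe(folio, folio2, from, lru) {
		if (!hugepage_migration_supported(folio_hstate(folio))) {
			/* Unsupported hugepage size: fail it right here. */
			nr_failed++;
			stats->nr_failed_pages += nr_pages;
			list_move_tail(&folio->lru, ret_folios);
			continue;
		}
		/* unmap_and_move_huge_page() now only sees supported folios,
		 * so its -ENOSYS path (and the matching switch case) goes away. */
	}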
Link: https://lkml.kernel.org/r/20230213123444.155149-10-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Alistair Popple Reviewed-by: Zi Yan Cc: Yang Shi Cc: Baolin Wang Cc: Oscar Salvador Cc: Matthew Wilcox Cc: Bharata B Rao Cc: Xin Hao Cc: Minchan Kim Cc: Mike Kravetz Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/migrate.c | 83 +++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 47 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 2fa420e4f68c..ef68a1aff35c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1117,9 +1117,6 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page bool locked = false; bool dst_locked = false; - if (!thp_migration_supported() && folio_test_transhuge(src)) - return -ENOSYS; - if (folio_ref_count(src) == 1) { /* Folio was freed from under us. So we are done. */ folio_clear_active(src); @@ -1380,16 +1377,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; - /* - * Migratability of hugepages depends on architectures and their size. - * This check is necessary because some callers of hugepage migration - * like soft offline and memory hotremove don't walk through page - * tables or check whether the hugepage is pmd-based or not before - * kicking migration. - */ - if (!hugepage_migration_supported(page_hstate(hpage))) - return -ENOSYS; - if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ folio_putback_active_hugetlb(src); @@ -1556,6 +1543,20 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, cond_resched(); + /* + * Migratability of hugepages depends on architectures and + * their size. This check is necessary because some callers + * of hugepage migration like soft offline and memory + * hotremove don't walk through page tables or check whether + * the hugepage is pmd-based or not before kicking migration. + */ + if (!hugepage_migration_supported(folio_hstate(folio))) { + nr_failed++; + stats->nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, ret_folios); + continue; + } + rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, &folio->page, pass > 2, mode, @@ -1565,16 +1566,9 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, * Success: hugetlb folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list - * -ENOSYS: stay on the from list * Other errno: put on ret_folios list */ switch(rc) { - case -ENOSYS: - /* Hugetlb migration is unsupported */ - nr_failed++; - stats->nr_failed_pages += nr_pages; - list_move_tail(&folio->lru, ret_folios); - break; case -ENOMEM: /* * When memory is low, don't bother to try to migrate @@ -1664,6 +1658,28 @@ retry: cond_resched(); + /* + * Large folio migration might be unsupported or + * the allocation might be failed so we should retry + * on the same folio with the large folio split + * to normal folios. + * + * Split folios are put in split_folios, and + * we will migrate them after the rest of the + * list is processed. 
+ */ + if (!thp_migration_supported() && is_thp) { + nr_large_failed++; + stats->nr_thp_failed++; + if (!try_split_folio(folio, &split_folios)) { + stats->nr_thp_split++; + continue; + } + stats->nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, ret_folios); + continue; + } + rc = migrate_folio_unmap(get_new_page, put_new_page, private, folio, &dst, pass > 2, avoid_force_lock, mode, reason, ret_folios); @@ -1675,36 +1691,9 @@ retry: * -EAGAIN: stay on the from list * -EDEADLOCK: stay on the from list * -ENOMEM: stay on the from list - * -ENOSYS: stay on the from list * Other errno: put on ret_folios list */ switch(rc) { - /* - * Large folio migration might be unsupported or - * the allocation could've failed so we should retry - * on the same folio with the large folio split - * to normal folios. - * - * Split folios are put in split_folios, and - * we will migrate them after the rest of the - * list is processed. - */ - case -ENOSYS: - /* Large folio migration is unsupported */ - if (is_large) { - nr_large_failed++; - stats->nr_thp_failed += is_thp; - if (!try_split_folio(folio, &split_folios)) { - stats->nr_thp_split += is_thp; - break; - } - } else if (!no_split_folio_counting) { - nr_failed++; - } - - stats->nr_failed_pages += nr_pages; - list_move_tail(&folio->lru, ret_folios); - break; case -ENOMEM: /* * When memory is low, don't bother to try to migrate From 9f550d78b40da21b4da515db4c37d8d7b12aa1a6 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 13 Feb 2023 00:53:22 -0700 Subject: [PATCH 489/505] mm: multi-gen LRU: avoid futile retries Recall that the per-node memcg LRU has two generations and they alternate when the last memcg (of a given node) is moved from one to the other. Each generation is also sharded into multiple bins to improve scalability. A reclaimer starts with a random bin (in the old generation) and, if it fails, it will retry, i.e., to try the rest of the bins. If a reclaimer fails with the last memcg, it should move this memcg to the young generation first, which causes the generations to alternate, and then retry. Otherwise, the retries will be futile because all other bins are empty. Link: https://lkml.kernel.org/r/20230213075322.1416966-1-yuzhao@google.com Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") Signed-off-by: Yu Zhao Reported-by: T.J. 
Mercier Signed-off-by: Andrew Morton --- mm/vmscan.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d4b9fd1ae0ed..34535bbd4fe9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5356,18 +5356,20 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) { + int op; int gen; int bin; int first_bin; struct lruvec *lruvec; struct lru_gen_folio *lrugen; + struct mem_cgroup *memcg; const struct hlist_nulls_node *pos; - int op = 0; - struct mem_cgroup *memcg = NULL; unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); restart: + op = 0; + memcg = NULL; gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); rcu_read_lock(); @@ -5391,14 +5393,22 @@ restart: op = shrink_one(lruvec, sc); - if (sc->nr_reclaimed >= nr_to_reclaim) - goto success; - rcu_read_lock(); + + if (sc->nr_reclaimed >= nr_to_reclaim) + break; } rcu_read_unlock(); + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); + + if (sc->nr_reclaimed >= nr_to_reclaim) + return; + /* restart if raced with lru_gen_rotate_memcg() */ if (gen != get_nulls_value(pos)) goto restart; @@ -5407,11 +5417,6 @@ restart: bin = get_memcg_bin(bin + 1); if (bin != first_bin) goto restart; -success: - if (op) - lru_gen_rotate_memcg(lruvec, op); - - mem_cgroup_put(memcg); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) From 1bc67ca65b31bcb669c4eaca79b3c8d205bb212a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Sun, 12 Feb 2023 19:10:27 +0800 Subject: [PATCH 490/505] mm: page_alloc: call panic() when memoryless node allocation fails In free_area_init(), we will continue to run after allocation of memoryless node pgdat fails. However, in the subsequent process (such as when initializing zonelist), the case that NODE_DATA(nid) is NULL is not handled, which will cause panic. Instead of this, it's better to call panic() directly when the memory allocation fails during system boot. 
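Purely to illustrate the failure mode described above (a hypothetical later consumer, not code from this patch): if free_area_init() kept going, a subsequent user of the node would dereference the NULL pgdat far away from the root cause, e.g.:

	pg_data_t *pgdat = NODE_DATA(nid);	/* NULL when arch_alloc_nodedata() failed */
	struct zone *zone = &pgdat->node_zones[ZONE_NORMAL];	/* NULL pointer dereference */

Panicking at the allocation site instead produces an immediate, self-explanatory message, as the hunk below does.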
Link: https://lkml.kernel.org/r/20230212111027.95520-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21d820c42900..4b6bcec41c8f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8405,11 +8405,9 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Allocator not initialized yet */ pgdat = arch_alloc_nodedata(nid); - if (!pgdat) { - pr_err("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - continue; - } + if (!pgdat) + panic("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); arch_refresh_nodedata(nid, pgdat); free_area_init_memoryless_node(nid); From 44081c77e8a4aac9c5a010ed0d9ccdcf684041e1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Feb 2023 11:30:24 +0100 Subject: [PATCH 491/505] maple_tree: reduce stack usage with gcc-9 and earlier gcc-10 changed the way inlining works to be less aggressive, but older versions run into an oversized stack frame warning whenever CONFIG_KASAN_STACK is enabled, as that forces variables from inlined callees to be non-overlapping: lib/maple_tree.c: In function 'mas_wr_bnode': lib/maple_tree.c:4320:1: error: the frame size of 1424 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] Change the annotations on mas_store_b_node() and mas_commit_b_node() to explicitly forbid inlining in this configuration, which is the same behavior that newer versions already have. Link: https://lkml.kernel.org/r/20230214103030.1051950-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: David Hildenbrand Reviewed-by: Liam R. Howlett Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Vernon Yang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5e9703189259..646297cae5d1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -146,6 +146,13 @@ struct maple_subtree_state { struct maple_big_node *bn; }; +#ifdef CONFIG_KASAN_STACK +/* Prevent mas_wr_bnode() from exceeding the stack frame limit */ +#define noinline_for_kasan noinline_for_stack +#else +#define noinline_for_kasan inline +#endif + /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { @@ -2107,7 +2114,7 @@ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, * * Return: The actual end of the data stored in @b_node */ -static inline void mas_store_b_node(struct ma_wr_state *wr_mas, +static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; @@ -3579,7 +3586,7 @@ static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, * @b_node: The maple big node * @end: The end of the data. */ -static inline int mas_commit_b_node(struct ma_wr_state *wr_mas, +static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char end) { struct maple_node *node; From 2ef8ed7ddd2e6e69da7802be51af8ad71326a74f Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 14 Feb 2023 15:35:49 +0000 Subject: [PATCH 492/505] mm: percpu: fix incorrect size in pcpu_obj_full_size() The extra space which is used to store the obj_cgroup membership is only valid when kmemcg is enabled. 
The kmemcg can be disabled via the kernel parameter "cgroup.memory=nokmem" at boot time. This helper is also used in non-memcg code, for example the tracepoint, so we should fix it. It was found by code review when I was implementing bpf memory usage[1]. No real issue happens in production environment. [1]. https://lwn.net/Articles/921991/ Link: https://lkml.kernel.org/r/20230214153549.12291-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Roman Gushchin Acked-by: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Vasily Averin Signed-off-by: Andrew Morton --- mm/percpu-internal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 70b1ea23f4d2..f9847c131998 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -4,6 +4,7 @@ #include #include +#include /* * pcpu_block_md is the metadata block struct. @@ -118,14 +119,15 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) * @size: size of area to allocate in bytes * * For each accounted object there is an extra space which is used to store - * obj_cgroup membership. Charge it too. + * obj_cgroup membership if kmemcg is not disabled. Charge it too. */ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; #ifdef CONFIG_MEMCG_KMEM - extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); + if (!mem_cgroup_kmem_disabled()) + extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif return size * num_possible_cpus() + extra_size; From 9325ddf90ec3a801c09da374b74532d4589a7346 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 14 Feb 2023 16:07:28 +0200 Subject: [PATCH 493/505] m68k/nommu: add missing definition of ARCH_PFN_OFFSET Patch series "fixups for generic implementation of pfn_valid()". Guenter reported boot failures on m68k-nommu and sh caused by the switch to the generic implementation of pfn_valid(): https://lore.kernel.org/all/20230212173513.GA4052259@roeck-us.net https://lore.kernel.org/all/20230212161320.GA3784076@roeck-us.net These are small fixups that address the issues. This patch (of 2): On m68k/nommu RAM does not necessarily start at 0x0 and when it does not pfn_valid() uses a wrong offset into the memory map which causes silent boot failures. Define ARCH_PFN_OFFSET to make pfn_valid() use the correct offset. 
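For context, the generic FLATMEM pfn_valid() that both fixups in this series feed into boils down to roughly the following (a paraphrase of asm-generic/memory_model.h, not a verbatim copy):

	static inline int pfn_valid(unsigned long pfn)
	{
		unsigned long pfn_offset = ARCH_PFN_OFFSET;	/* defaults to 0 */

		return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr;
	}

So an architecture must both define ARCH_PFN_OFFSET when RAM does not start at pfn 0 (this patch) and initialize max_mapnr (the following sh patch), or valid pfns are silently rejected.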
Link: https://lkml.kernel.org/r/20230214140729.1649961-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20230214140729.1649961-2-rppt@kernel.org Fixes: d82f07f06cf8 ("m68k: use asm-generic/memory_model.h for both MMU and !MMU") Reported-by: Guenter Roeck Signed-off-by: Mike Rapoport (IBM) Acked-by: Greg Ungerer Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Rich Felker Cc: Yoshinori Sato Cc: John Paul Adrian Glaubitz Signed-off-by: Andrew Morton --- arch/m68k/include/asm/page_no.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 43ff6b109ebb..060e4c0e7605 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -28,6 +28,8 @@ extern unsigned long memory_end; #define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ ((unsigned long)(kaddr) < memory_end)) +#define ARCH_PFN_OFFSET PHYS_PFN(PAGE_OFFSET_RAW) + #endif /* __ASSEMBLY__ */ #endif /* _M68K_PAGE_NO_H */ From b4fb12e6c74791ac4c5c98b845628c576366b889 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 14 Feb 2023 16:07:29 +0200 Subject: [PATCH 494/505] sh: initialize max_mapnr sh never initializes max_mapnr, which is used by the generic implementation of pfn_valid(). Initialize max_mapnr with set_max_mapnr() in sh::paging_init(). Link: https://lkml.kernel.org/r/20230214140729.1649961-3-rppt@kernel.org Fixes: e5080a967785 ("mm, arch: add generic implementation of pfn_valid() for FLATMEM") Reported-by: Guenter Roeck Signed-off-by: Mike Rapoport (IBM) Acked-by: John Paul Adrian Glaubitz Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sh/mm/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 506784702430..bf1b54055316 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -301,6 +301,7 @@ void __init paging_init(void) */ max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; min_low_pfn = __MEMORY_START >> PAGE_SHIFT; + set_max_mapnr(max_low_pfn - min_low_pfn); nodes_clear(node_online_map); From f7a449f779608efe1941a0e0c4bd7b5f57000be7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 13 Feb 2023 11:29:22 -0800 Subject: [PATCH 495/505] mm: memcontrol: rename memcg_kmem_enabled() Currently there are two kmem-related helper functions with confusing semantics: memcg_kmem_enabled() and mem_cgroup_kmem_disabled(). The problem is that the obvious expectation that memcg_kmem_enabled() == !mem_cgroup_kmem_disabled() can be false. mem_cgroup_kmem_disabled() is similar to mem_cgroup_disabled(): it returns true only if CONFIG_MEMCG_KMEM is not set or the kmem accounting is disabled using a boot time kernel option "cgroup.memory=nokmem". It never changes the value dynamically. memcg_kmem_enabled() is different: it always returns false until the first non-root memory cgroup comes online (assuming the kernel memory accounting is enabled). Its goal is to improve performance on systems without cgroupfs mounted or the memory controller enabled, or on systems with only the root memory cgroup. To make things more obvious and avoid potential bugs, let's rename memcg_kmem_enabled() to memcg_kmem_online().
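As an illustrative sketch of how the two helpers are meant to be used after the rename (an assumed usage pattern, not a hunk from this patch):

	/* Static question: can kmem accounting ever happen on this boot? */
	if (mem_cgroup_kmem_disabled())
		return;		/* CONFIG_MEMCG_KMEM=n or "cgroup.memory=nokmem" */

	/* Dynamic fast path: has a non-root memcg come online yet? */
	if (memcg_kmem_online())
		__memcg_kmem_charge_page(page, gfp, order);

The rename makes it harder to misread memcg_kmem_online() as the simple negation of mem_cgroup_kmem_disabled().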
Link: https://lkml.kernel.org/r/20230213192922.1146370-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Muchun Song Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: Dennis Zhou Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 +++++++------- mm/memcontrol.c | 8 ++++---- mm/page_alloc.c | 8 ++++---- mm/percpu.c | 2 +- mm/slab.h | 10 +++++----- mm/vmscan.c | 2 +- 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 35478695cabf..5567319027d1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1776,24 +1776,24 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); -extern struct static_key_false memcg_kmem_enabled_key; +extern struct static_key_false memcg_kmem_online_key; -static inline bool memcg_kmem_enabled(void) +static inline bool memcg_kmem_online(void) { - return static_branch_likely(&memcg_kmem_enabled_key); + return static_branch_likely(&memcg_kmem_online_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) __memcg_kmem_uncharge_page(page, order); } @@ -1814,7 +1814,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { struct mem_cgroup *memcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return; rcu_read_lock(); @@ -1854,7 +1854,7 @@ static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) return NULL; } -static inline bool memcg_kmem_enabled(void) +static inline bool memcg_kmem_online(void) { return false; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17335459d8dc..3e3cdb9bed95 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -345,8 +345,8 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ -DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); -EXPORT_SYMBOL(memcg_kmem_enabled_key); +DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key); +EXPORT_SYMBOL(memcg_kmem_online_key); #endif /** @@ -3034,7 +3034,7 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return NULL; if (PageMemcgKmem(page)) { @@ -3746,7 +3746,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) objcg->memcg = memcg; rcu_assign_pointer(memcg->objcg, objcg); - static_branch_enable(&memcg_kmem_enabled_key); + static_branch_enable(&memcg_kmem_online_key); memcg->kmemcg_id = memcg->id.id; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4b6bcec41c8f..4c9ab8b93b1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1410,7 +1410,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * Do not let hwpoison pages hit pcplists/buddy * Untie memcg state and reset page's owner */ - if (memcg_kmem_enabled() && PageMemcgKmem(page)) + if (memcg_kmem_online() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); page_table_check_free(page, order); @@ -1441,7 +1441,7 @@ static __always_inline 
bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageMemcgKmem(page)) + if (memcg_kmem_online() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free && free_page_is_bad(page)) bad++; @@ -5432,7 +5432,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto out; /* Bulk allocator does not support memcg accounting. */ - if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT)) + if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) goto failed; /* Use the single page allocator for one page. */ @@ -5604,7 +5604,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, page = __alloc_pages_slowpath(alloc_gfp, order, &ac); out: - if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && + if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { __free_pages(page, order); page = NULL; diff --git a/mm/percpu.c b/mm/percpu.c index acd78da0493b..28e07ede46f6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1625,7 +1625,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT)) + if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; objcg = get_obj_cgroup_from_current(); diff --git a/mm/slab.h b/mm/slab.h index 63fb4c00d529..43966aa5fadf 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -494,7 +494,7 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return true; if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) @@ -535,7 +535,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, unsigned long off; size_t i; - if (!memcg_kmem_enabled() || !objcg) + if (!memcg_kmem_online() || !objcg) return; for (i = 0; i < size; i++) { @@ -567,7 +567,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, struct obj_cgroup **objcgs; int i; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return; objcgs = slab_objcgs(slab); @@ -649,7 +649,7 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { - if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT)) + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) memcg_alloc_slab_cgroups(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), @@ -659,7 +659,7 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) memcg_free_slab_cgroups(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), diff --git a/mm/vmscan.c b/mm/vmscan.c index 34535bbd4fe9..098c79129c42 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -915,7 +915,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, } /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_enabled() && + if (!memcg_kmem_online() && !(shrinker->flags & SHRINKER_NONSLAB)) continue; From 9701c9ff8311fed118fd09d962a90e254e761d97 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Feb 2023 14:00:56 +0100 Subject: [PATCH 496/505] kasan: mark addr_has_metadata __always_inline Patch series "objtool warning fixes", v2. 
These are three of the easier fixes for objtool warnings around kasan/kmsan/kcsan. I dropped one patch since Peter had come up with a better fix, and adjusted the changelog text based on feedback. This patch (of 3): When the compiler decides not to inline this function, objtool complains about incorrect UACCESS state: mm/kasan/generic.o: warning: objtool: __asan_load2+0x11: call to addr_has_metadata() with UACCESS enabled Link: https://lore.kernel.org/all/20230208164011.2287122-1-arnd@kernel.org/ Link: https://lkml.kernel.org/r/20230215130058.3836177-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Acked-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Kuan-Ying Lee Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 308fb70fd40a..8fae87ab99cc 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -297,7 +297,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool addr_has_metadata(const void *addr) +static __always_inline bool addr_has_metadata(const void *addr) { return (kasan_reset_tag(addr) >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); @@ -316,7 +316,7 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -static inline bool addr_has_metadata(const void *addr) +static __always_inline bool addr_has_metadata(const void *addr) { return (is_vmalloc_addr(addr) || virt_addr_valid(addr)); } From e75a698859a3f62825af06987da6b3e6466e6e56 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Feb 2023 14:00:57 +0100 Subject: [PATCH 497/505] kmsan: disable ftrace in kmsan core code objtool warns about some suspicous code inside of kmsan: vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_n+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_n+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_1+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_1+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_2+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_2+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_4+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_4+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_load_8+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_metadata_ptr_for_store_8+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_instrument_asm_store+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_chain_origin+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_poison_alloca+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_warning+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: __msan_get_context_state+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: 
objtool: kmsan_copy_to_user+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: kmsan_unpoison_memory+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: kmsan_unpoison_entry_regs+0x4: call to __fentry__() with UACCESS enabled vmlinux.o: warning: objtool: kmsan_report+0x4: call to __fentry__() with UACCESS enabled The Makefile contained a line to turn off ftrace for the entire directory, but this does not work. Replace it with individual lines, matching the approach in kasan. Link: https://lkml.kernel.org/r/20230215130058.3836177-3-arnd@kernel.org Signed-off-by: Arnd Bergmann Fixes: f80be4571b19 ("kmsan: add KMSAN runtime core") Acked-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Kuan-Ying Lee Cc: Marco Elver Cc: Peter Zijlstra (Intel) Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kmsan/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile index 98eab2856626..91cfdde642d1 100644 --- a/mm/kmsan/Makefile +++ b/mm/kmsan/Makefile @@ -14,7 +14,13 @@ CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack) CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING -CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) +# Disable ftrace to avoid recursion. +CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_hooks.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_instrumentation.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) From d5d469247264e56960705dc5ae7e1d014861fe40 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Feb 2023 14:00:58 +0100 Subject: [PATCH 498/505] objtool: add UACCESS exceptions for __tsan_volatile_read/write A lot of the tsan helpers are already excempt from the UACCESS warnings, but some more functions were added that need the same thing: kernel/kcsan/core.o: warning: objtool: __tsan_volatile_read16+0x0: call to __tsan_unaligned_read16() with UACCESS enabled kernel/kcsan/core.o: warning: objtool: __tsan_volatile_write16+0x0: call to __tsan_unaligned_write16() with UACCESS enabled vmlinux.o: warning: objtool: __tsan_unaligned_volatile_read16+0x4: call to __tsan_unaligned_read16() with UACCESS enabled vmlinux.o: warning: objtool: __tsan_unaligned_volatile_write16+0x4: call to __tsan_unaligned_write16() with UACCESS enabled As Marco points out, these functions don't even call each other explicitly but instead gcc (but not clang) notices the functions being identical and turns one symbol into a direct branch to the other. 
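To illustrate the folding the changelog refers to, a hypothetical userspace example (not from the kernel tree; whether the merge happens depends on the gcc version and options such as -O2 and -fipa-icf):

	/* Two bodies that are identical after optimization ... */
	unsigned short read16_a(const unsigned short *p) { return *p; }
	unsigned short read16_b(const unsigned short *p) { return *p; }

	/* ... may be emitted as one body plus a jump, so objtool sees what
	 * looks like a branch from read16_b into read16_a. */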
Link: https://lkml.kernel.org/r/20230215130058.3836177-4-arnd@kernel.org Fixes: 75d75b7a4d54 ("kcsan: Support distinguishing volatile accesses") Signed-off-by: Arnd Bergmann Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Josh Poimboeuf Cc: Kuan-Ying Lee Cc: Peter Zijlstra (Intel) Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- tools/objtool/check.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4b7c8b33069e..b1a5f658673f 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1186,6 +1186,8 @@ static const char *uaccess_safe_builtin[] = { "__tsan_atomic64_compare_exchange_val", "__tsan_atomic_thread_fence", "__tsan_atomic_signal_fence", + "__tsan_unaligned_read16", + "__tsan_unaligned_write16", /* KCOV */ "write_comp_data", "check_kcov_mode", From be2d57563822b7e00b2b16d9354637c4b6d6d5cc Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:34 +0800 Subject: [PATCH 499/505] mm: change to return bool for folio_isolate_lru() Patch series "Change the return value for page isolation functions", v3. Currently the page isolation functions do not return a boolean to indicate success or not; instead they return a negative error when they fail to isolate a page. So the code below, used in most places, looks like a boolean success/failure check, which can confuse people about whether the isolation is successful. if (folio_isolate_lru(folio)) continue; Moreover, the page isolation functions only return 0 or -EBUSY, and most users do not care about the negative error except for a few, thus we can convert all page isolation functions to return a boolean value, which removes the confusion and makes the code clearer. No functional changes intended in this patch series. This patch (of 4): Currently folio_isolate_lru() does not return a boolean value to indicate isolation success or not; however, the code below checking the return value can make people think it is a boolean success/failure check, which makes it easy to make mistakes (see the fix patch[1]). if (folio_isolate_lru(folio)) continue; Thus it's better to explicitly check the negative error value returned by folio_isolate_lru(), which makes the code clearer per Linus's suggestion[2]. Moreover, Matthew suggested we can convert the isolation functions to return a boolean[3], since most users do not care about the negative error value, and this also removes the confusion of checking the return value. So this patch converts folio_isolate_lru() to return a boolean value, where 'true' indicates the folio isolation is successful and 'false' means a failure to isolate. Meanwhile, change all users' logic of checking the isolation state. No functional changes intended.
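A minimal before/after sketch of the call-site pattern this series is about (illustrative; the real call sites are in the hunks of the following patches):

	/* Before: 0 means success, so this "failure" branch reads like a success check. */
	if (folio_isolate_lru(folio))		/* actually -EBUSY, i.e. not isolated */
		continue;
	list_add_tail(&folio->lru, movable_page_list);

	/* After: bool, true means the folio was isolated. */
	if (!folio_isolate_lru(folio))
		continue;
	list_add_tail(&folio->lru, movable_page_list);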
[1] https://lore.kernel.org/all/20230131063206.28820-1-Kuan-Ying.Lee@mediatek.com/T/#u [2] https://lore.kernel.org/all/CAHk-=wiBrY+O-4=2mrbVyxR+hOqfdJ=Do6xoucfJ9_5az01L4Q@mail.gmail.com/ [3] https://lore.kernel.org/all/Y+sTFqwMNAjDvxw3@casper.infradead.org/ Link: https://lkml.kernel.org/r/cover.1676424378.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/8a4e3679ed4196168efadf7ea36c038f2f7d5aa9.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Cc: Johannes Weiner Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 2 +- mm/folio-compat.c | 8 +++++++- mm/gup.c | 2 +- mm/internal.h | 2 +- mm/khugepaged.c | 2 +- mm/madvise.c | 4 ++-- mm/mempolicy.c | 2 +- mm/vmscan.c | 10 +++++----- 8 files changed, 19 insertions(+), 13 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index b4df9b9bcc0a..607bb69e526c 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -246,7 +246,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) folio_clear_referenced(folio); folio_test_clear_young(folio); - if (folio_isolate_lru(folio)) { + if (!folio_isolate_lru(folio)) { folio_put(folio); continue; } diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 18c48b557926..540373cf904e 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -115,9 +115,15 @@ EXPORT_SYMBOL(grab_cache_page_write_begin); int isolate_lru_page(struct page *page) { + bool ret; + if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) return -EBUSY; - return folio_isolate_lru((struct folio *)page); + ret = folio_isolate_lru((struct folio *)page); + if (ret) + return 0; + + return -EBUSY; } void putback_lru_page(struct page *page) diff --git a/mm/gup.c b/mm/gup.c index b0885f70579c..eab18ba045db 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1939,7 +1939,7 @@ static unsigned long collect_longterm_unpinnable_pages( drain_allow = false; } - if (folio_isolate_lru(folio)) + if (!folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); diff --git a/mm/internal.h b/mm/internal.h index dfb37e94e140..8645e8496537 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -188,7 +188,7 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, * in mm/vmscan.c: */ int isolate_lru_page(struct page *page); -int folio_isolate_lru(struct folio *folio); +bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index bd54b957f69a..15eebab0fbb5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1950,7 +1950,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (folio_isolate_lru(folio)) { + if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } diff --git a/mm/madvise.c b/mm/madvise.c index 5a5a687d03c2..c2202f51e9dd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -406,7 +406,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio_clear_referenced(folio); folio_test_clear_young(folio); if (pageout) { - if (!folio_isolate_lru(folio)) { + if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) 
folio_putback_lru(folio); else @@ -500,7 +500,7 @@ regular_folio: folio_clear_referenced(folio); folio_test_clear_young(folio); if (pageout) { - if (!folio_isolate_lru(folio)) { + if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0919c7a719d4..2751bc3310fd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1033,7 +1033,7 @@ static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, * expensive, so check the estimated mapcount of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { - if (!folio_isolate_lru(folio)) { + if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), diff --git a/mm/vmscan.c b/mm/vmscan.c index 098c79129c42..9c1c5e8b24b8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2337,12 +2337,12 @@ move: * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. * - * Return: 0 if the folio was removed from an LRU list. - * -EBUSY if the folio was not on an LRU list. + * Return: true if the folio was removed from an LRU list. + * false if the folio was not on an LRU list. */ -int folio_isolate_lru(struct folio *folio) +bool folio_isolate_lru(struct folio *folio) { - int ret = -EBUSY; + bool ret = false; VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); @@ -2353,7 +2353,7 @@ int folio_isolate_lru(struct folio *folio) lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); unlock_page_lruvec_irq(lruvec); - ret = 0; + ret = true; } return ret; From f7f9c00dfafffd7a5a1a5685e2d874c64913e2ed Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:35 +0800 Subject: [PATCH 500/505] mm: change to return bool for isolate_lru_page() The isolate_lru_page() can only return 0 or -EBUSY, and most users did not care about the negative error of isolate_lru_page(), except one user in add_page_for_migration(). So we can convert the isolate_lru_page() to return a boolean value, which can help to make the code more clear when checking the return value of isolate_lru_page(). Also convert all users' logic of checking the isolation state. No functional changes intended. 
Link: https://lkml.kernel.org/r/3074c1ab628d9dbf139b33f248a8bc253a3f95f0.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/folio-compat.c | 12 +++--------- mm/internal.h | 2 +- mm/khugepaged.c | 2 +- mm/memcontrol.c | 4 ++-- mm/memory-failure.c | 4 ++-- mm/memory_hotplug.c | 8 +++++--- mm/migrate.c | 9 ++++++--- mm/migrate_device.c | 2 +- 8 files changed, 21 insertions(+), 22 deletions(-) diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 540373cf904e..cabcd1de9ecb 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -113,17 +113,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, } EXPORT_SYMBOL(grab_cache_page_write_begin); -int isolate_lru_page(struct page *page) +bool isolate_lru_page(struct page *page) { - bool ret; - if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) - return -EBUSY; - ret = folio_isolate_lru((struct folio *)page); - if (ret) - return 0; - - return -EBUSY; + return false; + return folio_isolate_lru((struct folio *)page); } void putback_lru_page(struct page *page) diff --git a/mm/internal.h b/mm/internal.h index 8645e8496537..fc01fd092ea5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -187,7 +187,7 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, /* * in mm/vmscan.c: */ -int isolate_lru_page(struct page *page); +bool isolate_lru_page(struct page *page); bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 15eebab0fbb5..987281ead49e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -636,7 +636,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ - if (isolate_lru_page(page)) { + if (!isolate_lru_page(page)) { unlock_page(page); result = SCAN_DEL_PAGE_LRU; goto out; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3e3cdb9bed95..25f2465d5a37 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6176,7 +6176,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { page = target.page; - if (!isolate_lru_page(page)) { + if (isolate_lru_page(page)) { if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; @@ -6226,7 +6226,7 @@ retry: */ if (PageTransCompound(page)) goto put; - if (!device && isolate_lru_page(page)) + if (!device && !isolate_lru_page(page)) goto put; if (!mem_cgroup_move_account(page, false, mc.from, mc.to)) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index db85c2d37f70..e504362fdb23 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -846,7 +846,7 @@ static const char * const action_page_types[] = { */ static int delete_from_lru_cache(struct page *p) { - if (!isolate_lru_page(p)) { + if (isolate_lru_page(p)) { /* * Clear sensible page flags, so that the buddy system won't * complain when the page is unpoison-and-freed. 
@@ -2513,7 +2513,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool lru = !__PageMovable(page); if (lru) - isolated = !isolate_lru_page(page); + isolated = isolate_lru_page(page); else isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a1e8c3e9ab08..5fc2dcf4e3ab 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1632,6 +1632,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct folio *folio; + bool isolated; if (!pfn_valid(pfn)) continue; @@ -1667,9 +1668,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We can skip free pages. And we can deal with pages on * LRU and non-lru movable pages. */ - if (PageLRU(page)) - ret = isolate_lru_page(page); - else + if (PageLRU(page)) { + isolated = isolate_lru_page(page); + ret = isolated ? 0 : -EBUSY; + } else ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ list_add_tail(&page->lru, &source); diff --git a/mm/migrate.c b/mm/migrate.c index ef68a1aff35c..53010a142e7f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2132,11 +2132,14 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, } } else { struct page *head; + bool isolated; head = compound_head(page); - err = isolate_lru_page(head); - if (err) + isolated = isolate_lru_page(head); + if (!isolated) { + err = -EBUSY; goto out_putpage; + } err = 1; list_add_tail(&head->lru, pagelist); @@ -2541,7 +2544,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 0; } - if (isolate_lru_page(page)) + if (!isolate_lru_page(page)) return 0; mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 6c3740318a98..d30c9de60b0d 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -388,7 +388,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, allow_drain = false; } - if (isolate_lru_page(page)) { + if (!isolate_lru_page(page)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; From 9747b9e92418b61c2281561e0651803f1fad0159 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:36 +0800 Subject: [PATCH 501/505] mm: hugetlb: change to return bool for isolate_hugetlb() Now the isolate_hugetlb() only returns 0 or -EBUSY, and most users did not care about the negative value, thus we can convert the isolate_hugetlb() to return a boolean value to make code more clear when checking the hugetlb isolation state. Moreover converts 2 users which will consider the negative value returned by isolate_hugetlb(). No functional changes intended. 
[akpm@linux-foundation.org: shorten locked section, per SeongJae Park] Link: https://lkml.kernel.org/r/12a287c5bebc13df304387087bbecc6421510849.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 13 ++++++++----- mm/memory-failure.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 7 +++---- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index df6dd624ccfe..5f5e4177b2e0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct folio *folio, struct list_head *list); +bool isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -413,9 +413,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) { - return -EBUSY; + return false; } static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3a01a9dbf445..07abcb6eb203 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2925,12 +2925,15 @@ retry: */ goto free_new; } else if (folio_ref_count(old_folio)) { + bool isolated; + /* * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(old_folio, list); + isolated = isolate_hugetlb(old_folio, list); + ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -3005,7 +3008,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7251,15 +7254,15 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. 
*/ -int isolate_hugetlb(struct folio *folio, struct list_head *list) +bool isolate_hugetlb(struct folio *folio, struct list_head *list) { - int ret = 0; + bool ret = true; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio) || !folio_test_hugetlb_migratable(folio) || !folio_try_get(folio)) { - ret = -EBUSY; + ret = false; goto unlock; } folio_clear_hugetlb_migratable(folio); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e504362fdb23..8604753bc644 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2508,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page_folio(page), pagelist); + isolated = isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2751bc3310fd..a256a241fd1d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -609,7 +609,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(folio, qp->pagelist) && + if (!isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate folio but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index 53010a142e7f..2db546a0618c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2095,6 +2095,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; struct page *page; int err; + bool isolated; mmap_read_lock(mm); err = -EFAULT; @@ -2126,13 +2127,11 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page_folio(page), pagelist); - if (!err) - err = 1; + isolated = isolate_hugetlb(page_folio(page), pagelist); + err = isolated ? 1 : -EBUSY; } } else { struct page *head; - bool isolated; head = compound_head(page); isolated = isolate_lru_page(head); From cd7755800eb54e8522f5e51f4e71e6494c1f1572 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:37 +0800 Subject: [PATCH 502/505] mm: change to return bool for isolate_movable_page() Now the isolate_movable_page() can only return 0 or -EBUSY, and no users will care about the negative return value, thus we can convert the isolate_movable_page() to return a boolean value to make the code more clear when checking the movable page isolation state. No functional changes intended. 
[akpm@linux-foundation.org: remove unneeded comment, per Matthew] Link: https://lkml.kernel.org/r/cb877f73f4fff8d309611082ec740a7065b1ade0.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/migrate.h | 6 +++--- mm/compaction.c | 2 +- mm/memory-failure.c | 4 ++-- mm/memory_hotplug.c | 10 +++++----- mm/migrate.c | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index c88b96b48be7..6b252f519c86 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -71,7 +71,7 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded); extern struct page *alloc_migration_target(struct page *page, unsigned long private); -extern int isolate_movable_page(struct page *page, isolate_mode_t mode); +extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); @@ -92,8 +92,8 @@ static inline int migrate_pages(struct list_head *l, new_page_t new, static inline struct page *alloc_migration_target(struct page *page, unsigned long private) { return NULL; } -static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) - { return -EBUSY; } +static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode) + { return false; } static inline int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) diff --git a/mm/compaction.c b/mm/compaction.c index d73578af44cc..ad7409f70519 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -976,7 +976,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - if (!isolate_movable_page(page, mode)) + if (isolate_movable_page(page, mode)) goto isolate_success; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8604753bc644..a1ede7bdce95 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2515,8 +2515,8 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) if (lru) isolated = isolate_lru_page(page); else - isolated = !isolate_movable_page(page, - ISOLATE_UNEVICTABLE); + isolated = isolate_movable_page(page, + ISOLATE_UNEVICTABLE); if (isolated) { list_add(&page->lru, pagelist); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5fc2dcf4e3ab..5f73fd894b89 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1668,18 +1668,18 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We can skip free pages. And we can deal with pages on * LRU and non-lru movable pages. */ - if (PageLRU(page)) { + if (PageLRU(page)) isolated = isolate_lru_page(page); - ret = isolated ? 
0 : -EBUSY; - } else - ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); - if (!ret) { /* Success */ + else + isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE); + if (isolated) { list_add_tail(&page->lru, &source); if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); } else { + ret = -EBUSY; if (__ratelimit(&migrate_rs)) { pr_warn("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); diff --git a/mm/migrate.c b/mm/migrate.c index 2db546a0618c..9a101c7bb8ff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -58,7 +58,7 @@ #include "internal.h" -int isolate_movable_page(struct page *page, isolate_mode_t mode) +bool isolate_movable_page(struct page *page, isolate_mode_t mode) { struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; @@ -119,14 +119,14 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) folio_set_isolated(folio); folio_unlock(folio); - return 0; + return true; out_no_isolated: folio_unlock(folio); out_putfolio: folio_put(folio); out: - return -EBUSY; + return false; } static void putback_movable_folio(struct folio *folio) From 7a079ba20090ab50d2f4203ceccd1e0f4becd1a6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 15 Feb 2023 15:58:00 -0500 Subject: [PATCH 503/505] mm/uffd: fix comment in handling pte markers The comment is obsolete after f369b07c8614 ("mm/uffd: reset write protection when unregister with wp-mode", 2022-08-20). Remove it. Link: https://lkml.kernel.org/r/20230215205800.223549-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Signed-off-by: Andrew Morton --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7a04a1130ec1..f456f3b5049c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3627,9 +3627,7 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) { /* * Just in case there're leftover special ptes even after the region - * got unregistered - we can simply clear them. We can also do that - * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp - * ranges, but it should be more efficient to be done lazily here. + * got unregistered - we can simply clear them. */ if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma))) return pte_marker_clear(vmf); From 32cf666eab720b597650d9dea6ff07e99cd36b3d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 16 Feb 2023 17:07:03 +0000 Subject: [PATCH 504/505] mm/memory_hotplug: cleanup return value handing in do_migrate_range() Return value mechanism of do_migrate_range() is not very simple, while no caller of the function checks the return value. Make the function return nothing to be more simple, and cleanup related unnecessary code. 
Link: https://lkml.kernel.org/r/20230216170703.64574-1-sj@kernel.org Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Signed-off-by: SeongJae Park Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5f73fd894b89..db3b270254f1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1620,12 +1620,10 @@ found: return 0; } -static int -do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page, *head; - int ret = 0; LIST_HEAD(source); static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1679,7 +1677,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page_is_file_lru(page)); } else { - ret = -EBUSY; if (__ratelimit(&migrate_rs)) { pr_warn("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); @@ -1693,6 +1690,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) .nmask = &nmask, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + int ret; /* * We have checked that migration range is on a single zone so @@ -1721,8 +1719,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) putback_movable_pages(&source); } } - - return ret; } static int __init cmdline_parse_movable_node(char *p) From f9366f4c2a29d14f5992b195e268240c2deb116e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 16 Feb 2023 14:44:24 -0800 Subject: [PATCH 505/505] include/linux/migrate.h: remove unneeded externs As suggested by Matthew. Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/migrate.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6b252f519c86..6241a1596a75 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -62,16 +62,16 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION -extern void putback_movable_pages(struct list_head *l); +void putback_movable_pages(struct list_head *l); int migrate_folio_extra(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); -extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason, - unsigned int *ret_succeeded); -extern struct page *alloc_migration_target(struct page *page, unsigned long private); -extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); +int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); +struct page *alloc_migration_target(struct page *page, unsigned long private); +bool isolate_movable_page(struct page *page, isolate_mode_t mode); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); @@ -142,8 +142,8 @@ const struct movable_operations *page_movable_ops(struct page *page) } #ifdef CONFIG_NUMA_BALANCING -extern int migrate_misplaced_page(struct page *page, - struct vm_area_struct *vma, int node); +int migrate_misplaced_page(struct page *page, 
struct vm_area_struct *vma, + int node); #else static inline int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node)