Merge branch 'akpm' (patches from Andrew)

Pull updates from Andrew Morton:
 "Most of -mm and quite a number of other subsystems: hotfixes, scripts,
  ocfs2, misc, lib, binfmt, init, reiserfs, exec, dma-mapping, kcov.

  MM is fairly quiet this time.  Holidays, I assume"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
  kcov: ignore fault-inject and stacktrace
  include/linux/io-mapping.h-mapping: use PHYS_PFN() macro in io_mapping_map_atomic_wc()
  execve: warn if process starts with executable stack
  reiserfs: prevent NULL pointer dereference in reiserfs_insert_item()
  init/main.c: fix misleading "This architecture does not have kernel memory protection" message
  init/main.c: fix quoted value handling in unknown_bootoption
  init/main.c: remove unnecessary repair_env_string in do_initcall_level
  init/main.c: log arguments and environment passed to init
  fs/binfmt_elf.c: coredump: allow process with empty address space to coredump
  fs/binfmt_elf.c: coredump: delete duplicated overflow check
  fs/binfmt_elf.c: coredump: allocate core ELF header on stack
  fs/binfmt_elf.c: make BAD_ADDR() unlikely
  fs/binfmt_elf.c: better codegen around current->mm
  fs/binfmt_elf.c: don't copy ELF header around
  fs/binfmt_elf.c: fix ->start_code calculation
  fs/binfmt_elf.c: smaller code generation around auxv vector fill
  lib/find_bit.c: uninline helper _find_next_bit()
  lib/find_bit.c: join _find_next_bit{_le}
  uapi: rename ext2_swab() to swab() and share globally in swab.h
  lib/scatterlist.c: adjust indentation in __sg_alloc_table
  ...
This commit is contained in:
Linus Torvalds
2020-01-31 12:16:36 -08:00
136 changed files with 2722 additions and 1291 deletions

View File

@@ -20,6 +20,7 @@ KCOV_INSTRUMENT_kmemleak.o := n
KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
KCOV_INSTRUMENT_failslab.o := n
CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)

View File

@@ -21,6 +21,7 @@ struct backing_dev_info noop_backing_dev_info = {
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
const char *bdi_unknown_name = "(unknown)";
/*
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU

View File

@@ -46,7 +46,15 @@ void __dump_page(struct page *page, const char *reason)
{
struct address_space *mapping;
bool page_poisoned = PagePoisoned(page);
/*
* Accessing the pageblock without the zone lock. It could change to
* "isolate" again in the meantime, but since we are just dumping the
* state for debugging, it should be fine to accept a bit of
* inaccuracy here due to racing.
*/
bool page_cma = is_migrate_cma_page(page);
int mapcount;
char *type = "";
/*
* If struct page is poisoned don't access Page*() functions as that
@@ -78,9 +86,9 @@ void __dump_page(struct page *page, const char *reason)
page, page_ref_count(page), mapcount,
page->mapping, page_to_pgoff(page));
if (PageKsm(page))
pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags);
type = "ksm ";
else if (PageAnon(page))
pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags);
type = "anon ";
else if (mapping) {
if (mapping->host && mapping->host->i_dentry.first) {
struct dentry *dentry;
@@ -88,10 +96,12 @@ void __dump_page(struct page *page, const char *reason)
pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
} else
pr_warn("%ps\n", mapping->a_ops);
pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
}
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
page_cma ? " CMA" : "");
hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,

View File

@@ -121,8 +121,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
}
}
if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
__func__, (u64)phys_addr, size))
if (WARN(slot < 0, "%s(%pa, %08lx) not found slot\n",
__func__, &phys_addr, size))
return NULL;
/* Don't allow wraparound or zero size */
@@ -158,8 +158,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
--idx;
--nrpages;
}
WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
__func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n",
__func__, &phys_addr, size, slot, offset, slot_virt[slot]);
prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];

View File

@@ -632,33 +632,6 @@ static bool mapping_needs_writeback(struct address_space *mapping)
return mapping->nrpages;
}
/**
 * filemap_write_and_wait - start writeback on a mapping and wait for it
 * @mapping: the address_space to write out
 *
 * When mapping_needs_writeback() says there is nothing to do, only the
 * result of filemap_check_errors() is reported.
 *
 * Return: the filemap_fdatawrite() error if it was non-zero, otherwise the
 * result of filemap_fdatawait(); or the filemap_check_errors() result when
 * no writeback was needed.
 */
int filemap_write_and_wait(struct address_space *mapping)
{
	int write_err, wait_err;

	if (!mapping_needs_writeback(mapping))
		return filemap_check_errors(mapping);

	write_err = filemap_fdatawrite(mapping);

	/*
	 * -EIO is the special case: it may indicate the worst thing (e.g. a
	 * bug) happened, so do not wait; just clear any previously stored
	 * errors and report it.
	 */
	if (write_err == -EIO) {
		filemap_check_errors(mapping);
		return write_err;
	}

	/*
	 * For any other outcome the pages may still have been written
	 * partially (e.g. -ENOSPC), so wait for them. The write error, if
	 * any, takes precedence over the wait result.
	 */
	wait_err = filemap_fdatawait(mapping);
	return write_err ? write_err : wait_err;
}
EXPORT_SYMBOL(filemap_write_and_wait);
/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
@@ -680,7 +653,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
/*
* Even if the above returned error, the pages may be
* written partially (e.g. -ENOSPC), so we wait for it.
* But the -EIO is special case, it may indicate the worst
* thing (e.g. bug) happened, so we avoid waiting for it.
*/
if (err != -EIO) {
int err2 = filemap_fdatawait_range(mapping,
lstart, lend);

501
mm/gup.c
View File

@@ -29,8 +29,23 @@ struct follow_page_context {
unsigned int page_mask;
};
/*
 * Look up the compound head of @page and try to add @refs references to it.
 *
 * Returns the head page on success. Returns NULL when the head's reference
 * count reads negative (a one-time warning is emitted) or when
 * page_cache_add_speculative() reports failure.
 */
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
	struct page *head = compound_head(page);

	/* The count check must come first; the add is skipped if it trips. */
	if (WARN_ON_ONCE(page_ref_count(head) < 0) ||
	    unlikely(!page_cache_add_speculative(head, refs)))
		return NULL;

	return head;
}
/**
* put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
* @npages: number of pages in the @pages array.
* @make_dirty: whether to mark the pages dirty
@@ -40,19 +55,19 @@ struct follow_page_context {
*
* For each page in the @pages array, make that page (or its head page, if a
* compound page) dirty, if @make_dirty is true, and if the page was previously
* listed as clean. In any case, releases all pages using put_user_page(),
* possibly via put_user_pages(), for the non-dirty case.
* listed as clean. In any case, releases all pages using unpin_user_page(),
* possibly via unpin_user_pages(), for the non-dirty case.
*
* Please see the put_user_page() documentation for details.
* Please see the unpin_user_page() documentation for details.
*
* set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
* required, then the caller should a) verify that this is really correct,
* because _lock() is usually required, and b) hand code it:
* set_page_dirty_lock(), put_user_page().
* set_page_dirty_lock(), unpin_user_page().
*
*/
void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty)
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty)
{
unsigned long index;
@@ -63,7 +78,7 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
*/
if (!make_dirty) {
put_user_pages(pages, npages);
unpin_user_pages(pages, npages);
return;
}
@@ -91,21 +106,21 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
*/
if (!PageDirty(page))
set_page_dirty_lock(page);
put_user_page(page);
unpin_user_page(page);
}
}
EXPORT_SYMBOL(put_user_pages_dirty_lock);
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
/**
* put_user_pages() - release an array of gup-pinned pages.
* unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
* @npages: number of pages in the @pages array.
*
* For each page in the @pages array, release the page using put_user_page().
* For each page in the @pages array, release the page using unpin_user_page().
*
* Please see the put_user_page() documentation for details.
* Please see the unpin_user_page() documentation for details.
*/
void put_user_pages(struct page **pages, unsigned long npages)
void unpin_user_pages(struct page **pages, unsigned long npages)
{
unsigned long index;
@@ -115,9 +130,9 @@ void put_user_pages(struct page **pages, unsigned long npages)
* single operation to the head page should suffice.
*/
for (index = 0; index < npages; index++)
put_user_page(pages[index]);
unpin_user_page(pages[index]);
}
EXPORT_SYMBOL(put_user_pages);
EXPORT_SYMBOL(unpin_user_pages);
#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
@@ -179,6 +194,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t *ptep, pte;
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
@@ -323,7 +342,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
@@ -433,7 +452,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
page = follow_huge_pud(mm, address, pud, flags);
if (page)
return page;
@@ -796,7 +815,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
start = untagged_addr(start);
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
@@ -1020,7 +1039,16 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
BUG_ON(*locked != 1);
}
if (pages)
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
* is to set FOLL_GET if the caller wants pages[] filled in (but has
* carelessly failed to specify FOLL_GET), so keep doing that, but only
* for FOLL_GET, not for the newer FOLL_PIN.
*
* FOLL_PIN always expects pages to be non-null, but no need to assert
* that here, as any failures will be obvious enough.
*/
if (pages && !(flags & FOLL_PIN))
flags |= FOLL_GET;
pages_done = 0;
@@ -1096,88 +1124,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
return pages_done;
}
/*
* get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
*
* Returns either number of pages pinned (which may be less than the
* number requested), or an error. Details about the return value:
*
* -- If nr_pages is 0, returns 0.
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
* -- If nr_pages is >0, and some pages were pinned, returns the number of
* pages pinned. Again, this may be less than nr_pages.
*
* The caller is responsible for releasing returned @pages, via put_page().
*
* @vmas are valid only as long as mmap_sem is held.
*
* Must be called with mmap_sem held for read or write.
*
* get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
* be called after the page is finished with, and before put_page is called.
*
* get_user_pages is typically used for fewer-copy IO operations, to get a
* handle on the memory by some means other than accesses via the user virtual
* addresses. The pages may be submitted for DMA to devices or accessed via
* their kernel linear mapping (via the kmap APIs). Care should be taken to
* use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*
* get_user_pages should be phased out in favor of
* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
* should use get_user_pages because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	unsigned int remote_flags;

	/*
	 * FIXME: FOLL_LONGTERM as it currently behaves cannot be combined
	 * with FAULT_FLAG_ALLOW_RETRY, because of the FS DAX check that is
	 * required on vmas. No caller of this function uses the flag, so it
	 * is simply refused for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	/* Remote lookups always add FOLL_TOUCH and FOLL_REMOTE. */
	remote_flags = gup_flags | FOLL_TOUCH | FOLL_REMOTE;

	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
				       locked, remote_flags);
}
EXPORT_SYMBOL(get_user_pages_remote);
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
@@ -1611,6 +1557,116 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
/*
* get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
*
* Returns either number of pages pinned (which may be less than the
* number requested), or an error. Details about the return value:
*
* -- If nr_pages is 0, returns 0.
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
* -- If nr_pages is >0, and some pages were pinned, returns the number of
* pages pinned. Again, this may be less than nr_pages.
*
* The caller is responsible for releasing returned @pages, via put_page().
*
* @vmas are valid only as long as mmap_sem is held.
*
* Must be called with mmap_sem held for read or write.
*
* get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
* be called after the page is finished with, and before put_page is called.
*
* get_user_pages is typically used for fewer-copy IO operations, to get a
* handle on the memory by some means other than accesses via the user virtual
* addresses. The pages may be submitted for DMA to devices or accessed via
* their kernel linear mapping (via the kmap APIs). Care should be taken to
* use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*
* get_user_pages should be phased out in favor of
* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
* should use get_user_pages because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
#ifdef CONFIG_MMU
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	/* Remote lookups always add FOLL_TOUCH and FOLL_REMOTE. */
	unsigned int remote_flags = gup_flags | FOLL_TOUCH | FOLL_REMOTE;

	/*
	 * Only the pin_user_pages*() APIs may set FOLL_PIN, and they do so
	 * internally; a caller passing it directly is a bug, so assert:
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
		return -EINVAL;

	/* The common case: no long-term request, take the regular path. */
	if (!(gup_flags & FOLL_LONGTERM))
		return __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
					       vmas, locked, remote_flags);

	/*
	 * Parts of FOLL_LONGTERM behavior are incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas. That only matters when locked is set, and there are callers
	 * that request FOLL_LONGTERM without setting locked. So, allow what
	 * we can and refuse the rest.
	 */
	if (WARN_ON_ONCE(locked))
		return -EINVAL;

	/*
	 * This checks the vmas (even if our vmas arg is NULL) and rejects
	 * the request if DAX isn't allowed in this case:
	 */
	return __gup_longterm_locked(tsk, mm, start, nr_pages, pages, vmas,
				     remote_flags);
}
EXPORT_SYMBOL(get_user_pages_remote);

#else /* CONFIG_MMU */
/* Without CONFIG_MMU this is a stub: nothing is looked up, 0 is returned. */
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	return 0;
}
#endif /* !CONFIG_MMU */
/*
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
@@ -1622,6 +1678,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
/*
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
* never directly by the caller, so enforce that with an assertion:
*/
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
return __gup_longterm_locked(current, current->mm, start, nr_pages,
pages, vmas, gup_flags | FOLL_TOUCH);
}
@@ -1807,20 +1870,6 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
}
/*
 * Look up the compound head of @page and try to add @refs references to it.
 *
 * Returns the head page on success. Returns NULL when the head's reference
 * count reads negative (a one-time warning is emitted) or when
 * page_cache_add_speculative() reports failure.
 */
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
	struct page *head = compound_head(page);

	/* The count check must come first; the add is skipped if it trips. */
	if (WARN_ON_ONCE(page_ref_count(head) < 0) ||
	    unlikely(!page_cache_add_speculative(head, refs)))
		return NULL;

	return head;
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
@@ -1978,6 +2027,29 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
}
#endif
/*
 * Store consecutive struct page pointers, starting at @page, into @pages:
 * one entry for every PAGE_SIZE step from @addr up to @end.
 *
 * Returns the number of entries written. The loop stops only when @addr
 * becomes exactly equal to @end.
 */
static int record_subpages(struct page *page, unsigned long addr,
			   unsigned long end, struct page **pages)
{
	int recorded = 0;

	while (addr != end) {
		pages[recorded++] = page++;
		addr += PAGE_SIZE;
	}

	return recorded;
}
/*
 * Release references on @page: when @refs is greater than one, all but one
 * are subtracted in a single step, and one put_page() is then always issued.
 */
static void put_compound_head(struct page *page, int refs)
{
	VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);

	/*
	 * A put_page() per reference would be needlessly slow; only the
	 * final reference has to go through put_page().
	 */
	if (refs >= 2) {
		int bulk = refs - 1;

		page_ref_sub(page, bulk);
	}

	put_page(page);
}
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
@@ -2007,32 +2079,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
refs = 0;
head = pte_page(pte);
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(head, refs);
if (!head) {
*nr -= refs;
if (!head)
return 0;
}
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
/* Could be optimized better */
*nr -= refs;
while (refs--)
put_page(head);
put_compound_head(head, refs);
return 0;
}
*nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2079,28 +2139,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
}
refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pmd_page(orig), refs);
if (!head) {
*nr -= refs;
if (!head)
return 0;
}
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
*nr -= refs;
while (refs--)
put_page(head);
put_compound_head(head, refs);
return 0;
}
*nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2120,28 +2171,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
}
refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pud_page(orig), refs);
if (!head) {
*nr -= refs;
if (!head)
return 0;
}
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
*nr -= refs;
while (refs--)
put_page(head);
put_compound_head(head, refs);
return 0;
}
*nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2157,28 +2199,20 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
refs = 0;
page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
do {
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pgd_page(orig), refs);
if (!head) {
*nr -= refs;
if (!head)
return 0;
}
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
*nr -= refs;
while (refs--)
put_page(head);
put_compound_head(head, refs);
return 0;
}
*nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2237,7 +2271,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
pud_t pud = READ_ONCE(*pudp);
next = pud_addr_end(addr, end);
if (pud_none(pud))
if (unlikely(!pud_present(pud)))
return 0;
if (unlikely(pud_huge(pud))) {
if (!gup_huge_pud(pud, pudp, addr, next, flags,
@@ -2393,29 +2427,15 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
return ret;
}
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Attempt to pin user pages in memory without taking mm->mmap_sem.
* If not successful, it will fall back to taking the lock and
* calling get_user_pages().
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags,
struct page **pages)
{
unsigned long addr, len, end;
int nr = 0, ret = 0;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN)))
return -EINVAL;
start = untagged_addr(start) & PAGE_MASK;
@@ -2455,4 +2475,103 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
return ret;
}
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour. FOLL_PIN must not be set by the
* caller; if it is, a one-time warning is emitted and -EINVAL is
* returned.
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Attempt to pin user pages in memory without taking mm->mmap_sem.
* If not successful, it will fall back to taking the lock and
* calling get_user_pages().
*
* This is a thin wrapper: after the FOLL_PIN check, all arguments are passed
* unchanged to internal_get_user_pages_fast().
*
* Returns number of pages pinned. This may be fewer than the number requested.
* If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
* -errno.
*/
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
/*
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
* never directly by the caller, so enforce that:
*/
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
/**
* pin_user_pages_fast() - pin user pages in memory without taking locks
* @start: starting user address; passed unchanged to get_user_pages_fast()
* @nr_pages: number of pages from start to pin; passed unchanged
* @gup_flags: flags modifying pin behaviour; passed unchanged
* @pages: array that receives pointers to the pages pinned; passed
* unchanged
*
* For now, this is a placeholder function, until various call sites are
* converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
* this is identical to get_user_pages_fast().
*
* This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
* is NOT intended for Case 2 (RDMA: long-term pins).
*
* Returns whatever get_user_pages_fast() returns for the same arguments.
*/
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
/*
* This is a placeholder, until the pin functionality is activated.
* Until then, just behave like the corresponding get_user_pages*()
* routine.
*/
return get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
/**
* pin_user_pages_remote() - pin pages of a remote process (task != current)
* @tsk: passed unchanged to get_user_pages_remote()
* @mm: passed unchanged to get_user_pages_remote()
* @start: passed unchanged to get_user_pages_remote()
* @nr_pages: passed unchanged to get_user_pages_remote()
* @gup_flags: passed unchanged to get_user_pages_remote()
* @pages: passed unchanged to get_user_pages_remote()
* @vmas: passed unchanged to get_user_pages_remote()
* @locked: passed unchanged to get_user_pages_remote()
*
* For now, this is a placeholder function, until various call sites are
* converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
* this is identical to get_user_pages_remote().
*
* This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
* is NOT intended for Case 2 (RDMA: long-term pins).
*
* Returns whatever get_user_pages_remote() returns for the same arguments.
*/
long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
/*
* This is a placeholder, until the pin functionality is activated.
* Until then, just behave like the corresponding get_user_pages*()
* routine.
*/
return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages,
vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);
/**
* pin_user_pages() - pin user pages in memory for use by other devices
* @start: passed unchanged to get_user_pages()
* @nr_pages: passed unchanged to get_user_pages()
* @gup_flags: passed unchanged to get_user_pages()
* @pages: passed unchanged to get_user_pages()
* @vmas: passed unchanged to get_user_pages()
*
* For now, this is a placeholder function, until various call sites are
* converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
* this is identical to get_user_pages().
*
* This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
* is NOT intended for Case 2 (RDMA: long-term pins).
*
* Returns whatever get_user_pages() returns for the same arguments.
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
/*
* This is a placeholder, until the pin functionality is activated.
* Until then, just behave like the corresponding get_user_pages*()
* routine.
*/
return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
}
EXPORT_SYMBOL(pin_user_pages);

View File

@@ -49,18 +49,21 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = (next - addr) / PAGE_SIZE;
}
/* Filter out most gup flags: only allow a tiny subset here: */
gup->flags &= FOLL_WRITE;
switch (cmd) {
case GUP_FAST_BENCHMARK:
nr = get_user_pages_fast(addr, nr, gup->flags & 1,
nr = get_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
case GUP_LONGTERM_BENCHMARK:
nr = get_user_pages(addr, nr,
(gup->flags & 1) | FOLL_LONGTERM,
gup->flags | FOLL_LONGTERM,
pages + i, NULL);
break;
case GUP_BENCHMARK:
nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
default:

View File

@@ -177,16 +177,13 @@ static ssize_t enabled_store(struct kobject *kobj,
{
ssize_t ret = count;
if (!memcmp("always", buf,
min(sizeof("always")-1, count))) {
if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
} else if (!memcmp("madvise", buf,
min(sizeof("madvise")-1, count))) {
} else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else if (!memcmp("never", buf,
min(sizeof("never")-1, count))) {
} else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else
@@ -250,32 +247,27 @@ static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (!memcmp("always", buf,
min(sizeof("always")-1, count))) {
if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
} else if (!memcmp("defer+madvise", buf,
min(sizeof("defer+madvise")-1, count))) {
} else if (sysfs_streq(buf, "defer+madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
} else if (!memcmp("defer", buf,
min(sizeof("defer")-1, count))) {
} else if (sysfs_streq(buf, "defer")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
} else if (!memcmp("madvise", buf,
min(sizeof("madvise")-1, count))) {
} else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else if (!memcmp("never", buf,
min(sizeof("never")-1, count))) {
} else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
@@ -2715,7 +2707,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
struct deferred_split *ds_queue = get_deferred_split_queue(page);
struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
@@ -2723,11 +2715,11 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
unsigned long flags;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
VM_BUG_ON_PAGE(!PageLocked(head), head);
VM_BUG_ON_PAGE(!PageCompound(head), head);
if (PageWriteback(page))
if (PageWriteback(head))
return -EBUSY;
if (PageAnon(head)) {
@@ -2778,7 +2770,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
}
mlocked = PageMlocked(page);
mlocked = PageMlocked(head);
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
@@ -2810,14 +2802,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
if (PageSwapBacked(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
if (PageSwapBacked(head))
__dec_node_page_state(head, NR_SHMEM_THPS);
else
__dec_node_page_state(page, NR_FILE_THPS);
__dec_node_page_state(head, NR_FILE_THPS);
}
spin_unlock(&ds_queue->split_queue_lock);
__split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };

View File

@@ -13,7 +13,7 @@
*
* The following locks and mutexes are used by kmemleak:
*
* - kmemleak_lock (rwlock): protects the object_list modifications and
* - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
* accesses to the object_tree_root. The object_list is the main list
* holding the metadata (struct kmemleak_object) for the allocated memory
* blocks. The object_tree_root is a red black tree used to look-up
@@ -22,13 +22,13 @@
* object_tree_root in the create_object() function called from the
* kmemleak_alloc() callback and removed in delete_object() called from the
* kmemleak_free() callback
* - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
* the metadata (e.g. count) are protected by this lock. Note that some
* members of this structure may be protected by other means (atomic or
* kmemleak_lock). This lock is also held when scanning the corresponding
* memory block to avoid the kernel freeing it via the kmemleak_free()
* callback. This is less heavyweight than holding a global lock like
* kmemleak_lock during scanning
* - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
* Accesses to the metadata (e.g. count) are protected by this lock. Note
* that some members of this structure may be protected by other means
* (atomic or kmemleak_lock). This lock is also held when scanning the
* corresponding memory block to avoid the kernel freeing it via the
* kmemleak_free() callback. This is less heavyweight than holding a global
* lock like kmemleak_lock during scanning.
* - scan_mutex (mutex): ensures that only one thread may scan the memory for
* unreferenced objects at a time. The gray_list contains the objects which
* are already referenced or marked as false positives and need to be
@@ -135,7 +135,7 @@ struct kmemleak_scan_area {
* (use_count) and freed using the RCU mechanism.
*/
struct kmemleak_object {
spinlock_t lock;
raw_spinlock_t lock;
unsigned int flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
@@ -191,8 +191,8 @@ static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
/* rw_lock protecting the access to object_list and object_tree_root */
static DEFINE_RWLOCK(kmemleak_lock);
/* protecting the access to object_list and object_tree_root */
static DEFINE_RAW_SPINLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
static struct kmem_cache *object_cache;
@@ -426,7 +426,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
}
/* slab allocation failed, try the memory pool */
write_lock_irqsave(&kmemleak_lock, flags);
raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = list_first_entry_or_null(&mem_pool_free_list,
typeof(*object), object_list);
if (object)
@@ -435,7 +435,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
object = &mem_pool[--mem_pool_free_count];
else
pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
write_unlock_irqrestore(&kmemleak_lock, flags);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -453,9 +453,9 @@ static void mem_pool_free(struct kmemleak_object *object)
}
/* add the object to the memory pool free list */
write_lock_irqsave(&kmemleak_lock, flags);
raw_spin_lock_irqsave(&kmemleak_lock, flags);
list_add(&object->object_list, &mem_pool_free_list);
write_unlock_irqrestore(&kmemleak_lock, flags);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
@@ -514,9 +514,9 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
struct kmemleak_object *object;
rcu_read_lock();
read_lock_irqsave(&kmemleak_lock, flags);
raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
read_unlock_irqrestore(&kmemleak_lock, flags);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
if (object && !get_object(object))
@@ -546,11 +546,11 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
unsigned long flags;
struct kmemleak_object *object;
write_lock_irqsave(&kmemleak_lock, flags);
raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
if (object)
__remove_object(object);
write_unlock_irqrestore(&kmemleak_lock, flags);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -585,7 +585,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
INIT_LIST_HEAD(&object->object_list);
INIT_LIST_HEAD(&object->gray_list);
INIT_HLIST_HEAD(&object->area_list);
spin_lock_init(&object->lock);
raw_spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
@@ -617,7 +617,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
/* kernel backtrace */
object->trace_len = __save_stack_trace(object->trace);
write_lock_irqsave(&kmemleak_lock, flags);
raw_spin_lock_irqsave(&kmemleak_lock, flags);
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
min_addr = min(min_addr, untagged_ptr);
@@ -649,7 +649,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
list_add_tail_rcu(&object->object_list, &object_list);
out:
write_unlock_irqrestore(&kmemleak_lock, flags);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -667,9 +667,9 @@ static void __delete_object(struct kmemleak_object *object)
* Locking here also ensures that the corresponding memory block
* cannot be freed when it is being scanned.
*/
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
object->flags &= ~OBJECT_ALLOCATED;
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -739,9 +739,9 @@ static void paint_it(struct kmemleak_object *object, int color)
{
unsigned long flags;
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
__paint_it(object, color);
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
static void paint_ptr(unsigned long ptr, int color)
@@ -798,7 +798,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
if (scan_area_cache)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if (!area) {
pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
/* mark the object for full scan to avoid false positives */
@@ -820,7 +820,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
hlist_add_head(&area->node, &object->area_list);
out_unlock:
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -842,9 +842,9 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
return;
}
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
object->excess_ref = excess_ref;
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -864,9 +864,9 @@ static void object_no_scan(unsigned long ptr)
return;
}
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
object->flags |= OBJECT_NO_SCAN;
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -1026,9 +1026,9 @@ void __ref kmemleak_update_trace(const void *ptr)
return;
}
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
object->trace_len = __save_stack_trace(object->trace);
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -1233,7 +1233,7 @@ static void scan_block(void *_start, void *_end,
unsigned long flags;
unsigned long untagged_ptr;
read_lock_irqsave(&kmemleak_lock, flags);
raw_spin_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
unsigned long pointer;
@@ -1268,7 +1268,7 @@ static void scan_block(void *_start, void *_end,
* previously acquired in scan_object(). These locks are
* enclosed by scan_mutex.
*/
spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
/* only pass surplus references (object already gray) */
if (color_gray(object)) {
excess_ref = object->excess_ref;
@@ -1277,7 +1277,7 @@ static void scan_block(void *_start, void *_end,
excess_ref = 0;
update_refs(object);
}
spin_unlock(&object->lock);
raw_spin_unlock(&object->lock);
if (excess_ref) {
object = lookup_object(excess_ref, 0);
@@ -1286,12 +1286,12 @@ static void scan_block(void *_start, void *_end,
if (object == scanned)
/* circular reference, ignore */
continue;
spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
update_refs(object);
spin_unlock(&object->lock);
raw_spin_unlock(&object->lock);
}
}
read_unlock_irqrestore(&kmemleak_lock, flags);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
@@ -1324,7 +1324,7 @@ static void scan_object(struct kmemleak_object *object)
* Once the object->lock is acquired, the corresponding memory block
* cannot be freed (the same lock is acquired in delete_object).
*/
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if (object->flags & OBJECT_NO_SCAN)
goto out;
if (!(object->flags & OBJECT_ALLOCATED))
@@ -1344,9 +1344,9 @@ static void scan_object(struct kmemleak_object *object)
if (start >= end)
break;
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
cond_resched();
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
} while (object->flags & OBJECT_ALLOCATED);
} else
hlist_for_each_entry(area, &object->area_list, node)
@@ -1354,7 +1354,7 @@ static void scan_object(struct kmemleak_object *object)
(void *)(area->start + area->size),
object);
out:
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
/*
@@ -1407,7 +1407,7 @@ static void kmemleak_scan(void)
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
@@ -1424,7 +1424,7 @@ static void kmemleak_scan(void)
if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1492,14 +1492,14 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1519,7 +1519,7 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
@@ -1529,7 +1529,7 @@ static void kmemleak_scan(void)
new_leaks++;
}
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1681,10 +1681,10 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
struct kmemleak_object *object = v;
unsigned long flags;
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
print_unreferenced(seq, object);
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
return 0;
}
@@ -1714,9 +1714,9 @@ static int dump_str_object_info(const char *str)
return -EINVAL;
}
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
dump_object_info(object);
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
return 0;
@@ -1735,11 +1735,11 @@ static void kmemleak_clear(void)
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();

View File

@@ -575,7 +575,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* Return:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_add_range(struct memblock_type *type,
static int __init_memblock memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
int nid, enum memblock_flags flags)
{
@@ -694,7 +694,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
memblock_dbg("memblock_add: [%pa-%pa] %pS\n",
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
@@ -795,7 +795,7 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
memblock_dbg("memblock_remove: [%pa-%pa] %pS\n",
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_remove_range(&memblock.memory, base, size);
@@ -813,7 +813,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
memblock_dbg(" memblock_free: [%pa-%pa] %pS\n",
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
@@ -824,12 +824,24 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n",
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
/**
* memblock_physmem_add - add a range to the memblock.physmem type
* @base: base address of the range
* @size: size of the range
*
* Emits a memblock_dbg() message with the inclusive range
* [@base, @base + @size - 1] and the caller's return address, then hands
* the range to memblock_add_range() for &memblock.physmem, using
* MAX_NUMNODES as the node id and no flags.
*
* Return: the value returned by memblock_add_range().
*/
int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
{
/* inclusive end address; only used for the debug message below */
phys_addr_t end = base + size - 1;
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0);
}
#endif
/**
* memblock_setclr_flag - set or clear flag for a memory region
* @base: base address of the region

View File

@@ -5340,14 +5340,6 @@ static int mem_cgroup_move_account(struct page *page,
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (compound && !list_empty(page_deferred_list(page))) {
spin_lock(&from->deferred_split_queue.split_queue_lock);
list_del_init(page_deferred_list(page));
from->deferred_split_queue.split_queue_len--;
spin_unlock(&from->deferred_split_queue.split_queue_lock);
}
#endif
/*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
@@ -5357,16 +5349,6 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
page->mem_cgroup = to;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (compound && list_empty(page_deferred_list(page))) {
spin_lock(&to->deferred_split_queue.split_queue_lock);
list_add_tail(page_deferred_list(page),
&to->deferred_split_queue.split_queue);
to->deferred_split_queue.split_queue_len++;
spin_unlock(&to->deferred_split_queue.split_queue_lock);
}
#endif
spin_unlock_irqrestore(&from->move_lock, flags);
ret = 0;
@@ -6651,7 +6633,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
bool compound;
unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -6673,8 +6654,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return;
/* Force-charge the new page. The old one will be freed soon */
compound = PageTransHuge(newpage);
nr_pages = compound ? hpage_nr_pages(newpage) : 1;
nr_pages = hpage_nr_pages(newpage);
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
@@ -6684,7 +6664,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
commit_charge(newpage, memcg, false);
local_irq_save(flags);
mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage),
nr_pages);
memcg_check_events(memcg, newpage);
local_irq_restore(flags);
}

View File

@@ -783,27 +783,18 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
int online_type, int nid)
{
unsigned long flags;
unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
int nid;
int ret;
struct memory_notify arg;
struct memory_block *mem;
mem_hotplug_begin();
/*
* We can't use pfn_to_nid() because nid might be stored in struct page
* which is not yet initialized. Instead, we find nid from memory block.
*/
mem = find_memory_block(__pfn_to_section(pfn));
nid = mem->nid;
put_device(&mem->dev);
/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
@@ -1182,7 +1173,7 @@ static bool is_pageblock_removable_nolock(unsigned long pfn)
if (!zone_spans_pfn(zone, pfn))
return false;
return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE,
return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
MEMORY_OFFLINE);
}
@@ -1764,8 +1755,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
BUG_ON(check_hotplug_memory_range(start, size));
mem_hotplug_begin();
/*
* All memory blocks must be offlined before removing memory. Check
* whether all memory blocks in question are offline and return error
@@ -1778,9 +1767,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
/* remove memory block devices before removing memory */
/*
* Memory block device removal under the device_hotplug_lock is
* a barrier against racing online attempts.
*/
remove_memory_block_devices(start, size);
mem_hotplug_begin();
arch_remove_memory(nid, start, size, NULL);
memblock_free(start, size);
memblock_remove(start, size);

View File

@@ -2821,6 +2821,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
char *flags = strchr(str, '=');
int err = 1, mode;
if (flags)
*flags++ = '\0'; /* terminate mode string */
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
@@ -2831,9 +2834,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
} else
nodes_clear(nodes);
if (flags)
*flags++ = '\0'; /* terminate mode string */
mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
goto out;

View File

@@ -27,7 +27,8 @@ static void devmap_managed_enable_put(void)
static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
if (!pgmap->ops || !pgmap->ops->page_free) {
if (pgmap->type == MEMORY_DEVICE_PRIVATE &&
(!pgmap->ops || !pgmap->ops->page_free)) {
WARN(1, "Missing page_free method\n");
return -EINVAL;
}
@@ -410,48 +411,42 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
EXPORT_SYMBOL_GPL(get_dev_pagemap);
#ifdef CONFIG_DEV_PAGEMAP_OPS
void __put_devmap_managed_page(struct page *page)
void free_devmap_managed_page(struct page *page)
{
int count = page_ref_dec_return(page);
/* notify page idle for dax */
if (!is_device_private_page(page)) {
wake_up_var(&page->_refcount);
return;
}
/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive(page);
__ClearPageWaiters(page);
mem_cgroup_uncharge(page);
/*
* If refcount is 1 then page is freed and refcount is stable as nobody
* holds a reference on the page.
* When a device_private page is freed, the page->mapping field
* may still contain a (stale) mapping value. For example, the
* lower bits of page->mapping may still identify the page as an
* anonymous page. Ultimately, this entire field is just stale
* and wrong, and it will cause errors if not cleared. One
* example is:
*
* migrate_vma_pages()
* migrate_vma_insert_page()
* page_add_new_anon_rmap()
* __page_set_anon_rmap()
* ...checks page->mapping, via PageAnon(page) call,
* and incorrectly concludes that the page is an
* anonymous page. Therefore, it incorrectly,
* silently fails to set up the new anon rmap.
*
* For other types of ZONE_DEVICE pages, migration is either
* handled differently or not done at all, so there is no need
* to clear page->mapping.
*/
if (count == 1) {
/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive(page);
__ClearPageWaiters(page);
mem_cgroup_uncharge(page);
/*
* When a device_private page is freed, the page->mapping field
* may still contain a (stale) mapping value. For example, the
* lower bits of page->mapping may still identify the page as
* an anonymous page. Ultimately, this entire field is just
* stale and wrong, and it will cause errors if not cleared.
* One example is:
*
* migrate_vma_pages()
* migrate_vma_insert_page()
* page_add_new_anon_rmap()
* __page_set_anon_rmap()
* ...checks page->mapping, via PageAnon(page) call,
* and incorrectly concludes that the page is an
* anonymous page. Therefore, it incorrectly,
* silently fails to set up the new anon rmap.
*
* For other types of ZONE_DEVICE pages, migration is either
* handled differently or not done at all, so there is no need
* to clear page->mapping.
*/
if (is_device_private_page(page))
page->mapping = NULL;
page->pgmap->ops->page_free(page);
} else if (!count)
__put_page(page);
page->mapping = NULL;
page->pgmap->ops->page_free(page);
}
EXPORT_SYMBOL(__put_devmap_managed_page);
#endif /* CONFIG_DEV_PAGEMAP_OPS */

View File

@@ -48,6 +48,7 @@
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
#include <asm/tlbflush.h>
@@ -986,7 +987,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
}
/*
* Anonymous and movable page->mapping will be cleard by
* Anonymous and movable page->mapping will be cleared by
* free_pages_prepare so don't reset it here for keeping
* the type to work PageAnon, for example.
*/
@@ -1199,8 +1200,7 @@ out:
/*
* A page that has been migrated has all references
* removed and will be freed. A page that has not been
* migrated will have kepts its references and be
* restored.
* migrated will have kept its references and be restored.
*/
list_del(&page->lru);
@@ -1627,8 +1627,19 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
start = i;
} else if (node != current_node) {
err = do_move_pages_to_node(mm, &pagelist, current_node);
if (err)
if (err) {
/*
* Positive err means the number of failed
* pages to migrate. Since we are going to
* abort and return the number of non-migrated
* pages, we need to include the rest of the
* nr_pages that have not been attempted as
* well.
*/
if (err > 0)
err += nr_pages - i - 1;
goto out;
}
err = store_status(status, start, current_node, i - start);
if (err)
goto out;
@@ -1659,8 +1670,11 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
goto out_flush;
err = do_move_pages_to_node(mm, &pagelist, current_node);
if (err)
if (err) {
if (err > 0)
err += nr_pages - i - 1;
goto out;
}
if (i > start) {
err = store_status(status, start, current_node, i - start);
if (err)
@@ -1674,9 +1688,16 @@ out_flush:
/* Make sure we do not overwrite the existing error */
err1 = do_move_pages_to_node(mm, &pagelist, current_node);
/*
* Don't have to report non-attempted pages here since:
* - If the above loop is done gracefully all pages have been
* attempted.
* - If the above loop is aborted it means a fatal error
* happened, should return err.
*/
if (!err1)
err1 = store_status(status, start, current_node, i - start);
if (!err)
if (err >= 0)
err = err1;
out:
return err;
@@ -2135,7 +2156,7 @@ static int migrate_vma_collect_hole(unsigned long start,
struct migrate_vma *migrate = walk->private;
unsigned long addr;
for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
migrate->npages++;
@@ -2152,7 +2173,7 @@ static int migrate_vma_collect_skip(unsigned long start,
struct migrate_vma *migrate = walk->private;
unsigned long addr;
for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = 0;
}
@@ -2675,6 +2696,14 @@ int migrate_vma_setup(struct migrate_vma *args)
}
EXPORT_SYMBOL(migrate_vma_setup);
/*
* This code closely matches the code in:
* __handle_mm_fault()
* handle_pte_fault()
* do_anonymous_page()
* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
* private page.
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
struct page *page,
@@ -2755,30 +2784,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
if (check_stable_address_space(mm))
goto unlock_abort;
if (pte_present(*ptep)) {
unsigned long pfn = pte_pfn(*ptep);
if (!is_zero_pfn(pfn)) {
pte_unmap_unlock(ptep, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
goto abort;
}
if (!is_zero_pfn(pfn))
goto unlock_abort;
flush = true;
} else if (!pte_none(*ptep)) {
pte_unmap_unlock(ptep, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
goto abort;
}
} else if (!pte_none(*ptep))
goto unlock_abort;
/*
* Check for usefaultfd but do not deliver the fault. Instead,
* Check for userfaultfd but do not deliver the fault. Instead,
* just back off.
*/
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(ptep, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
goto abort;
}
if (userfaultfd_missing(vma))
goto unlock_abort;
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
@@ -2802,6 +2825,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
*src = MIGRATE_PFN_MIGRATE;
return;
unlock_abort:
pte_unmap_unlock(ptep, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
@@ -2834,9 +2860,8 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
if (!page) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
}
if (!notified) {
notified = true;

View File

@@ -1270,26 +1270,22 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma;
struct vm_area_struct *near;
struct anon_vma *anon_vma = NULL;
near = vma->vm_next;
if (!near)
goto try_prev;
/* Try next first. */
if (vma->vm_next) {
anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
if (anon_vma)
return anon_vma;
}
anon_vma = reusable_anon_vma(near, vma, near);
if (anon_vma)
return anon_vma;
try_prev:
near = vma->vm_prev;
if (!near)
goto none;
/* Try prev next. */
if (vma->vm_prev)
anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
anon_vma = reusable_anon_vma(near, near, vma);
if (anon_vma)
return anon_vma;
none:
/*
* We might reach here with anon_vma == NULL if we can't find
* any reusable anon_vma.
* There's no absolute need to look only at touching neighbours:
* we could search further afield for "compatible" anon_vmas.
* But it would probably just be a waste of time searching,
@@ -1297,7 +1293,7 @@ none:
* We're trying to allow mprotect remerging later on,
* not trying to minimize memory used for anon_vmas.
*/
return NULL;
return anon_vma;
}
/*

View File

@@ -26,6 +26,7 @@
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
@@ -620,6 +621,7 @@ static void oom_reap_task(struct task_struct *tsk)
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
sched_show_task(tsk);
debug_show_all_locks();
done:

View File

@@ -5848,6 +5848,30 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
return false;
}
#ifdef CONFIG_SPARSEMEM
/*
 * Return the PFN that follows @pfn, jumping over sections that are not
 * present. Returns -1 (converted to unsigned long) when no present
 * section is left at or below __highest_present_section_nr.
 */
static inline __meminit unsigned long next_pfn(unsigned long pfn)
{
	unsigned long nr;

	pfn++;
	nr = pfn_to_section_nr(pfn);
	if (present_section_nr(nr))
		return pfn;

	/* Walk the following sections until a present one is found. */
	for (nr++; nr <= __highest_present_section_nr; nr++) {
		if (present_section_nr(nr))
			return section_nr_to_pfn(nr);
	}

	return -1;
}
#else
/*
 * Return the PFN that follows @pfn. In this configuration there are no
 * sections to skip, so the successor is simply the next PFN.
 */
static inline __meminit unsigned long next_pfn(unsigned long pfn)
{
	/*
	 * Must be pfn + 1, not pfn++: a post-increment evaluates to the old
	 * value, so a caller computing next_pfn(pfn) - 1 before its own
	 * loop increment would never move past the current PFN.
	 */
	return pfn + 1;
}
#endif
/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
@@ -5887,8 +5911,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* function. They do not exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
if (!early_pfn_valid(pfn)) {
pfn = next_pfn(pfn) - 1;
continue;
}
if (!early_pfn_in_nid(pfn, nid))
continue;
if (overlap_memmap_init(zone, &pfn))
@@ -8154,20 +8180,22 @@ void *__init alloc_large_system_hash(const char *tablename,
/*
* This function checks whether pageblock includes unmovable pages or not.
* If @count is not zero, it is okay to include less @count unmovable pages
*
* PageLRU check without isolation or lru_lock could race so that
* MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
* check without lock_page also may miss some movable non-lru pages at
* race condition. So you can't expect this function should be exact.
*
* Returns a page without holding a reference. If the caller wants to
* dereference that page (e.g., dumping), it has to make sure that it
* cannot get removed (e.g., via memory unplug) concurrently.
*
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
int migratetype, int flags)
struct page *has_unmovable_pages(struct zone *zone, struct page *page,
int migratetype, int flags)
{
unsigned long found;
unsigned long iter = 0;
unsigned long pfn = page_to_pfn(page);
const char *reason = "unmovable page";
/*
* TODO we could make this much more efficient by not checking every
@@ -8184,22 +8212,19 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* so consider them movable here.
*/
if (is_migrate_cma(migratetype))
return false;
return NULL;
reason = "CMA page";
goto unmovable;
return page;
}
for (found = 0; iter < pageblock_nr_pages; iter++) {
unsigned long check = pfn + iter;
if (!pfn_valid_within(check))
for (; iter < pageblock_nr_pages; iter++) {
if (!pfn_valid_within(pfn + iter))
continue;
page = pfn_to_page(check);
page = pfn_to_page(pfn + iter);
if (PageReserved(page))
goto unmovable;
return page;
/*
* If the zone is movable and we have ruled out all reserved
@@ -8219,7 +8244,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
unsigned int skip_pages;
if (!hugepage_migration_supported(page_hstate(head)))
goto unmovable;
return page;
skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
@@ -8245,11 +8270,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
continue;
if (__PageMovable(page))
if (__PageMovable(page) || PageLRU(page))
continue;
if (!PageLRU(page))
found++;
/*
* If there are RECLAIMABLE pages, we need to check
* it. But now, memory offline itself doesn't call
@@ -8263,15 +8286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* is set to both of a memory hole page and a _used_ kernel
* page at boot.
*/
if (found > count)
goto unmovable;
return page;
}
return false;
unmovable:
WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
if (flags & REPORT_FAILURE)
dump_page(pfn_to_page(pfn + iter), reason);
return true;
return NULL;
}
#ifdef CONFIG_CONTIG_ALLOC
@@ -8675,10 +8692,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
offlined_pages += 1 << order;
#ifdef CONFIG_DEBUG_VM
pr_info("remove from free list %lx %d %lx\n",
pfn, 1 << order, end_pfn);
#endif
del_page_from_free_area(page, &zone->free_area[order]);
pfn += (1 << order);
}

View File

@@ -17,10 +17,9 @@
static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
{
struct page *unmovable = NULL;
struct zone *zone;
unsigned long flags, pfn;
struct memory_isolate_notify arg;
int notifier_ret;
unsigned long flags;
int ret = -EBUSY;
zone = page_zone(page);
@@ -35,41 +34,12 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
if (is_migrate_isolate_page(page))
goto out;
pfn = page_to_pfn(page);
arg.start_pfn = pfn;
arg.nr_pages = pageblock_nr_pages;
arg.pages_found = 0;
/*
* It may be possible to isolate a pageblock even if the
* migratetype is not MIGRATE_MOVABLE. The memory isolation
* notifier chain is used by balloon drivers to return the
* number of pages in a range that are held by the balloon
* driver to shrink memory. If all the pages are accounted for
* by balloons, are free, or on the LRU, isolation can continue.
* Later, for example, when memory hotplug notifier runs, these
* pages reported as "can be isolated" should be isolated(freed)
* by the balloon driver through the memory notifier chain.
*/
notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
notifier_ret = notifier_to_errno(notifier_ret);
if (notifier_ret)
goto out;
/*
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
isol_flags))
ret = 0;
/*
* immobile means "not-on-lru" pages. If immobile is larger than
* removable-by-driver pages reported by notifier, we'll fail.
*/
out:
if (!ret) {
unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
if (!unmovable) {
unsigned long nr_pages;
int mt = get_pageblock_migratetype(page);
@@ -79,11 +49,24 @@ out:
NULL);
__mod_zone_freepage_state(zone, -nr_pages, mt);
ret = 0;
}
out:
spin_unlock_irqrestore(&zone->lock, flags);
if (!ret)
if (!ret) {
drain_all_pages(zone);
} else {
WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
if ((isol_flags & REPORT_FAILURE) && unmovable)
/*
* printk() with zone->lock held will likely trigger a
* lockdep splat, so defer it here.
*/
dump_page(unmovable, "unmovable page");
}
return ret;
}

View File

@@ -52,12 +52,16 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return true;
}
static inline bool pfn_in_hpage(struct page *hpage, unsigned long pfn)
static inline bool pfn_is_match(struct page *page, unsigned long pfn)
{
unsigned long hpage_pfn = page_to_pfn(hpage);
unsigned long page_pfn = page_to_pfn(page);
/* normal page and hugetlbfs page */
if (!PageTransCompound(page) || PageHuge(page))
return page_pfn == pfn;
/* THP can be referenced by any subpage */
return pfn >= hpage_pfn && pfn - hpage_pfn < hpage_nr_pages(hpage);
return pfn >= page_pfn && pfn - page_pfn < hpage_nr_pages(page);
}
/**
@@ -108,7 +112,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
pfn = pte_pfn(*pvmw->pte);
}
return pfn_in_hpage(pvmw->page, pfn);
return pfn_is_match(pvmw->page, pfn);
}
/**

View File

@@ -42,12 +42,11 @@ static int process_vm_rw_pages(struct page **pages,
if (copy > len)
copy = len;
if (vm_write) {
if (vm_write)
copied = copy_page_from_iter(page, offset, copy, iter);
set_page_dirty_lock(page);
} else {
else
copied = copy_page_to_iter(page, offset, copy, iter);
}
len -= copied;
if (copied < copy && iov_iter_count(iter))
return -EFAULT;
@@ -96,7 +95,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
flags |= FOLL_WRITE;
while (!rc && nr_pages && iov_iter_count(iter)) {
int pages = min(nr_pages, max_pages_per_loop);
int pinned_pages = min(nr_pages, max_pages_per_loop);
int locked = 1;
size_t bytes;
@@ -106,14 +105,15 @@ static int process_vm_rw_single_vec(unsigned long addr,
* current/current->mm
*/
down_read(&mm->mmap_sem);
pages = get_user_pages_remote(task, mm, pa, pages, flags,
process_pages, NULL, &locked);
pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
flags, process_pages,
NULL, &locked);
if (locked)
up_read(&mm->mmap_sem);
if (pages <= 0)
if (pinned_pages <= 0)
return -EFAULT;
bytes = pages * PAGE_SIZE - start_offset;
bytes = pinned_pages * PAGE_SIZE - start_offset;
if (bytes > len)
bytes = len;
@@ -122,10 +122,12 @@ static int process_vm_rw_single_vec(unsigned long addr,
vm_write);
len -= bytes;
start_offset = 0;
nr_pages -= pages;
pa += pages * PAGE_SIZE;
while (pages)
put_page(process_pages[--pages]);
nr_pages -= pinned_pages;
pa += pinned_pages * PAGE_SIZE;
/* If vm_write is set, the pages need to be made dirty: */
unpin_user_pages_dirty_lock(process_pages, pinned_pages,
vm_write);
}
return rc;

View File

@@ -439,19 +439,38 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
}
#ifdef CONFIG_SLUB_DEBUG
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);
/*
* Determine a map of object in use on a page.
*
* Node listlock must be held to guarantee that the page does
* not vanish from under us.
*/
static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
static unsigned long *get_map(struct kmem_cache *s, struct page *page)
{
void *p;
void *addr = page_address(page);
VM_BUG_ON(!irqs_disabled());
spin_lock(&object_map_lock);
bitmap_zero(object_map, page->objects);
for (p = page->freelist; p; p = get_freepointer(s, p))
set_bit(slab_index(p, s, addr), map);
set_bit(slab_index(p, s, addr), object_map);
return object_map;
}
/*
 * Release the map handed out by get_map().
 *
 * @map must be the shared object_map; the function asserts this, asserts
 * that object_map_lock is held, and then drops that lock.
 */
static void put_map(unsigned long *map)
{
VM_BUG_ON(map != object_map);
lockdep_assert_held(&object_map_lock);
spin_unlock(&object_map_lock);
}
static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -3675,13 +3694,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
if (!map)
return;
unsigned long *map;
slab_err(s, page, text, s->name);
slab_lock(page);
get_map(s, page, map);
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
if (!test_bit(slab_index(p, s, addr), map)) {
@@ -3689,8 +3707,9 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
print_tracking(s, p);
}
}
put_map(map);
slab_unlock(page);
bitmap_free(map);
#endif
}
@@ -4384,19 +4403,19 @@ static int count_total(struct page *page)
#endif
#ifdef CONFIG_SLUB_DEBUG
static void validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
static void validate_slab(struct kmem_cache *s, struct page *page)
{
void *p;
void *addr = page_address(page);
unsigned long *map;
slab_lock(page);
if (!check_slab(s, page) || !on_freelist(s, page, NULL))
return;
goto unlock;
/* Now we know that a valid freelist exists */
bitmap_zero(map, page->objects);
get_map(s, page, map);
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
u8 val = test_bit(slab_index(p, s, addr), map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
@@ -4404,18 +4423,13 @@ static void validate_slab(struct kmem_cache *s, struct page *page,
if (!check_object(s, page, p, val))
break;
}
}
static void validate_slab_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
slab_lock(page);
validate_slab(s, page, map);
put_map(map);
unlock:
slab_unlock(page);
}
static int validate_slab_node(struct kmem_cache *s,
struct kmem_cache_node *n, unsigned long *map)
struct kmem_cache_node *n)
{
unsigned long count = 0;
struct page *page;
@@ -4424,7 +4438,7 @@ static int validate_slab_node(struct kmem_cache *s,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list) {
validate_slab_slab(s, page, map);
validate_slab(s, page);
count++;
}
if (count != n->nr_partial)
@@ -4435,7 +4449,7 @@ static int validate_slab_node(struct kmem_cache *s,
goto out;
list_for_each_entry(page, &n->full, slab_list) {
validate_slab_slab(s, page, map);
validate_slab(s, page);
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
@@ -4452,15 +4466,11 @@ static long validate_slab_cache(struct kmem_cache *s)
int node;
unsigned long count = 0;
struct kmem_cache_node *n;
unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map)
return -ENOMEM;
flush_all(s);
for_each_kmem_cache_node(s, node, n)
count += validate_slab_node(s, n, map);
bitmap_free(map);
count += validate_slab_node(s, n);
return count;
}
/*
@@ -4590,18 +4600,17 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
struct page *page, enum track_item alloc,
unsigned long *map)
struct page *page, enum track_item alloc)
{
void *addr = page_address(page);
void *p;
unsigned long *map;
bitmap_zero(map, page->objects);
get_map(s, page, map);
map = get_map(s, page);
for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
add_location(t, s, get_track(s, p, alloc));
put_map(map);
}
static int list_locations(struct kmem_cache *s, char *buf,
@@ -4612,11 +4621,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
struct loc_track t = { 0, 0, NULL };
int node;
struct kmem_cache_node *n;
unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
bitmap_free(map);
if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
return sprintf(buf, "Out of memory\n");
}
/* Push back cpu slabs */
@@ -4631,9 +4638,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
process_slab(&t, s, page, alloc, map);
process_slab(&t, s, page, alloc);
list_for_each_entry(page, &n->full, slab_list)
process_slab(&t, s, page, alloc, map);
process_slab(&t, s, page, alloc);
spin_unlock_irqrestore(&n->list_lock, flags);
}
@@ -4682,7 +4689,6 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
free_loc_track(&t);
bitmap_free(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;

View File

@@ -789,7 +789,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
ms->usage = NULL;
}
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
ms->section_mem_map = (unsigned long)NULL;
}
if (section_is_early && memmap)

View File

@@ -813,8 +813,10 @@ void release_pages(struct page **pages, int nr)
* processing, and instead, expect a call to
* put_page_testzero().
*/
if (put_devmap_managed_page(page))
if (page_is_devmap_managed(page)) {
put_devmap_managed_page(page);
continue;
}
}
page = compound_head(page);
@@ -1102,3 +1104,26 @@ void __init swap_setup(void)
* _really_ don't want to cluster much more
*/
}
#ifdef CONFIG_DEV_PAGEMAP_OPS
/*
 * Drop one reference on a devmap-managed page.
 *
 * Warns once and does nothing if @page is not devmap managed.
 */
void put_devmap_managed_page(struct page *page)
{
	int refs;

	if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
		return;

	refs = page_ref_dec_return(page);

	/*
	 * devmap page refcounts are 1-based, rather than 0-based: if
	 * refcount is 1, then the page is free and the refcount is
	 * stable because nobody holds a reference on the page.
	 */
	switch (refs) {
	case 1:
		free_devmap_managed_page(page);
		break;
	case 0:
		__put_page(page);
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL(put_devmap_managed_page);
#endif

View File

@@ -2737,10 +2737,10 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
else
type = si->type + 1;
++(*pos);
for (; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
++*pos;
return si;
}

View File

@@ -146,20 +146,6 @@ struct scan_control {
struct reclaim_state reclaim_state;
};
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
prev = lru_to_page(&(_page->lru)); \
prefetch(&prev->_field); \
} \
} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
do { \
@@ -2695,7 +2681,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
@@ -2874,8 +2860,6 @@ again:
*/
if (reclaimable)
pgdat->kswapd_failures = 0;
return reclaimable;
}
/*
@@ -4126,10 +4110,8 @@ module_init(kswapd_init)
*/
int node_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */
/*
* Priority for NODE_RECLAIM. This determines the fraction of pages

View File

@@ -32,6 +32,7 @@
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>
/*********************************
* statistics
@@ -65,6 +66,11 @@ static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;
/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;
/*********************************
* tunables
**********************************/
@@ -109,6 +115,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
uint, 0644);
/* Enable/disable handling same-value filled pages (enabled by default) */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
@@ -123,7 +134,8 @@ struct zswap_pool {
struct crypto_comp * __percpu *tfm;
struct kref kref;
struct list_head list;
struct work_struct work;
struct work_struct release_work;
struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
@@ -214,6 +226,13 @@ static bool zswap_is_full(void)
DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
/*
 * Report whether the pool has dropped below the acceptance threshold:
 * zswap_accept_thr_percent of the zswap_max_pool_percent share of total
 * RAM pages, compared against the pool size rounded up to whole pages.
 */
static bool zswap_can_accept(void)
{
	unsigned long limit = totalram_pages();

	/* Same left-to-right integer arithmetic as a single expression. */
	limit = limit * zswap_accept_thr_percent / 100;
	limit = limit * zswap_max_pool_percent / 100;

	return DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE) < limit;
}
static void zswap_update_total_size(void)
{
struct zswap_pool *pool;
@@ -501,6 +520,16 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}
static void shrink_worker(struct work_struct *w)
{
struct zswap_pool *pool = container_of(w, typeof(*pool),
shrink_work);
if (zpool_shrink(pool->zpool, 1, NULL))
zswap_reject_reclaim_fail++;
zswap_pool_put(pool);
}
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
struct zswap_pool *pool;
@@ -551,6 +580,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
INIT_WORK(&pool->shrink_work, shrink_worker);
zswap_pool_debug("created", pool);
@@ -624,7 +654,8 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
static void __zswap_pool_release(struct work_struct *work)
{
struct zswap_pool *pool = container_of(work, typeof(*pool), work);
struct zswap_pool *pool = container_of(work, typeof(*pool),
release_work);
synchronize_rcu();
@@ -647,8 +678,8 @@ static void __zswap_pool_empty(struct kref *kref)
list_del_rcu(&pool->list);
INIT_WORK(&pool->work, __zswap_pool_release);
schedule_work(&pool->work);
INIT_WORK(&pool->release_work, __zswap_pool_release);
schedule_work(&pool->release_work);
spin_unlock(&zswap_pools_lock);
}
@@ -942,22 +973,6 @@ end:
return ret;
}
static int zswap_shrink(void)
{
struct zswap_pool *pool;
int ret;
pool = zswap_pool_last_get();
if (!pool)
return -ENOENT;
ret = zpool_shrink(pool->zpool, 1, NULL);
zswap_pool_put(pool);
return ret;
}
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
unsigned int pos;
@@ -1011,21 +1026,23 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* reclaim space if needed */
if (zswap_is_full()) {
zswap_pool_limit_hit++;
if (zswap_shrink()) {
zswap_reject_reclaim_fail++;
ret = -ENOMEM;
goto reject;
}
struct zswap_pool *pool;
/* A second zswap_is_full() check after
* zswap_shrink() to make sure it's now
* under the max_pool_percent
*/
if (zswap_is_full()) {
zswap_pool_limit_hit++;
zswap_pool_reached_full = true;
pool = zswap_pool_last_get();
if (pool)
queue_work(shrink_wq, &pool->shrink_work);
ret = -ENOMEM;
goto reject;
}
if (zswap_pool_reached_full) {
if (!zswap_can_accept()) {
ret = -ENOMEM;
goto reject;
}
} else
zswap_pool_reached_full = false;
}
/* allocate entry */
@@ -1332,11 +1349,18 @@ static int __init init_zswap(void)
zswap_enabled = false;
}
shrink_wq = create_workqueue("zswap-shrink");
if (!shrink_wq)
goto fallback_fail;
frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
return 0;
fallback_fail:
if (pool)
zswap_pool_destroy(pool);
hp_fail:
cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail: