Merge branch 'akpm' (patches from Andrew)
Pull updates from Andrew Morton:
 "Most of -mm and quite a number of other subsystems: hotfixes, scripts,
  ocfs2, misc, lib, binfmt, init, reiserfs, exec, dma-mapping, kcov.

  MM is fairly quiet this time. Holidays, I assume"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
  kcov: ignore fault-inject and stacktrace
  include/linux/io-mapping.h-mapping: use PHYS_PFN() macro in io_mapping_map_atomic_wc()
  execve: warn if process starts with executable stack
  reiserfs: prevent NULL pointer dereference in reiserfs_insert_item()
  init/main.c: fix misleading "This architecture does not have kernel memory protection" message
  init/main.c: fix quoted value handling in unknown_bootoption
  init/main.c: remove unnecessary repair_env_string in do_initcall_level
  init/main.c: log arguments and environment passed to init
  fs/binfmt_elf.c: coredump: allow process with empty address space to coredump
  fs/binfmt_elf.c: coredump: delete duplicated overflow check
  fs/binfmt_elf.c: coredump: allocate core ELF header on stack
  fs/binfmt_elf.c: make BAD_ADDR() unlikely
  fs/binfmt_elf.c: better codegen around current->mm
  fs/binfmt_elf.c: don't copy ELF header around
  fs/binfmt_elf.c: fix ->start_code calculation
  fs/binfmt_elf.c: smaller code generation around auxv vector fill
  lib/find_bit.c: uninline helper _find_next_bit()
  lib/find_bit.c: join _find_next_bit{_le}
  uapi: rename ext2_swab() to swab() and share globally in swab.h
  lib/scatterlist.c: adjust indentation in __sg_alloc_table
  ...
@@ -20,6 +20,7 @@ KCOV_INSTRUMENT_kmemleak.o := n
KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
KCOV_INSTRUMENT_failslab.o := n

CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)

@@ -21,6 +21,7 @@ struct backing_dev_info noop_backing_dev_info = {
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;
const char *bdi_unknown_name = "(unknown)";

/*
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
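The new bdi_unknown_name string is presumably a fallback label for a backing device that has no registered struct device; its consumer is not part of this excerpt. A minimal sketch of how such a fallback could be used (hypothetical helper, not from this diff):

    /* Hypothetical accessor -- assumption, not part of this diff: */
    static inline const char *bdi_dev_name(struct backing_dev_info *bdi)
    {
            return bdi->dev ? dev_name(bdi->dev) : bdi_unknown_name;
    }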
mm/debug.c
@@ -46,7 +46,15 @@ void __dump_page(struct page *page, const char *reason)
{
struct address_space *mapping;
bool page_poisoned = PagePoisoned(page);
/*
* Accessing the pageblock without the zone lock. It could change to
* "isolate" again in the meantime, but since we are just dumping the
* state for debugging, it should be fine to accept a bit of
* inaccuracy here due to racing.
*/
bool page_cma = is_migrate_cma_page(page);
int mapcount;
char *type = "";

/*
* If struct page is poisoned don't access Page*() functions as that
@@ -78,9 +86,9 @@ void __dump_page(struct page *page, const char *reason)
page, page_ref_count(page), mapcount,
page->mapping, page_to_pgoff(page));
if (PageKsm(page))
pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags);
type = "ksm ";
else if (PageAnon(page))
pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags);
type = "anon ";
else if (mapping) {
if (mapping->host && mapping->host->i_dentry.first) {
struct dentry *dentry;
@@ -88,10 +96,12 @@ void __dump_page(struct page *page, const char *reason)
pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
} else
pr_warn("%ps\n", mapping->a_ops);
pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
}
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);

pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
page_cma ? " CMA" : "");

hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
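The pr_warn() lines above use the kernel's %pGp printk extension, which decodes page->flags symbolically and takes a pointer to the flags word. For illustration (the decoded output shown is an assumption and depends on the actual flags):

    pr_warn("%sflags: %#lx(%pGp)\n", type, page->flags, &page->flags);
    /* e.g.: "anon flags: 0x17ffe00000090034(uptodate|lru|active|swapbacked)" */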
@@ -121,8 +121,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
}
}

if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
__func__, (u64)phys_addr, size))
if (WARN(slot < 0, "%s(%pa, %08lx) not found slot\n",
__func__, &phys_addr, size))
return NULL;

/* Don't allow wraparound or zero size */
@@ -158,8 +158,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
--idx;
--nrpages;
}
WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
__func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n",
__func__, &phys_addr, size, slot, offset, slot_virt[slot]);

prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
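The change above replaces the (u64) cast plus %08llx with %pa, the printk specifier for phys_addr_t/resource_size_t, which prints at the native width of the type and expects a pointer to the variable rather than its value. A minimal sketch:

    phys_addr_t phys_addr = 0x1000;

    pr_info("mapping %pa\n", &phys_addr);  /* pass the address, not the value */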
mm/filemap.c
@@ -632,33 +632,6 @@ static bool mapping_needs_writeback(struct address_space *mapping)
return mapping->nrpages;
}

int filemap_write_and_wait(struct address_space *mapping)
{
int err = 0;

if (mapping_needs_writeback(mapping)) {
err = filemap_fdatawrite(mapping);
/*
* Even if the above returned error, the pages may be
* written partially (e.g. -ENOSPC), so we wait for it.
* But the -EIO is special case, it may indicate the worst
* thing (e.g. bug) happened, so we avoid waiting for it.
*/
if (err != -EIO) {
int err2 = filemap_fdatawait(mapping);
if (!err)
err = err2;
} else {
/* Clear any previously stored errors */
filemap_check_errors(mapping);
}
} else {
err = filemap_check_errors(mapping);
}
return err;
}
EXPORT_SYMBOL(filemap_write_and_wait);

/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
@@ -680,7 +653,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
/*
* Even if the above returned error, the pages may be
* written partially (e.g. -ENOSPC), so we wait for it.
* But the -EIO is special case, it may indicate the worst
* thing (e.g. bug) happened, so we avoid waiting for it.
*/
if (err != -EIO) {
int err2 = filemap_fdatawait_range(mapping,
lstart, lend);
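The hunk at -632,33 +632,6 removes the out-of-line filemap_write_and_wait() while keeping filemap_write_and_wait_range(); presumably the former becomes a trivial wrapper over the latter in a header that is not part of this excerpt. A minimal sketch of such a wrapper (assumption):

    /* Hypothetical replacement -- not shown in this diff: */
    static inline int filemap_write_and_wait(struct address_space *mapping)
    {
            return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
    }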
mm/gup.c
@@ -29,8 +29,23 @@ struct follow_page_context {
|
||||
unsigned int page_mask;
|
||||
};
|
||||
|
||||
/*
|
||||
* Return the compound head page with ref appropriately incremented,
|
||||
* or NULL if that failed.
|
||||
*/
|
||||
static inline struct page *try_get_compound_head(struct page *page, int refs)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
|
||||
if (WARN_ON_ONCE(page_ref_count(head) < 0))
|
||||
return NULL;
|
||||
if (unlikely(!page_cache_add_speculative(head, refs)))
|
||||
return NULL;
|
||||
return head;
|
||||
}
|
||||
|
||||
/**
|
||||
* put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
|
||||
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
|
||||
* @pages: array of pages to be maybe marked dirty, and definitely released.
|
||||
* @npages: number of pages in the @pages array.
|
||||
* @make_dirty: whether to mark the pages dirty
|
||||
@@ -40,19 +55,19 @@ struct follow_page_context {
|
||||
*
|
||||
* For each page in the @pages array, make that page (or its head page, if a
|
||||
* compound page) dirty, if @make_dirty is true, and if the page was previously
|
||||
* listed as clean. In any case, releases all pages using put_user_page(),
|
||||
* possibly via put_user_pages(), for the non-dirty case.
|
||||
* listed as clean. In any case, releases all pages using unpin_user_page(),
|
||||
* possibly via unpin_user_pages(), for the non-dirty case.
|
||||
*
|
||||
* Please see the put_user_page() documentation for details.
|
||||
* Please see the unpin_user_page() documentation for details.
|
||||
*
|
||||
* set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
|
||||
* required, then the caller should a) verify that this is really correct,
|
||||
* because _lock() is usually required, and b) hand code it:
|
||||
* set_page_dirty_lock(), put_user_page().
|
||||
* set_page_dirty_lock(), unpin_user_page().
|
||||
*
|
||||
*/
|
||||
void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
|
||||
bool make_dirty)
|
||||
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
|
||||
bool make_dirty)
|
||||
{
|
||||
unsigned long index;
|
||||
|
||||
@@ -63,7 +78,7 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
|
||||
*/
|
||||
|
||||
if (!make_dirty) {
|
||||
put_user_pages(pages, npages);
|
||||
unpin_user_pages(pages, npages);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -91,21 +106,21 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
|
||||
*/
|
||||
if (!PageDirty(page))
|
||||
set_page_dirty_lock(page);
|
||||
put_user_page(page);
|
||||
unpin_user_page(page);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(put_user_pages_dirty_lock);
|
||||
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
|
||||
|
||||
/**
|
||||
* put_user_pages() - release an array of gup-pinned pages.
|
||||
* unpin_user_pages() - release an array of gup-pinned pages.
|
||||
* @pages: array of pages to be marked dirty and released.
|
||||
* @npages: number of pages in the @pages array.
|
||||
*
|
||||
* For each page in the @pages array, release the page using put_user_page().
|
||||
* For each page in the @pages array, release the page using unpin_user_page().
|
||||
*
|
||||
* Please see the put_user_page() documentation for details.
|
||||
* Please see the unpin_user_page() documentation for details.
|
||||
*/
|
||||
void put_user_pages(struct page **pages, unsigned long npages)
|
||||
void unpin_user_pages(struct page **pages, unsigned long npages)
|
||||
{
|
||||
unsigned long index;
|
||||
|
||||
@@ -115,9 +130,9 @@ void put_user_pages(struct page **pages, unsigned long npages)
|
||||
* single operation to the head page should suffice.
|
||||
*/
|
||||
for (index = 0; index < npages; index++)
|
||||
put_user_page(pages[index]);
|
||||
unpin_user_page(pages[index]);
|
||||
}
|
||||
EXPORT_SYMBOL(put_user_pages);
|
||||
EXPORT_SYMBOL(unpin_user_pages);
|
||||
|
||||
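The rename from put_user_pages*() to unpin_user_pages*() pairs these release helpers with the pin_user_pages*() entry points added later in this file. A caller-side sketch of the intended pairing (signatures taken from this diff; the DMA helper is hypothetical):

    int npages = pin_user_pages_fast(uaddr, nr, FOLL_WRITE, pages);

    if (npages <= 0)
            return npages;

    do_device_dma(pages, npages);           /* hypothetical driver work */

    /* release, marking the pages dirty because the device wrote to them */
    unpin_user_pages_dirty_lock(pages, npages, true);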
#ifdef CONFIG_MMU
|
||||
static struct page *no_page_table(struct vm_area_struct *vma,
|
||||
@@ -179,6 +194,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
|
||||
spinlock_t *ptl;
|
||||
pte_t *ptep, pte;
|
||||
|
||||
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
|
||||
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
|
||||
(FOLL_PIN | FOLL_GET)))
|
||||
return ERR_PTR(-EINVAL);
|
||||
retry:
|
||||
if (unlikely(pmd_bad(*pmd)))
|
||||
return no_page_table(vma, flags);
|
||||
@@ -323,7 +342,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
|
||||
pmdval = READ_ONCE(*pmd);
|
||||
if (pmd_none(pmdval))
|
||||
return no_page_table(vma, flags);
|
||||
if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
|
||||
if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
|
||||
page = follow_huge_pmd(mm, address, pmd, flags);
|
||||
if (page)
|
||||
return page;
|
||||
@@ -433,7 +452,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
|
||||
pud = pud_offset(p4dp, address);
|
||||
if (pud_none(*pud))
|
||||
return no_page_table(vma, flags);
|
||||
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
|
||||
if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
|
||||
page = follow_huge_pud(mm, address, pud, flags);
|
||||
if (page)
|
||||
return page;
|
||||
@@ -796,7 +815,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
|
||||
start = untagged_addr(start);
|
||||
|
||||
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
|
||||
VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
|
||||
|
||||
/*
|
||||
* If FOLL_FORCE is set then do not force a full fault as the hinting
|
||||
@@ -1020,7 +1039,16 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
|
||||
BUG_ON(*locked != 1);
|
||||
}
|
||||
|
||||
if (pages)
|
||||
/*
|
||||
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
|
||||
* is to set FOLL_GET if the caller wants pages[] filled in (but has
|
||||
* carelessly failed to specify FOLL_GET), so keep doing that, but only
|
||||
* for FOLL_GET, not for the newer FOLL_PIN.
|
||||
*
|
||||
* FOLL_PIN always expects pages to be non-null, but no need to assert
|
||||
* that here, as any failures will be obvious enough.
|
||||
*/
|
||||
if (pages && !(flags & FOLL_PIN))
|
||||
flags |= FOLL_GET;
|
||||
|
||||
pages_done = 0;
|
||||
@@ -1096,88 +1124,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
|
||||
return pages_done;
|
||||
}
|
||||
|
||||
/*
|
||||
* get_user_pages_remote() - pin user pages in memory
|
||||
* @tsk: the task_struct to use for page fault accounting, or
|
||||
* NULL if faults are not to be recorded.
|
||||
* @mm: mm_struct of target mm
|
||||
* @start: starting user address
|
||||
* @nr_pages: number of pages from start to pin
|
||||
* @gup_flags: flags modifying lookup behaviour
|
||||
* @pages: array that receives pointers to the pages pinned.
|
||||
* Should be at least nr_pages long. Or NULL, if caller
|
||||
* only intends to ensure the pages are faulted in.
|
||||
* @vmas: array of pointers to vmas corresponding to each page.
|
||||
* Or NULL if the caller does not require them.
|
||||
* @locked: pointer to lock flag indicating whether lock is held and
|
||||
* subsequently whether VM_FAULT_RETRY functionality can be
|
||||
* utilised. Lock must initially be held.
|
||||
*
|
||||
* Returns either number of pages pinned (which may be less than the
|
||||
* number requested), or an error. Details about the return value:
|
||||
*
|
||||
* -- If nr_pages is 0, returns 0.
|
||||
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
|
||||
* -- If nr_pages is >0, and some pages were pinned, returns the number of
|
||||
* pages pinned. Again, this may be less than nr_pages.
|
||||
*
|
||||
* The caller is responsible for releasing returned @pages, via put_page().
|
||||
*
|
||||
* @vmas are valid only as long as mmap_sem is held.
|
||||
*
|
||||
* Must be called with mmap_sem held for read or write.
|
||||
*
|
||||
* get_user_pages walks a process's page tables and takes a reference to
|
||||
* each struct page that each user address corresponds to at a given
|
||||
* instant. That is, it takes the page that would be accessed if a user
|
||||
* thread accesses the given user virtual address at that instant.
|
||||
*
|
||||
* This does not guarantee that the page exists in the user mappings when
|
||||
* get_user_pages returns, and there may even be a completely different
|
||||
* page there in some cases (eg. if mmapped pagecache has been invalidated
|
||||
* and subsequently re faulted). However it does guarantee that the page
|
||||
* won't be freed completely. And mostly callers simply care that the page
|
||||
* contains data that was valid *at some point in time*. Typically, an IO
|
||||
* or similar operation cannot guarantee anything stronger anyway because
|
||||
* locks can't be held over the syscall boundary.
|
||||
*
|
||||
* If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
|
||||
* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
|
||||
* be called after the page is finished with, and before put_page is called.
|
||||
*
|
||||
* get_user_pages is typically used for fewer-copy IO operations, to get a
|
||||
* handle on the memory by some means other than accesses via the user virtual
|
||||
* addresses. The pages may be submitted for DMA to devices or accessed via
|
||||
* their kernel linear mapping (via the kmap APIs). Care should be taken to
|
||||
* use the correct cache flushing APIs.
|
||||
*
|
||||
* See also get_user_pages_fast, for performance critical applications.
|
||||
*
|
||||
* get_user_pages should be phased out in favor of
|
||||
* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
|
||||
* should use get_user_pages because it cannot pass
|
||||
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
|
||||
*/
|
||||
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
unsigned int gup_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas, int *locked)
|
||||
{
|
||||
/*
|
||||
* FIXME: Current FOLL_LONGTERM behavior is incompatible with
|
||||
* FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
|
||||
* vmas. As there are no users of this flag in this call we simply
|
||||
* disallow this option for now.
|
||||
*/
|
||||
if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
|
||||
return -EINVAL;
|
||||
|
||||
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
|
||||
locked,
|
||||
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
|
||||
}
|
||||
EXPORT_SYMBOL(get_user_pages_remote);
|
||||
|
||||
/**
|
||||
* populate_vma_page_range() - populate a range of pages in the vma.
|
||||
* @vma: target vma
|
||||
@@ -1611,6 +1557,116 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
|
||||
}
|
||||
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
|
||||
|
||||
/*
|
||||
* get_user_pages_remote() - pin user pages in memory
|
||||
* @tsk: the task_struct to use for page fault accounting, or
|
||||
* NULL if faults are not to be recorded.
|
||||
* @mm: mm_struct of target mm
|
||||
* @start: starting user address
|
||||
* @nr_pages: number of pages from start to pin
|
||||
* @gup_flags: flags modifying lookup behaviour
|
||||
* @pages: array that receives pointers to the pages pinned.
|
||||
* Should be at least nr_pages long. Or NULL, if caller
|
||||
* only intends to ensure the pages are faulted in.
|
||||
* @vmas: array of pointers to vmas corresponding to each page.
|
||||
* Or NULL if the caller does not require them.
|
||||
* @locked: pointer to lock flag indicating whether lock is held and
|
||||
* subsequently whether VM_FAULT_RETRY functionality can be
|
||||
* utilised. Lock must initially be held.
|
||||
*
|
||||
* Returns either number of pages pinned (which may be less than the
|
||||
* number requested), or an error. Details about the return value:
|
||||
*
|
||||
* -- If nr_pages is 0, returns 0.
|
||||
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
|
||||
* -- If nr_pages is >0, and some pages were pinned, returns the number of
|
||||
* pages pinned. Again, this may be less than nr_pages.
|
||||
*
|
||||
* The caller is responsible for releasing returned @pages, via put_page().
|
||||
*
|
||||
* @vmas are valid only as long as mmap_sem is held.
|
||||
*
|
||||
* Must be called with mmap_sem held for read or write.
|
||||
*
|
||||
* get_user_pages walks a process's page tables and takes a reference to
|
||||
* each struct page that each user address corresponds to at a given
|
||||
* instant. That is, it takes the page that would be accessed if a user
|
||||
* thread accesses the given user virtual address at that instant.
|
||||
*
|
||||
* This does not guarantee that the page exists in the user mappings when
|
||||
* get_user_pages returns, and there may even be a completely different
|
||||
* page there in some cases (eg. if mmapped pagecache has been invalidated
|
||||
* and subsequently re faulted). However it does guarantee that the page
|
||||
* won't be freed completely. And mostly callers simply care that the page
|
||||
* contains data that was valid *at some point in time*. Typically, an IO
|
||||
* or similar operation cannot guarantee anything stronger anyway because
|
||||
* locks can't be held over the syscall boundary.
|
||||
*
|
||||
* If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
|
||||
* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
|
||||
* be called after the page is finished with, and before put_page is called.
|
||||
*
|
||||
* get_user_pages is typically used for fewer-copy IO operations, to get a
|
||||
* handle on the memory by some means other than accesses via the user virtual
|
||||
* addresses. The pages may be submitted for DMA to devices or accessed via
|
||||
* their kernel linear mapping (via the kmap APIs). Care should be taken to
|
||||
* use the correct cache flushing APIs.
|
||||
*
|
||||
* See also get_user_pages_fast, for performance critical applications.
|
||||
*
|
||||
* get_user_pages should be phased out in favor of
|
||||
* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
|
||||
* should use get_user_pages because it cannot pass
|
||||
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
|
||||
*/
|
||||
#ifdef CONFIG_MMU
|
||||
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
unsigned int gup_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas, int *locked)
|
||||
{
|
||||
/*
|
||||
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
|
||||
* never directly by the caller, so enforce that with an assertion:
|
||||
*/
|
||||
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Parts of FOLL_LONGTERM behavior are incompatible with
|
||||
* FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
|
||||
* vmas. However, this only comes up if locked is set, and there are
|
||||
* callers that do request FOLL_LONGTERM, but do not set locked. So,
|
||||
* allow what we can.
|
||||
*/
|
||||
if (gup_flags & FOLL_LONGTERM) {
|
||||
if (WARN_ON_ONCE(locked))
|
||||
return -EINVAL;
|
||||
/*
|
||||
* This will check the vmas (even if our vmas arg is NULL)
|
||||
* and return -ENOTSUPP if DAX isn't allowed in this case:
|
||||
*/
|
||||
return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
|
||||
vmas, gup_flags | FOLL_TOUCH |
|
||||
FOLL_REMOTE);
|
||||
}
|
||||
|
||||
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
|
||||
locked,
|
||||
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
|
||||
}
|
||||
EXPORT_SYMBOL(get_user_pages_remote);
|
||||
|
||||
#else /* CONFIG_MMU */
|
||||
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
unsigned int gup_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas, int *locked)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif /* !CONFIG_MMU */
|
||||
|
||||
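The reworked get_user_pages_remote() above rejects caller-supplied FOLL_PIN and, when no "locked" cookie is passed, now routes FOLL_LONGTERM requests through __gup_longterm_locked(). An illustrative call the new code accepts (variable names are placeholders):

    /* Long-term remote pin, no VM_FAULT_RETRY support (locked == NULL): */
    ret = get_user_pages_remote(tsk, mm, start, nr_pages,
                                FOLL_WRITE | FOLL_LONGTERM,
                                pages, NULL, NULL);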
/*
|
||||
* This is the same as get_user_pages_remote(), just with a
|
||||
* less-flexible calling convention where we assume that the task
|
||||
@@ -1622,6 +1678,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
|
||||
unsigned int gup_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas)
|
||||
{
|
||||
/*
|
||||
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
|
||||
* never directly by the caller, so enforce that with an assertion:
|
||||
*/
|
||||
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
|
||||
return -EINVAL;
|
||||
|
||||
return __gup_longterm_locked(current, current->mm, start, nr_pages,
|
||||
pages, vmas, gup_flags | FOLL_TOUCH);
|
||||
}
|
||||
@@ -1807,20 +1870,6 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the compund head page with ref appropriately incremented,
|
||||
* or NULL if that failed.
|
||||
*/
|
||||
static inline struct page *try_get_compound_head(struct page *page, int refs)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
if (WARN_ON_ONCE(page_ref_count(head) < 0))
|
||||
return NULL;
|
||||
if (unlikely(!page_cache_add_speculative(head, refs)))
|
||||
return NULL;
|
||||
return head;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
|
||||
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
|
||||
unsigned int flags, struct page **pages, int *nr)
|
||||
@@ -1978,6 +2027,29 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
|
||||
}
|
||||
#endif
|
||||
|
||||
static int record_subpages(struct page *page, unsigned long addr,
|
||||
unsigned long end, struct page **pages)
|
||||
{
|
||||
int nr;
|
||||
|
||||
for (nr = 0; addr != end; addr += PAGE_SIZE)
|
||||
pages[nr++] = page++;
|
||||
|
||||
return nr;
|
||||
}
|
||||
|
||||
static void put_compound_head(struct page *page, int refs)
|
||||
{
|
||||
VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
|
||||
/*
|
||||
* Calling put_page() for each ref is unnecessarily slow. Only the last
|
||||
* ref needs a put_page().
|
||||
*/
|
||||
if (refs > 1)
|
||||
page_ref_sub(page, refs - 1);
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
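record_subpages() and put_compound_head() factor out the open-coded subpage and reference loops that gup_hugepte(), gup_huge_pmd(), gup_huge_pud() and gup_huge_pgd() carried before; the hunks below all converge on the same shape, condensed here for illustration:

    refs = record_subpages(page, addr, end, pages + *nr);

    head = try_get_compound_head(head, refs);
    if (!head)
            return 0;

    if (unlikely(pte_val(pte) != pte_val(*ptep))) {
            /* lost a race against a page-table update: drop all refs at once */
            put_compound_head(head, refs);
            return 0;
    }

    *nr += refs;
    SetPageReferenced(head);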
#ifdef CONFIG_ARCH_HAS_HUGEPD
|
||||
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
|
||||
unsigned long sz)
|
||||
@@ -2007,32 +2079,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
|
||||
/* hugepages are never "special" */
|
||||
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
|
||||
|
||||
refs = 0;
|
||||
head = pte_page(pte);
|
||||
|
||||
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
|
||||
do {
|
||||
VM_BUG_ON(compound_head(page) != head);
|
||||
pages[*nr] = page;
|
||||
(*nr)++;
|
||||
page++;
|
||||
refs++;
|
||||
} while (addr += PAGE_SIZE, addr != end);
|
||||
refs = record_subpages(page, addr, end, pages + *nr);
|
||||
|
||||
head = try_get_compound_head(head, refs);
|
||||
if (!head) {
|
||||
*nr -= refs;
|
||||
if (!head)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
|
||||
/* Could be optimized better */
|
||||
*nr -= refs;
|
||||
while (refs--)
|
||||
put_page(head);
|
||||
put_compound_head(head, refs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*nr += refs;
|
||||
SetPageReferenced(head);
|
||||
return 1;
|
||||
}
|
||||
@@ -2079,28 +2139,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
|
||||
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
|
||||
}
|
||||
|
||||
refs = 0;
|
||||
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
|
||||
do {
|
||||
pages[*nr] = page;
|
||||
(*nr)++;
|
||||
page++;
|
||||
refs++;
|
||||
} while (addr += PAGE_SIZE, addr != end);
|
||||
refs = record_subpages(page, addr, end, pages + *nr);
|
||||
|
||||
head = try_get_compound_head(pmd_page(orig), refs);
|
||||
if (!head) {
|
||||
*nr -= refs;
|
||||
if (!head)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
|
||||
*nr -= refs;
|
||||
while (refs--)
|
||||
put_page(head);
|
||||
put_compound_head(head, refs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*nr += refs;
|
||||
SetPageReferenced(head);
|
||||
return 1;
|
||||
}
|
||||
@@ -2120,28 +2171,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
|
||||
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
|
||||
}
|
||||
|
||||
refs = 0;
|
||||
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
|
||||
do {
|
||||
pages[*nr] = page;
|
||||
(*nr)++;
|
||||
page++;
|
||||
refs++;
|
||||
} while (addr += PAGE_SIZE, addr != end);
|
||||
refs = record_subpages(page, addr, end, pages + *nr);
|
||||
|
||||
head = try_get_compound_head(pud_page(orig), refs);
|
||||
if (!head) {
|
||||
*nr -= refs;
|
||||
if (!head)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
|
||||
*nr -= refs;
|
||||
while (refs--)
|
||||
put_page(head);
|
||||
put_compound_head(head, refs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*nr += refs;
|
||||
SetPageReferenced(head);
|
||||
return 1;
|
||||
}
|
||||
@@ -2157,28 +2199,20 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
|
||||
return 0;
|
||||
|
||||
BUILD_BUG_ON(pgd_devmap(orig));
|
||||
refs = 0;
|
||||
|
||||
page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
|
||||
do {
|
||||
pages[*nr] = page;
|
||||
(*nr)++;
|
||||
page++;
|
||||
refs++;
|
||||
} while (addr += PAGE_SIZE, addr != end);
|
||||
refs = record_subpages(page, addr, end, pages + *nr);
|
||||
|
||||
head = try_get_compound_head(pgd_page(orig), refs);
|
||||
if (!head) {
|
||||
*nr -= refs;
|
||||
if (!head)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
|
||||
*nr -= refs;
|
||||
while (refs--)
|
||||
put_page(head);
|
||||
put_compound_head(head, refs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*nr += refs;
|
||||
SetPageReferenced(head);
|
||||
return 1;
|
||||
}
|
||||
@@ -2237,7 +2271,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
|
||||
pud_t pud = READ_ONCE(*pudp);
|
||||
|
||||
next = pud_addr_end(addr, end);
|
||||
if (pud_none(pud))
|
||||
if (unlikely(!pud_present(pud)))
|
||||
return 0;
|
||||
if (unlikely(pud_huge(pud))) {
|
||||
if (!gup_huge_pud(pud, pudp, addr, next, flags,
|
||||
@@ -2393,29 +2427,15 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_user_pages_fast() - pin user pages in memory
|
||||
* @start: starting user address
|
||||
* @nr_pages: number of pages from start to pin
|
||||
* @gup_flags: flags modifying pin behaviour
|
||||
* @pages: array that receives pointers to the pages pinned.
|
||||
* Should be at least nr_pages long.
|
||||
*
|
||||
* Attempt to pin user pages in memory without taking mm->mmap_sem.
|
||||
* If not successful, it will fall back to taking the lock and
|
||||
* calling get_user_pages().
|
||||
*
|
||||
* Returns number of pages pinned. This may be fewer than the number
|
||||
* requested. If nr_pages is 0 or negative, returns 0. If no pages
|
||||
* were pinned, returns -errno.
|
||||
*/
|
||||
int get_user_pages_fast(unsigned long start, int nr_pages,
|
||||
unsigned int gup_flags, struct page **pages)
|
||||
static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
|
||||
unsigned int gup_flags,
|
||||
struct page **pages)
|
||||
{
|
||||
unsigned long addr, len, end;
|
||||
int nr = 0, ret = 0;
|
||||
|
||||
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
|
||||
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
|
||||
FOLL_FORCE | FOLL_PIN)))
|
||||
return -EINVAL;
|
||||
|
||||
start = untagged_addr(start) & PAGE_MASK;
|
||||
@@ -2455,4 +2475,103 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_user_pages_fast() - pin user pages in memory
|
||||
* @start: starting user address
|
||||
* @nr_pages: number of pages from start to pin
|
||||
* @gup_flags: flags modifying pin behaviour
|
||||
* @pages: array that receives pointers to the pages pinned.
|
||||
* Should be at least nr_pages long.
|
||||
*
|
||||
* Attempt to pin user pages in memory without taking mm->mmap_sem.
|
||||
* If not successful, it will fall back to taking the lock and
|
||||
* calling get_user_pages().
|
||||
*
|
||||
* Returns number of pages pinned. This may be fewer than the number requested.
|
||||
* If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
|
||||
* -errno.
|
||||
*/
|
||||
int get_user_pages_fast(unsigned long start, int nr_pages,
|
||||
unsigned int gup_flags, struct page **pages)
|
||||
{
|
||||
/*
|
||||
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
|
||||
* never directly by the caller, so enforce that:
|
||||
*/
|
||||
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
|
||||
return -EINVAL;
|
||||
|
||||
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(get_user_pages_fast);
|
||||
|
||||
/**
|
||||
* pin_user_pages_fast() - pin user pages in memory without taking locks
|
||||
*
|
||||
* For now, this is a placeholder function, until various call sites are
|
||||
* converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
|
||||
* this is identical to get_user_pages_fast().
|
||||
*
|
||||
* This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
|
||||
* is NOT intended for Case 2 (RDMA: long-term pins).
|
||||
*/
|
||||
int pin_user_pages_fast(unsigned long start, int nr_pages,
|
||||
unsigned int gup_flags, struct page **pages)
|
||||
{
|
||||
/*
|
||||
* This is a placeholder, until the pin functionality is activated.
|
||||
* Until then, just behave like the corresponding get_user_pages*()
|
||||
* routine.
|
||||
*/
|
||||
return get_user_pages_fast(start, nr_pages, gup_flags, pages);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
|
||||
|
||||
/**
|
||||
* pin_user_pages_remote() - pin pages of a remote process (task != current)
|
||||
*
|
||||
* For now, this is a placeholder function, until various call sites are
|
||||
* converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
|
||||
* this is identical to get_user_pages_remote().
|
||||
*
|
||||
* This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
|
||||
* is NOT intended for Case 2 (RDMA: long-term pins).
|
||||
*/
|
||||
long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
unsigned int gup_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas, int *locked)
|
||||
{
|
||||
/*
|
||||
* This is a placeholder, until the pin functionality is activated.
|
||||
* Until then, just behave like the corresponding get_user_pages*()
|
||||
* routine.
|
||||
*/
|
||||
return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages,
|
||||
vmas, locked);
|
||||
}
|
||||
EXPORT_SYMBOL(pin_user_pages_remote);
|
||||
|
||||
/**
|
||||
* pin_user_pages() - pin user pages in memory for use by other devices
|
||||
*
|
||||
* For now, this is a placeholder function, until various call sites are
|
||||
* converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
|
||||
* this is identical to get_user_pages().
|
||||
*
|
||||
* This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
|
||||
* is NOT intended for Case 2 (RDMA: long-term pins).
|
||||
*/
|
||||
long pin_user_pages(unsigned long start, unsigned long nr_pages,
|
||||
unsigned int gup_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas)
|
||||
{
|
||||
/*
|
||||
* This is a placeholder, until the pin functionality is activated.
|
||||
* Until then, just behave like the corresponding get_user_pages*()
|
||||
* routine.
|
||||
*/
|
||||
return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
|
||||
}
|
||||
EXPORT_SYMBOL(pin_user_pages);
|
||||
|
||||
@@ -49,18 +49,21 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
|
||||
nr = (next - addr) / PAGE_SIZE;
|
||||
}
|
||||
|
||||
/* Filter out most gup flags: only allow a tiny subset here: */
|
||||
gup->flags &= FOLL_WRITE;
|
||||
|
||||
switch (cmd) {
|
||||
case GUP_FAST_BENCHMARK:
|
||||
nr = get_user_pages_fast(addr, nr, gup->flags & 1,
|
||||
nr = get_user_pages_fast(addr, nr, gup->flags,
|
||||
pages + i);
|
||||
break;
|
||||
case GUP_LONGTERM_BENCHMARK:
|
||||
nr = get_user_pages(addr, nr,
|
||||
(gup->flags & 1) | FOLL_LONGTERM,
|
||||
gup->flags | FOLL_LONGTERM,
|
||||
pages + i, NULL);
|
||||
break;
|
||||
case GUP_BENCHMARK:
|
||||
nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
|
||||
nr = get_user_pages(addr, nr, gup->flags, pages + i,
|
||||
NULL);
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -177,16 +177,13 @@ static ssize_t enabled_store(struct kobject *kobj,
|
||||
{
|
||||
ssize_t ret = count;
|
||||
|
||||
if (!memcmp("always", buf,
|
||||
min(sizeof("always")-1, count))) {
|
||||
if (sysfs_streq(buf, "always")) {
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
|
||||
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
|
||||
} else if (!memcmp("madvise", buf,
|
||||
min(sizeof("madvise")-1, count))) {
|
||||
} else if (sysfs_streq(buf, "madvise")) {
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
|
||||
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
|
||||
} else if (!memcmp("never", buf,
|
||||
min(sizeof("never")-1, count))) {
|
||||
} else if (sysfs_streq(buf, "never")) {
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
|
||||
} else
|
||||
@@ -250,32 +247,27 @@ static ssize_t defrag_store(struct kobject *kobj,
|
||||
struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
if (!memcmp("always", buf,
|
||||
min(sizeof("always")-1, count))) {
|
||||
if (sysfs_streq(buf, "always")) {
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
|
||||
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
|
||||
} else if (!memcmp("defer+madvise", buf,
|
||||
min(sizeof("defer+madvise")-1, count))) {
|
||||
} else if (sysfs_streq(buf, "defer+madvise")) {
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
|
||||
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
|
||||
} else if (!memcmp("defer", buf,
|
||||
min(sizeof("defer")-1, count))) {
|
||||
} else if (sysfs_streq(buf, "defer")) {
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
|
||||
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
|
||||
} else if (!memcmp("madvise", buf,
|
||||
min(sizeof("madvise")-1, count))) {
|
||||
} else if (sysfs_streq(buf, "madvise")) {
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
|
||||
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
|
||||
} else if (!memcmp("never", buf,
|
||||
min(sizeof("never")-1, count))) {
|
||||
} else if (sysfs_streq(buf, "never")) {
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
|
||||
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
|
||||
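The switch from memcmp()-with-min() to sysfs_streq() in enabled_store() and defrag_store() tightens input parsing: the old check compared at most count bytes, so a truncated or over-long write could still match, while sysfs_streq() requires a full match and tolerates only an optional trailing newline. Roughly:

    /* Illustrative behaviour (sketch, not part of the diff):
     *
     *   old: !memcmp("always", buf, min(sizeof("always")-1, count))
     *          "alw" (count == 3)   -> only 3 bytes compared, matches
     *          "alwaysnonsense"     -> trailing bytes never checked, matches
     *
     *   new: sysfs_streq(buf, "always")
     *          matches "always" and "always\n" only
     */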
@@ -2715,7 +2707,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
|
||||
struct deferred_split *ds_queue = get_deferred_split_queue(page);
|
||||
struct deferred_split *ds_queue = get_deferred_split_queue(head);
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
struct address_space *mapping = NULL;
|
||||
int count, mapcount, extra_pins, ret;
|
||||
@@ -2723,11 +2715,11 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
unsigned long flags;
|
||||
pgoff_t end;
|
||||
|
||||
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
VM_BUG_ON_PAGE(!PageCompound(page), page);
|
||||
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
|
||||
VM_BUG_ON_PAGE(!PageLocked(head), head);
|
||||
VM_BUG_ON_PAGE(!PageCompound(head), head);
|
||||
|
||||
if (PageWriteback(page))
|
||||
if (PageWriteback(head))
|
||||
return -EBUSY;
|
||||
|
||||
if (PageAnon(head)) {
|
||||
@@ -2778,7 +2770,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
mlocked = PageMlocked(page);
|
||||
mlocked = PageMlocked(head);
|
||||
unmap_page(head);
|
||||
VM_BUG_ON_PAGE(compound_mapcount(head), head);
|
||||
|
||||
@@ -2810,14 +2802,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
ds_queue->split_queue_len--;
|
||||
list_del(page_deferred_list(head));
|
||||
}
|
||||
spin_unlock(&ds_queue->split_queue_lock);
|
||||
if (mapping) {
|
||||
if (PageSwapBacked(page))
|
||||
__dec_node_page_state(page, NR_SHMEM_THPS);
|
||||
if (PageSwapBacked(head))
|
||||
__dec_node_page_state(head, NR_SHMEM_THPS);
|
||||
else
|
||||
__dec_node_page_state(page, NR_FILE_THPS);
|
||||
__dec_node_page_state(head, NR_FILE_THPS);
|
||||
}
|
||||
|
||||
spin_unlock(&ds_queue->split_queue_lock);
|
||||
__split_huge_page(page, list, end, flags);
|
||||
if (PageSwapCache(head)) {
|
||||
swp_entry_t entry = { .val = page_private(head) };
|
||||
|
||||
mm/kmemleak.c
@@ -13,7 +13,7 @@
|
||||
*
|
||||
* The following locks and mutexes are used by kmemleak:
|
||||
*
|
||||
* - kmemleak_lock (rwlock): protects the object_list modifications and
|
||||
* - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
|
||||
* accesses to the object_tree_root. The object_list is the main list
|
||||
* holding the metadata (struct kmemleak_object) for the allocated memory
|
||||
* blocks. The object_tree_root is a red black tree used to look-up
|
||||
@@ -22,13 +22,13 @@
|
||||
* object_tree_root in the create_object() function called from the
|
||||
* kmemleak_alloc() callback and removed in delete_object() called from the
|
||||
* kmemleak_free() callback
|
||||
* - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
|
||||
* the metadata (e.g. count) are protected by this lock. Note that some
|
||||
* members of this structure may be protected by other means (atomic or
|
||||
* kmemleak_lock). This lock is also held when scanning the corresponding
|
||||
* memory block to avoid the kernel freeing it via the kmemleak_free()
|
||||
* callback. This is less heavyweight than holding a global lock like
|
||||
* kmemleak_lock during scanning
|
||||
* - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
|
||||
* Accesses to the metadata (e.g. count) are protected by this lock. Note
|
||||
* that some members of this structure may be protected by other means
|
||||
* (atomic or kmemleak_lock). This lock is also held when scanning the
|
||||
* corresponding memory block to avoid the kernel freeing it via the
|
||||
* kmemleak_free() callback. This is less heavyweight than holding a global
|
||||
* lock like kmemleak_lock during scanning.
|
||||
* - scan_mutex (mutex): ensures that only one thread may scan the memory for
|
||||
* unreferenced objects at a time. The gray_list contains the objects which
|
||||
* are already referenced or marked as false positives and need to be
|
||||
@@ -135,7 +135,7 @@ struct kmemleak_scan_area {
|
||||
* (use_count) and freed using the RCU mechanism.
|
||||
*/
|
||||
struct kmemleak_object {
|
||||
spinlock_t lock;
|
||||
raw_spinlock_t lock;
|
||||
unsigned int flags; /* object status flags */
|
||||
struct list_head object_list;
|
||||
struct list_head gray_list;
|
||||
@@ -191,8 +191,8 @@ static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
|
||||
static LIST_HEAD(mem_pool_free_list);
|
||||
/* search tree for object boundaries */
|
||||
static struct rb_root object_tree_root = RB_ROOT;
|
||||
/* rw_lock protecting the access to object_list and object_tree_root */
|
||||
static DEFINE_RWLOCK(kmemleak_lock);
|
||||
/* protecting the access to object_list and object_tree_root */
|
||||
static DEFINE_RAW_SPINLOCK(kmemleak_lock);
|
||||
|
||||
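These kmemleak hunks convert kmemleak_lock from an rwlock and kmemleak_object.lock from a spinlock to raw_spinlock_t, presumably so the locks stay real spinlocks in contexts (such as PREEMPT_RT) where ordinary spinlocks become sleeping locks. The conversion itself is mechanical; the recurring pattern in the hunks below is:

    unsigned long flags;

    raw_spin_lock_irqsave(&kmemleak_lock, flags);
    /* ... walk or update object_list / object_tree_root ... */
    raw_spin_unlock_irqrestore(&kmemleak_lock, flags);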
/* allocation caches for kmemleak internal data */
|
||||
static struct kmem_cache *object_cache;
|
||||
@@ -426,7 +426,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
|
||||
}
|
||||
|
||||
/* slab allocation failed, try the memory pool */
|
||||
write_lock_irqsave(&kmemleak_lock, flags);
|
||||
raw_spin_lock_irqsave(&kmemleak_lock, flags);
|
||||
object = list_first_entry_or_null(&mem_pool_free_list,
|
||||
typeof(*object), object_list);
|
||||
if (object)
|
||||
@@ -435,7 +435,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
|
||||
object = &mem_pool[--mem_pool_free_count];
|
||||
else
|
||||
pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
|
||||
write_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
|
||||
return object;
|
||||
}
|
||||
@@ -453,9 +453,9 @@ static void mem_pool_free(struct kmemleak_object *object)
|
||||
}
|
||||
|
||||
/* add the object to the memory pool free list */
|
||||
write_lock_irqsave(&kmemleak_lock, flags);
|
||||
raw_spin_lock_irqsave(&kmemleak_lock, flags);
|
||||
list_add(&object->object_list, &mem_pool_free_list);
|
||||
write_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -514,9 +514,9 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
|
||||
struct kmemleak_object *object;
|
||||
|
||||
rcu_read_lock();
|
||||
read_lock_irqsave(&kmemleak_lock, flags);
|
||||
raw_spin_lock_irqsave(&kmemleak_lock, flags);
|
||||
object = lookup_object(ptr, alias);
|
||||
read_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
|
||||
/* check whether the object is still available */
|
||||
if (object && !get_object(object))
|
||||
@@ -546,11 +546,11 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
|
||||
unsigned long flags;
|
||||
struct kmemleak_object *object;
|
||||
|
||||
write_lock_irqsave(&kmemleak_lock, flags);
|
||||
raw_spin_lock_irqsave(&kmemleak_lock, flags);
|
||||
object = lookup_object(ptr, alias);
|
||||
if (object)
|
||||
__remove_object(object);
|
||||
write_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
|
||||
return object;
|
||||
}
|
||||
@@ -585,7 +585,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
|
||||
INIT_LIST_HEAD(&object->object_list);
|
||||
INIT_LIST_HEAD(&object->gray_list);
|
||||
INIT_HLIST_HEAD(&object->area_list);
|
||||
spin_lock_init(&object->lock);
|
||||
raw_spin_lock_init(&object->lock);
|
||||
atomic_set(&object->use_count, 1);
|
||||
object->flags = OBJECT_ALLOCATED;
|
||||
object->pointer = ptr;
|
||||
@@ -617,7 +617,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
|
||||
/* kernel backtrace */
|
||||
object->trace_len = __save_stack_trace(object->trace);
|
||||
|
||||
write_lock_irqsave(&kmemleak_lock, flags);
|
||||
raw_spin_lock_irqsave(&kmemleak_lock, flags);
|
||||
|
||||
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
|
||||
min_addr = min(min_addr, untagged_ptr);
|
||||
@@ -649,7 +649,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
|
||||
|
||||
list_add_tail_rcu(&object->object_list, &object_list);
|
||||
out:
|
||||
write_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
return object;
|
||||
}
|
||||
|
||||
@@ -667,9 +667,9 @@ static void __delete_object(struct kmemleak_object *object)
|
||||
* Locking here also ensures that the corresponding memory block
|
||||
* cannot be freed when it is being scanned.
|
||||
*/
|
||||
spin_lock_irqsave(&object->lock, flags);
|
||||
raw_spin_lock_irqsave(&object->lock, flags);
|
||||
object->flags &= ~OBJECT_ALLOCATED;
|
||||
spin_unlock_irqrestore(&object->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&object->lock, flags);
|
||||
put_object(object);
|
||||
}
|
||||
|
||||
@@ -739,9 +739,9 @@ static void paint_it(struct kmemleak_object *object, int color)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&object->lock, flags);
|
||||
raw_spin_lock_irqsave(&object->lock, flags);
|
||||
__paint_it(object, color);
|
||||
spin_unlock_irqrestore(&object->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&object->lock, flags);
|
||||
}
|
||||
|
||||
static void paint_ptr(unsigned long ptr, int color)
|
||||
@@ -798,7 +798,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
|
||||
if (scan_area_cache)
|
||||
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
|
||||
|
||||
spin_lock_irqsave(&object->lock, flags);
|
||||
raw_spin_lock_irqsave(&object->lock, flags);
|
||||
if (!area) {
|
||||
pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
|
||||
/* mark the object for full scan to avoid false positives */
|
||||
@@ -820,7 +820,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
|
||||
|
||||
hlist_add_head(&area->node, &object->area_list);
|
||||
out_unlock:
|
||||
spin_unlock_irqrestore(&object->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&object->lock, flags);
|
||||
put_object(object);
|
||||
}
|
||||
|
||||
@@ -842,9 +842,9 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
|
||||
return;
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&object->lock, flags);
|
||||
raw_spin_lock_irqsave(&object->lock, flags);
|
||||
object->excess_ref = excess_ref;
|
||||
spin_unlock_irqrestore(&object->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&object->lock, flags);
|
||||
put_object(object);
|
||||
}
|
||||
|
||||
@@ -864,9 +864,9 @@ static void object_no_scan(unsigned long ptr)
|
||||
return;
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&object->lock, flags);
|
||||
raw_spin_lock_irqsave(&object->lock, flags);
|
||||
object->flags |= OBJECT_NO_SCAN;
|
||||
spin_unlock_irqrestore(&object->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&object->lock, flags);
|
||||
put_object(object);
|
||||
}
|
||||
|
||||
@@ -1026,9 +1026,9 @@ void __ref kmemleak_update_trace(const void *ptr)
|
||||
return;
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&object->lock, flags);
|
||||
raw_spin_lock_irqsave(&object->lock, flags);
|
||||
object->trace_len = __save_stack_trace(object->trace);
|
||||
spin_unlock_irqrestore(&object->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&object->lock, flags);
|
||||
|
||||
put_object(object);
|
||||
}
|
||||
@@ -1233,7 +1233,7 @@ static void scan_block(void *_start, void *_end,
|
||||
unsigned long flags;
|
||||
unsigned long untagged_ptr;
|
||||
|
||||
read_lock_irqsave(&kmemleak_lock, flags);
|
||||
raw_spin_lock_irqsave(&kmemleak_lock, flags);
|
||||
for (ptr = start; ptr < end; ptr++) {
|
||||
struct kmemleak_object *object;
|
||||
unsigned long pointer;
|
||||
@@ -1268,7 +1268,7 @@ static void scan_block(void *_start, void *_end,
|
||||
* previously acquired in scan_object(). These locks are
|
||||
* enclosed by scan_mutex.
|
||||
*/
|
||||
spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
|
||||
raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
|
||||
/* only pass surplus references (object already gray) */
|
||||
if (color_gray(object)) {
|
||||
excess_ref = object->excess_ref;
|
||||
@@ -1277,7 +1277,7 @@ static void scan_block(void *_start, void *_end,
|
||||
excess_ref = 0;
|
||||
update_refs(object);
|
||||
}
|
||||
spin_unlock(&object->lock);
|
||||
raw_spin_unlock(&object->lock);
|
||||
|
||||
if (excess_ref) {
|
||||
object = lookup_object(excess_ref, 0);
|
||||
@@ -1286,12 +1286,12 @@ static void scan_block(void *_start, void *_end,
|
||||
if (object == scanned)
|
||||
/* circular reference, ignore */
|
||||
continue;
|
||||
spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
|
||||
raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
|
||||
update_refs(object);
|
||||
spin_unlock(&object->lock);
|
||||
raw_spin_unlock(&object->lock);
|
||||
}
|
||||
}
|
||||
read_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1324,7 +1324,7 @@ static void scan_object(struct kmemleak_object *object)
|
||||
* Once the object->lock is acquired, the corresponding memory block
|
||||
* cannot be freed (the same lock is acquired in delete_object).
|
||||
*/
|
||||
spin_lock_irqsave(&object->lock, flags);
|
||||
raw_spin_lock_irqsave(&object->lock, flags);
|
||||
if (object->flags & OBJECT_NO_SCAN)
|
||||
goto out;
|
||||
if (!(object->flags & OBJECT_ALLOCATED))
@@ -1344,9 +1344,9 @@ static void scan_object(struct kmemleak_object *object)
if (start >= end)
break;

spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
cond_resched();
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
} while (object->flags & OBJECT_ALLOCATED);
} else
hlist_for_each_entry(area, &object->area_list, node)
@@ -1354,7 +1354,7 @@ static void scan_object(struct kmemleak_object *object)
(void *)(area->start + area->size),
object);
out:
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}

/*
@@ -1407,7 +1407,7 @@ static void kmemleak_scan(void)
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
@@ -1424,7 +1424,7 @@ static void kmemleak_scan(void)
if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);

spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();

@@ -1492,14 +1492,14 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();

@@ -1519,7 +1519,7 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
@@ -1529,7 +1529,7 @@ static void kmemleak_scan(void)

new_leaks++;
}
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();

@@ -1681,10 +1681,10 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
struct kmemleak_object *object = v;
unsigned long flags;

spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
print_unreferenced(seq, object);
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
return 0;
}

@@ -1714,9 +1714,9 @@ static int dump_str_object_info(const char *str)
return -EINVAL;
}

spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
dump_object_info(object);
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);

put_object(object);
return 0;
@@ -1735,11 +1735,11 @@ static void kmemleak_clear(void)

rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
spin_lock_irqsave(&object->lock, flags);
raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
spin_unlock_irqrestore(&object->lock, flags);
raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();

mm/memblock.c
@@ -575,7 +575,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* Return:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_add_range(struct memblock_type *type,
static int __init_memblock memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
int nid, enum memblock_flags flags)
{
@@ -694,7 +694,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;

memblock_dbg("memblock_add: [%pa-%pa] %pS\n",
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);

return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
@@ -795,7 +795,7 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;

memblock_dbg("memblock_remove: [%pa-%pa] %pS\n",
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);

return memblock_remove_range(&memblock.memory, base, size);
@@ -813,7 +813,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;

memblock_dbg(" memblock_free: [%pa-%pa] %pS\n",
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);

kmemleak_free_part_phys(base, size);
@@ -824,12 +824,24 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;

memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n",
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);

return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}

#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;

memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);

return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0);
}
#endif

/**
* memblock_setclr_flag - set or clear flag for a memory region
* @base: base address of the region

mm/memcontrol.c
@@ -5340,14 +5340,6 @@ static int mem_cgroup_move_account(struct page *page,
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (compound && !list_empty(page_deferred_list(page))) {
spin_lock(&from->deferred_split_queue.split_queue_lock);
list_del_init(page_deferred_list(page));
from->deferred_split_queue.split_queue_len--;
spin_unlock(&from->deferred_split_queue.split_queue_lock);
}
#endif
/*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
@@ -5357,16 +5349,6 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
page->mem_cgroup = to;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (compound && list_empty(page_deferred_list(page))) {
spin_lock(&to->deferred_split_queue.split_queue_lock);
list_add_tail(page_deferred_list(page),
&to->deferred_split_queue.split_queue);
to->deferred_split_queue.split_queue_len++;
spin_unlock(&to->deferred_split_queue.split_queue_lock);
}
#endif

spin_unlock_irqrestore(&from->move_lock, flags);

ret = 0;
@@ -6651,7 +6633,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
bool compound;
unsigned long flags;

VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -6673,8 +6654,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return;

/* Force-charge the new page. The old one will be freed soon */
compound = PageTransHuge(newpage);
nr_pages = compound ? hpage_nr_pages(newpage) : 1;
nr_pages = hpage_nr_pages(newpage);

page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
@@ -6684,7 +6664,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
commit_charge(newpage, memcg, false);

local_irq_save(flags);
mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage),
nr_pages);
memcg_check_events(memcg, newpage);
local_irq_restore(flags);
}

mm/memory_hotplug.c
@@ -783,27 +783,18 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}

int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
int online_type, int nid)
{
unsigned long flags;
unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
int nid;
int ret;
struct memory_notify arg;
struct memory_block *mem;

mem_hotplug_begin();

/*
* We can't use pfn_to_nid() because nid might be stored in struct page
* which is not yet initialized. Instead, we find nid from memory block.
*/
mem = find_memory_block(__pfn_to_section(pfn));
nid = mem->nid;
put_device(&mem->dev);

/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
@@ -1182,7 +1173,7 @@ static bool is_pageblock_removable_nolock(unsigned long pfn)
if (!zone_spans_pfn(zone, pfn))
return false;

return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE,
return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
MEMORY_OFFLINE);
}

@@ -1764,8 +1755,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)

BUG_ON(check_hotplug_memory_range(start, size));

mem_hotplug_begin();

/*
* All memory blocks must be offlined before removing memory. Check
* whether all memory blocks in question are offline and return error
@@ -1778,9 +1767,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");

/* remove memory block devices before removing memory */
/*
* Memory block device removal under the device_hotplug_lock is
* a barrier against racing online attempts.
*/
remove_memory_block_devices(start, size);

mem_hotplug_begin();

arch_remove_memory(nid, start, size, NULL);
memblock_free(start, size);
memblock_remove(start, size);

mm/mempolicy.c
@@ -2821,6 +2821,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
char *flags = strchr(str, '=');
int err = 1, mode;

if (flags)
*flags++ = '\0'; /* terminate mode string */

if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
@@ -2831,9 +2834,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
} else
nodes_clear(nodes);

if (flags)
*flags++ = '\0'; /* terminate mode string */

mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
goto out;

mm/memremap.c
@@ -27,7 +27,8 @@ static void devmap_managed_enable_put(void)

static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
if (!pgmap->ops || !pgmap->ops->page_free) {
if (pgmap->type == MEMORY_DEVICE_PRIVATE &&
(!pgmap->ops || !pgmap->ops->page_free)) {
WARN(1, "Missing page_free method\n");
return -EINVAL;
}
@@ -410,48 +411,42 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
EXPORT_SYMBOL_GPL(get_dev_pagemap);

#ifdef CONFIG_DEV_PAGEMAP_OPS
void __put_devmap_managed_page(struct page *page)
void free_devmap_managed_page(struct page *page)
{
int count = page_ref_dec_return(page);
/* notify page idle for dax */
if (!is_device_private_page(page)) {
wake_up_var(&page->_refcount);
return;
}

/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive(page);
__ClearPageWaiters(page);

mem_cgroup_uncharge(page);

/*
* If refcount is 1 then page is freed and refcount is stable as nobody
* holds a reference on the page.
* When a device_private page is freed, the page->mapping field
* may still contain a (stale) mapping value. For example, the
* lower bits of page->mapping may still identify the page as an
* anonymous page. Ultimately, this entire field is just stale
* and wrong, and it will cause errors if not cleared. One
* example is:
*
* migrate_vma_pages()
* migrate_vma_insert_page()
* page_add_new_anon_rmap()
* __page_set_anon_rmap()
* ...checks page->mapping, via PageAnon(page) call,
* and incorrectly concludes that the page is an
* anonymous page. Therefore, it incorrectly,
* silently fails to set up the new anon rmap.
*
* For other types of ZONE_DEVICE pages, migration is either
* handled differently or not done at all, so there is no need
* to clear page->mapping.
*/
if (count == 1) {
/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive(page);
__ClearPageWaiters(page);

mem_cgroup_uncharge(page);

/*
* When a device_private page is freed, the page->mapping field
* may still contain a (stale) mapping value. For example, the
* lower bits of page->mapping may still identify the page as
* an anonymous page. Ultimately, this entire field is just
* stale and wrong, and it will cause errors if not cleared.
* One example is:
*
* migrate_vma_pages()
* migrate_vma_insert_page()
* page_add_new_anon_rmap()
* __page_set_anon_rmap()
* ...checks page->mapping, via PageAnon(page) call,
* and incorrectly concludes that the page is an
* anonymous page. Therefore, it incorrectly,
* silently fails to set up the new anon rmap.
*
* For other types of ZONE_DEVICE pages, migration is either
* handled differently or not done at all, so there is no need
* to clear page->mapping.
*/
if (is_device_private_page(page))
page->mapping = NULL;

page->pgmap->ops->page_free(page);
} else if (!count)
__put_page(page);
page->mapping = NULL;
page->pgmap->ops->page_free(page);
}
EXPORT_SYMBOL(__put_devmap_managed_page);
#endif /* CONFIG_DEV_PAGEMAP_OPS */

mm/migrate.c
@@ -48,6 +48,7 @@
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>

#include <asm/tlbflush.h>

@@ -986,7 +987,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
}

/*
* Anonymous and movable page->mapping will be cleard by
* Anonymous and movable page->mapping will be cleared by
* free_pages_prepare so don't reset it here for keeping
* the type to work PageAnon, for example.
*/
@@ -1199,8 +1200,7 @@ out:
/*
* A page that has been migrated has all references
* removed and will be freed. A page that has not been
* migrated will have kepts its references and be
* restored.
* migrated will have kept its references and be restored.
*/
list_del(&page->lru);

@@ -1627,8 +1627,19 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
start = i;
} else if (node != current_node) {
err = do_move_pages_to_node(mm, &pagelist, current_node);
if (err)
if (err) {
/*
* Positive err means the number of failed
* pages to migrate. Since we are going to
* abort and return the number of non-migrated
* pages, so need to incude the rest of the
* nr_pages that have not been attempted as
* well.
*/
if (err > 0)
err += nr_pages - i - 1;
goto out;
}
err = store_status(status, start, current_node, i - start);
if (err)
goto out;
@@ -1659,8 +1670,11 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
goto out_flush;

err = do_move_pages_to_node(mm, &pagelist, current_node);
if (err)
if (err) {
if (err > 0)
err += nr_pages - i - 1;
goto out;
}
if (i > start) {
err = store_status(status, start, current_node, i - start);
if (err)
@@ -1674,9 +1688,16 @@ out_flush:

/* Make sure we do not overwrite the existing error */
err1 = do_move_pages_to_node(mm, &pagelist, current_node);
/*
* Don't have to report non-attempted pages here since:
* - If the above loop is done gracefully all pages have been
*   attempted.
* - If the above loop is aborted it means a fatal error
*   happened, should return ret.
*/
if (!err1)
err1 = store_status(status, start, current_node, i - start);
if (!err)
if (err >= 0)
err = err1;
out:
return err;
@@ -2135,7 +2156,7 @@ static int migrate_vma_collect_hole(unsigned long start,
struct migrate_vma *migrate = walk->private;
unsigned long addr;

for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
migrate->npages++;
@@ -2152,7 +2173,7 @@ static int migrate_vma_collect_skip(unsigned long start,
struct migrate_vma *migrate = walk->private;
unsigned long addr;

for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = 0;
}
@@ -2675,6 +2696,14 @@ int migrate_vma_setup(struct migrate_vma *args)
}
EXPORT_SYMBOL(migrate_vma_setup);

/*
* This code closely matches the code in:
*   __handle_mm_fault()
*     handle_pte_fault()
*       do_anonymous_page()
* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
* private page.
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
struct page *page,
@@ -2755,30 +2784,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,

ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);

if (check_stable_address_space(mm))
goto unlock_abort;

if (pte_present(*ptep)) {
unsigned long pfn = pte_pfn(*ptep);

if (!is_zero_pfn(pfn)) {
pte_unmap_unlock(ptep, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
goto abort;
}
if (!is_zero_pfn(pfn))
goto unlock_abort;
flush = true;
} else if (!pte_none(*ptep)) {
pte_unmap_unlock(ptep, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
goto abort;
}
} else if (!pte_none(*ptep))
goto unlock_abort;

/*
* Check for usefaultfd but do not deliver the fault. Instead,
* Check for userfaultfd but do not deliver the fault. Instead,
* just back off.
*/
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(ptep, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
goto abort;
}
if (userfaultfd_missing(vma))
goto unlock_abort;

inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
@@ -2802,6 +2825,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
*src = MIGRATE_PFN_MIGRATE;
return;

unlock_abort:
pte_unmap_unlock(ptep, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
@@ -2834,9 +2860,8 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}

if (!page) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
}
if (!notified) {
notified = true;

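The do_pages_move() hunks above change the error accounting so that, when a batch move fails partway through, the positive return value also counts the pages that were never attempted. A minimal userspace sketch of that accounting rule follows; move_batch() and all names here are hypothetical stand-ins, not the kernel API.

```c
#include <stdio.h>

/* Hypothetical mover: 0 on success, >0 = number of pages that failed. */
static int move_batch(int page)
{
	return (page == 3) ? 1 : 0;	/* pretend page 3 always fails */
}

/* Mirrors the rule in the diff: on abort at index i, a positive error
 * is topped up with the pages that were never attempted. */
static int move_all(int nr_pages)
{
	for (int i = 0; i < nr_pages; i++) {
		int err = move_batch(i);
		if (err) {
			if (err > 0)
				err += nr_pages - i - 1;
			return err;	/* total non-migrated pages */
		}
	}
	return 0;
}

int main(void)
{
	printf("non-migrated: %d of 10\n", move_all(10));	/* prints 7 */
	return 0;
}
```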
mm/mmap.c
@@ -1270,26 +1270,22 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma;
struct vm_area_struct *near;
struct anon_vma *anon_vma = NULL;

near = vma->vm_next;
if (!near)
goto try_prev;
/* Try next first. */
if (vma->vm_next) {
anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
if (anon_vma)
return anon_vma;
}

anon_vma = reusable_anon_vma(near, vma, near);
if (anon_vma)
return anon_vma;
try_prev:
near = vma->vm_prev;
if (!near)
goto none;
/* Try prev next. */
if (vma->vm_prev)
anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);

anon_vma = reusable_anon_vma(near, near, vma);
if (anon_vma)
return anon_vma;
none:
/*
* We might reach here with anon_vma == NULL if we can't find
* any reusable anon_vma.
* There's no absolute need to look only at touching neighbours:
* we could search further afield for "compatible" anon_vmas.
* But it would probably just be a waste of time searching,
@@ -1297,7 +1293,7 @@ none:
* We're trying to allow mprotect remerging later on,
* not trying to minimize memory used for anon_vmas.
*/
return NULL;
return anon_vma;
}

/*

mm/oom_kill.c
@@ -26,6 +26,7 @@
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
@@ -620,6 +621,7 @@ static void oom_reap_task(struct task_struct *tsk)

pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
sched_show_task(tsk);
debug_show_all_locks();

done:
mm/page_alloc.c
@@ -5848,6 +5848,30 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
return false;
}

#ifdef CONFIG_SPARSEMEM
/* Skip PFNs that belong to non-present sections */
static inline __meminit unsigned long next_pfn(unsigned long pfn)
{
unsigned long section_nr;

section_nr = pfn_to_section_nr(++pfn);
if (present_section_nr(section_nr))
return pfn;

while (++section_nr <= __highest_present_section_nr) {
if (present_section_nr(section_nr))
return section_nr_to_pfn(section_nr);
}

return -1;
}
#else
static inline __meminit unsigned long next_pfn(unsigned long pfn)
{
return pfn++;
}
#endif

/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
@@ -5887,8 +5911,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* function. They do not exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
if (!early_pfn_valid(pfn)) {
pfn = next_pfn(pfn) - 1;
continue;
}
if (!early_pfn_in_nid(pfn, nid))
continue;
if (overlap_memmap_init(zone, &pfn))
@@ -8154,20 +8180,22 @@ void *__init alloc_large_system_hash(const char *tablename,

/*
* This function checks whether pageblock includes unmovable pages or not.
* If @count is not zero, it is okay to include less @count unmovable pages
*
* PageLRU check without isolation or lru_lock could race so that
* MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
* check without lock_page also may miss some movable non-lru pages at
* race condition. So you can't expect this function should be exact.
*
* Returns a page without holding a reference. If the caller wants to
* dereference that page (e.g., dumping), it has to make sure that that it
* cannot get removed (e.g., via memory unplug) concurrently.
*
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
int migratetype, int flags)
struct page *has_unmovable_pages(struct zone *zone, struct page *page,
int migratetype, int flags)
{
unsigned long found;
unsigned long iter = 0;
unsigned long pfn = page_to_pfn(page);
const char *reason = "unmovable page";

/*
* TODO we could make this much more efficient by not checking every
@@ -8184,22 +8212,19 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* so consider them movable here.
*/
if (is_migrate_cma(migratetype))
return false;
return NULL;

reason = "CMA page";
goto unmovable;
return page;
}

for (found = 0; iter < pageblock_nr_pages; iter++) {
unsigned long check = pfn + iter;

if (!pfn_valid_within(check))
for (; iter < pageblock_nr_pages; iter++) {
if (!pfn_valid_within(pfn + iter))
continue;

page = pfn_to_page(check);
page = pfn_to_page(pfn + iter);

if (PageReserved(page))
goto unmovable;
return page;

/*
* If the zone is movable and we have ruled out all reserved
@@ -8219,7 +8244,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
unsigned int skip_pages;

if (!hugepage_migration_supported(page_hstate(head)))
goto unmovable;
return page;

skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
@@ -8245,11 +8270,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
continue;

if (__PageMovable(page))
if (__PageMovable(page) || PageLRU(page))
continue;

if (!PageLRU(page))
found++;
/*
* If there are RECLAIMABLE pages, we need to check
* it. But now, memory offline itself doesn't call
@@ -8263,15 +8286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* is set to both of a memory hole page and a _used_ kernel
* page at boot.
*/
if (found > count)
goto unmovable;
return page;
}
return false;
unmovable:
WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
if (flags & REPORT_FAILURE)
dump_page(pfn_to_page(pfn + iter), reason);
return true;
return NULL;
}

#ifdef CONFIG_CONTIG_ALLOC
@@ -8675,10 +8692,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
offlined_pages += 1 << order;
#ifdef CONFIG_DEBUG_VM
pr_info("remove from free list %lx %d %lx\n",
pfn, 1 << order, end_pfn);
#endif
del_page_from_free_area(page, &zone->free_area[order]);
pfn += (1 << order);
}

mm/page_isolation.c
@@ -17,10 +17,9 @@

static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
{
struct page *unmovable = NULL;
struct zone *zone;
unsigned long flags, pfn;
struct memory_isolate_notify arg;
int notifier_ret;
unsigned long flags;
int ret = -EBUSY;

zone = page_zone(page);
@@ -35,41 +34,12 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
if (is_migrate_isolate_page(page))
goto out;

pfn = page_to_pfn(page);
arg.start_pfn = pfn;
arg.nr_pages = pageblock_nr_pages;
arg.pages_found = 0;

/*
* It may be possible to isolate a pageblock even if the
* migratetype is not MIGRATE_MOVABLE. The memory isolation
* notifier chain is used by balloon drivers to return the
* number of pages in a range that are held by the balloon
* driver to shrink memory. If all the pages are accounted for
* by balloons, are free, or on the LRU, isolation can continue.
* Later, for example, when memory hotplug notifier runs, these
* pages reported as "can be isolated" should be isolated(freed)
* by the balloon driver through the memory notifier chain.
*/
notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
notifier_ret = notifier_to_errno(notifier_ret);
if (notifier_ret)
goto out;
/*
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
isol_flags))
ret = 0;

/*
* immobile means "not-on-lru" pages. If immobile is larger than
* removable-by-driver pages reported by notifier, we'll fail.
*/

out:
if (!ret) {
unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
if (!unmovable) {
unsigned long nr_pages;
int mt = get_pageblock_migratetype(page);

@@ -79,11 +49,24 @@ out:
NULL);

__mod_zone_freepage_state(zone, -nr_pages, mt);
ret = 0;
}

out:
spin_unlock_irqrestore(&zone->lock, flags);
if (!ret)
if (!ret) {
drain_all_pages(zone);
} else {
WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);

if ((isol_flags & REPORT_FAILURE) && unmovable)
/*
* printk() with zone->lock held will likely trigger a
* lockdep splat, so defer it here.
*/
dump_page(unmovable, "unmovable page");
}

return ret;
}

mm/page_vma_mapped.c
@@ -52,12 +52,16 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return true;
}

static inline bool pfn_in_hpage(struct page *hpage, unsigned long pfn)
static inline bool pfn_is_match(struct page *page, unsigned long pfn)
{
unsigned long hpage_pfn = page_to_pfn(hpage);
unsigned long page_pfn = page_to_pfn(page);

/* normal page and hugetlbfs page */
if (!PageTransCompound(page) || PageHuge(page))
return page_pfn == pfn;

/* THP can be referenced by any subpage */
return pfn >= hpage_pfn && pfn - hpage_pfn < hpage_nr_pages(hpage);
return pfn >= page_pfn && pfn - page_pfn < hpage_nr_pages(page);
}

/**
@@ -108,7 +112,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
pfn = pte_pfn(*pvmw->pte);
}

return pfn_in_hpage(pvmw->page, pfn);
return pfn_is_match(pvmw->page, pfn);
}

/**

mm/process_vm_access.c
@@ -42,12 +42,11 @@ static int process_vm_rw_pages(struct page **pages,
if (copy > len)
copy = len;

if (vm_write) {
if (vm_write)
copied = copy_page_from_iter(page, offset, copy, iter);
set_page_dirty_lock(page);
} else {
else
copied = copy_page_to_iter(page, offset, copy, iter);
}

len -= copied;
if (copied < copy && iov_iter_count(iter))
return -EFAULT;
@@ -96,7 +95,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
flags |= FOLL_WRITE;

while (!rc && nr_pages && iov_iter_count(iter)) {
int pages = min(nr_pages, max_pages_per_loop);
int pinned_pages = min(nr_pages, max_pages_per_loop);
int locked = 1;
size_t bytes;

@@ -106,14 +105,15 @@ static int process_vm_rw_single_vec(unsigned long addr,
* current/current->mm
*/
down_read(&mm->mmap_sem);
pages = get_user_pages_remote(task, mm, pa, pages, flags,
process_pages, NULL, &locked);
pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
flags, process_pages,
NULL, &locked);
if (locked)
up_read(&mm->mmap_sem);
if (pages <= 0)
if (pinned_pages <= 0)
return -EFAULT;

bytes = pages * PAGE_SIZE - start_offset;
bytes = pinned_pages * PAGE_SIZE - start_offset;
if (bytes > len)
bytes = len;

@@ -122,10 +122,12 @@ static int process_vm_rw_single_vec(unsigned long addr,
vm_write);
len -= bytes;
start_offset = 0;
nr_pages -= pages;
pa += pages * PAGE_SIZE;
while (pages)
put_page(process_pages[--pages]);
nr_pages -= pinned_pages;
pa += pinned_pages * PAGE_SIZE;

/* If vm_write is set, the pages need to be made dirty: */
unpin_user_pages_dirty_lock(process_pages, pinned_pages,
vm_write);
}

return rc;

mm/slub.c
@@ -439,19 +439,38 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
}

#ifdef CONFIG_SLUB_DEBUG
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);

/*
* Determine a map of object in use on a page.
*
* Node listlock must be held to guarantee that the page does
* not vanish from under us.
*/
static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
static unsigned long *get_map(struct kmem_cache *s, struct page *page)
{
void *p;
void *addr = page_address(page);

VM_BUG_ON(!irqs_disabled());

spin_lock(&object_map_lock);

bitmap_zero(object_map, page->objects);

for (p = page->freelist; p; p = get_freepointer(s, p))
set_bit(slab_index(p, s, addr), map);
set_bit(slab_index(p, s, addr), object_map);

return object_map;
}

static void put_map(unsigned long *map)
{
VM_BUG_ON(map != object_map);
lockdep_assert_held(&object_map_lock);

spin_unlock(&object_map_lock);
}

static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -3675,13 +3694,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
if (!map)
return;
unsigned long *map;

slab_err(s, page, text, s->name);
slab_lock(page);

get_map(s, page, map);
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {

if (!test_bit(slab_index(p, s, addr), map)) {
@@ -3689,8 +3707,9 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
print_tracking(s, p);
}
}
put_map(map);

slab_unlock(page);
bitmap_free(map);
#endif
}

@@ -4384,19 +4403,19 @@ static int count_total(struct page *page)
#endif

#ifdef CONFIG_SLUB_DEBUG
static void validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
static void validate_slab(struct kmem_cache *s, struct page *page)
{
void *p;
void *addr = page_address(page);
unsigned long *map;

slab_lock(page);

if (!check_slab(s, page) || !on_freelist(s, page, NULL))
return;
goto unlock;

/* Now we know that a valid freelist exists */
bitmap_zero(map, page->objects);

get_map(s, page, map);
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
u8 val = test_bit(slab_index(p, s, addr), map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
@@ -4404,18 +4423,13 @@ static void validate_slab(struct kmem_cache *s, struct page *page,
if (!check_object(s, page, p, val))
break;
}
}

static void validate_slab_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
slab_lock(page);
validate_slab(s, page, map);
put_map(map);
unlock:
slab_unlock(page);
}

static int validate_slab_node(struct kmem_cache *s,
struct kmem_cache_node *n, unsigned long *map)
struct kmem_cache_node *n)
{
unsigned long count = 0;
struct page *page;
@@ -4424,7 +4438,7 @@ static int validate_slab_node(struct kmem_cache *s,
spin_lock_irqsave(&n->list_lock, flags);

list_for_each_entry(page, &n->partial, slab_list) {
validate_slab_slab(s, page, map);
validate_slab(s, page);
count++;
}
if (count != n->nr_partial)
@@ -4435,7 +4449,7 @@ static int validate_slab_node(struct kmem_cache *s,
goto out;

list_for_each_entry(page, &n->full, slab_list) {
validate_slab_slab(s, page, map);
validate_slab(s, page);
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
@@ -4452,15 +4466,11 @@ static long validate_slab_cache(struct kmem_cache *s)
int node;
unsigned long count = 0;
struct kmem_cache_node *n;
unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);

if (!map)
return -ENOMEM;

flush_all(s);
for_each_kmem_cache_node(s, node, n)
count += validate_slab_node(s, n, map);
bitmap_free(map);
count += validate_slab_node(s, n);

return count;
}
/*
@@ -4590,18 +4600,17 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}

static void process_slab(struct loc_track *t, struct kmem_cache *s,
struct page *page, enum track_item alloc,
unsigned long *map)
struct page *page, enum track_item alloc)
{
void *addr = page_address(page);
void *p;
unsigned long *map;

bitmap_zero(map, page->objects);
get_map(s, page, map);

map = get_map(s, page);
for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
add_location(t, s, get_track(s, p, alloc));
put_map(map);
}

static int list_locations(struct kmem_cache *s, char *buf,
@@ -4612,11 +4621,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
struct loc_track t = { 0, 0, NULL };
int node;
struct kmem_cache_node *n;
unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);

if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
bitmap_free(map);
if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
return sprintf(buf, "Out of memory\n");
}
/* Push back cpu slabs */
@@ -4631,9 +4638,9 @@ static int list_locations(struct kmem_cache *s, char *buf,

spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
process_slab(&t, s, page, alloc, map);
process_slab(&t, s, page, alloc);
list_for_each_entry(page, &n->full, slab_list)
process_slab(&t, s, page, alloc, map);
process_slab(&t, s, page, alloc);
spin_unlock_irqrestore(&n->list_lock, flags);
}

@@ -4682,7 +4689,6 @@ static int list_locations(struct kmem_cache *s, char *buf,
}

free_loc_track(&t);
bitmap_free(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;

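The slub hunks above replace a per-call bitmap allocation with one statically preallocated bitmap handed out under a lock (get_map()/put_map()). A minimal userspace sketch of that shape follows, using pthreads; the names and sizes are illustrative, not the kernel interfaces.

```c
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define MAX_OBJS 512

/* One preallocated bitmap, handed out under a lock instead of being
 * allocated and freed on every call. */
static unsigned long object_map[MAX_OBJS / (8 * sizeof(unsigned long))];
static pthread_mutex_t object_map_lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned long *get_map(void)
{
	pthread_mutex_lock(&object_map_lock);	/* serialize users of the map */
	memset(object_map, 0, sizeof(object_map));
	return object_map;
}

static void put_map(unsigned long *map)
{
	/* callers must return the same static buffer they were given */
	if (map == object_map)
		pthread_mutex_unlock(&object_map_lock);
}

int main(void)
{
	unsigned long *map = get_map();
	map[0] |= 1UL;			/* mark object 0 as in use */
	printf("bit0=%lu\n", map[0] & 1UL);
	put_map(map);
	return 0;
}
```

The trade-off mirrored here: no allocation failure path in the debug helpers, at the cost of serializing callers on one lock.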
mm/sparse.c
@@ -789,7 +789,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
ms->usage = NULL;
}
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
ms->section_mem_map = (unsigned long)NULL;
}

if (section_is_early && memmap)

mm/swap.c
@@ -813,8 +813,10 @@ void release_pages(struct page **pages, int nr)
* processing, and instead, expect a call to
* put_page_testzero().
*/
if (put_devmap_managed_page(page))
if (page_is_devmap_managed(page)) {
put_devmap_managed_page(page);
continue;
}
}

page = compound_head(page);
@@ -1102,3 +1104,26 @@ void __init swap_setup(void)
* _really_ don't want to cluster much more
*/
}

#ifdef CONFIG_DEV_PAGEMAP_OPS
void put_devmap_managed_page(struct page *page)
{
int count;

if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
return;

count = page_ref_dec_return(page);

/*
* devmap page refcounts are 1-based, rather than 0-based: if
* refcount is 1, then the page is free and the refcount is
* stable because nobody holds a reference on the page.
*/
if (count == 1)
free_devmap_managed_page(page);
else if (!count)
__put_page(page);
}
EXPORT_SYMBOL(put_devmap_managed_page);
#endif

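The new put_devmap_managed_page() above encodes a 1-based refcount: a decrement that lands on 1 means the page is idle and can be handed back to its driver, while 0 means the final reference just went away. A self-contained userspace sketch of that rule, with purely hypothetical names:

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy object with a 1-based refcount, mirroring the comment above. */
struct toy_page {
	atomic_int refcount;
};

static void toy_put(struct toy_page *p)
{
	int count = atomic_fetch_sub(&p->refcount, 1) - 1;

	if (count == 1) {
		/* idle: nobody else holds a reference, notify the "driver" */
		printf("page idle, hand back to driver\n");
	} else if (count == 0) {
		/* the base reference itself was dropped */
		printf("last reference gone, freeing\n");
		free(p);
	}
}

int main(void)
{
	struct toy_page *p = malloc(sizeof(*p));
	atomic_init(&p->refcount, 3);	/* base ref + two users */
	toy_put(p);	/* 3 -> 2 */
	toy_put(p);	/* 2 -> 1: idle */
	toy_put(p);	/* 1 -> 0: freed */
	return 0;
}
```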
mm/swapfile.c
@@ -2737,10 +2737,10 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
else
type = si->type + 1;

++(*pos);
for (; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
++*pos;
return si;
}

mm/vmscan.c
@@ -146,20 +146,6 @@ struct scan_control {
struct reclaim_state reclaim_state;
};

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
prev = lru_to_page(&(_page->lru)); \
prefetch(&prev->_field); \
} \
} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
do { \
@@ -2695,7 +2681,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}

static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
@@ -2874,8 +2860,6 @@ again:
*/
if (reclaimable)
pgdat->kswapd_failures = 0;

return reclaimable;
}

/*
@@ -4126,10 +4110,8 @@ module_init(kswapd_init)
*/
int node_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */

/*
* Priority for NODE_RECLAIM. This determines the fraction of pages

mm/zswap.c
@@ -32,6 +32,7 @@
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/workqueue.h>

/*********************************
* statistics
@@ -65,6 +66,11 @@ static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
/* Pool limit was hit, we need to calm down */
static bool zswap_pool_reached_full;

/*********************************
* tunables
**********************************/
@@ -109,6 +115,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);

/* The threshold for accepting new pages after the max_pool_percent was hit */
static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
uint, 0644);

/* Enable/disable handling same-value filled pages (enabled by default) */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
@@ -123,7 +134,8 @@ struct zswap_pool {
struct crypto_comp * __percpu *tfm;
struct kref kref;
struct list_head list;
struct work_struct work;
struct work_struct release_work;
struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
@@ -214,6 +226,13 @@ static bool zswap_is_full(void)
DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static bool zswap_can_accept(void)
{
return totalram_pages() * zswap_accept_thr_percent / 100 *
zswap_max_pool_percent / 100 >
DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static void zswap_update_total_size(void)
{
struct zswap_pool *pool;
@@ -501,6 +520,16 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}

static void shrink_worker(struct work_struct *w)
{
struct zswap_pool *pool = container_of(w, typeof(*pool),
shrink_work);

if (zpool_shrink(pool->zpool, 1, NULL))
zswap_reject_reclaim_fail++;
zswap_pool_put(pool);
}

static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
struct zswap_pool *pool;
@@ -551,6 +580,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
INIT_WORK(&pool->shrink_work, shrink_worker);

zswap_pool_debug("created", pool);

@@ -624,7 +654,8 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)

static void __zswap_pool_release(struct work_struct *work)
{
struct zswap_pool *pool = container_of(work, typeof(*pool), work);
struct zswap_pool *pool = container_of(work, typeof(*pool),
release_work);

synchronize_rcu();

@@ -647,8 +678,8 @@ static void __zswap_pool_empty(struct kref *kref)

list_del_rcu(&pool->list);

INIT_WORK(&pool->work, __zswap_pool_release);
schedule_work(&pool->work);
INIT_WORK(&pool->release_work, __zswap_pool_release);
schedule_work(&pool->release_work);

spin_unlock(&zswap_pools_lock);
}
@@ -942,22 +973,6 @@ end:
return ret;
}

static int zswap_shrink(void)
{
struct zswap_pool *pool;
int ret;

pool = zswap_pool_last_get();
if (!pool)
return -ENOENT;

ret = zpool_shrink(pool->zpool, 1, NULL);

zswap_pool_put(pool);

return ret;
}

static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
unsigned int pos;
@@ -1011,21 +1026,23 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,

/* reclaim space if needed */
if (zswap_is_full()) {
zswap_pool_limit_hit++;
if (zswap_shrink()) {
zswap_reject_reclaim_fail++;
ret = -ENOMEM;
goto reject;
}
struct zswap_pool *pool;

/* A second zswap_is_full() check after
* zswap_shrink() to make sure it's now
* under the max_pool_percent
*/
if (zswap_is_full()) {
zswap_pool_limit_hit++;
zswap_pool_reached_full = true;
pool = zswap_pool_last_get();
if (pool)
queue_work(shrink_wq, &pool->shrink_work);
ret = -ENOMEM;
goto reject;
}

if (zswap_pool_reached_full) {
if (!zswap_can_accept()) {
ret = -ENOMEM;
goto reject;
}
} else
zswap_pool_reached_full = false;
}

/* allocate entry */
@@ -1332,11 +1349,18 @@ static int __init init_zswap(void)
zswap_enabled = false;
}

shrink_wq = create_workqueue("zswap-shrink");
if (!shrink_wq)
goto fallback_fail;

frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
return 0;

fallback_fail:
if (pool)
zswap_pool_destroy(pool);
hp_fail:
cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail: