Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
 "13 patches.

  Subsystems affected by this patch series: mm (memory-failure, memcg,
  userfaultfd, hugetlbfs, mremap, oom-kill, kasan, hmm), and kcov"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove()
  kcov: don't generate a warning on vm_insert_page()'s failure
  MAINTAINERS: add Vincenzo Frascino to KASAN reviewers
  oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup
  selftest/vm: add skip support to mremap_test
  selftest/vm: support xfail in mremap_test
  selftest/vm: verify remap destination address in mremap_test
  selftest/vm: verify mmap addr in mremap_test
  mm, hugetlb: allow for "high" userspace addresses
  userfaultfd: mark uffd_wp regardless of VM_WRITE flag
  memcg: sync flush only if periodic flush is delayed
  mm/memory-failure.c: skip huge_zero_page in memory_failure()
  mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()
commit 281b9d9a4b
MAINTAINERS
@@ -10547,6 +10547,7 @@ M: Andrey Ryabinin <ryabinin.a.a@gmail.com>
 R: Alexander Potapenko <glider@google.com>
 R: Andrey Konovalov <andreyknvl@gmail.com>
 R: Dmitry Vyukov <dvyukov@google.com>
+R: Vincenzo Frascino <vincenzo.frascino@arm.com>
 L: kasan-dev@googlegroups.com
 S: Maintained
 F: Documentation/dev-tools/kasan.rst
fs/hugetlbfs/inode.c
@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = current->mm->mmap_base;
-	info.high_limit = TASK_SIZE;
+	info.high_limit = arch_get_mmap_end(addr);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	return vm_unmapped_area(&info);
@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
-	info.high_limit = current->mm->mmap_base;
+	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	addr = vm_unmapped_area(&info);
@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
 		VM_BUG_ON(addr != -ENOMEM);
 		info.flags = 0;
 		info.low_limit = current->mm->mmap_base;
-		info.high_limit = TASK_SIZE;
+		info.high_limit = arch_get_mmap_end(addr);
 		addr = vm_unmapped_area(&info);
 	}

@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct hstate *h = hstate_file(file);
+	const unsigned long mmap_end = arch_get_mmap_end(addr);

 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr) {
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (mmap_end - len >= addr &&
 		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
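The four hunks above are from fs/hugetlbfs/inode.c ("mm, hugetlb: allow for "high" userspace addresses"): hugetlb_get_unmapped_area() now derives its search limits from arch_get_mmap_end()/arch_get_mmap_base() instead of TASK_SIZE and mmap_base, so MAP_HUGETLB follows the same convention as ordinary mmap() on architectures with an extended virtual address space, where addresses above the default map window are handed out only when the caller passes a hint above it. A rough userspace illustration of such a request follows; it is not part of the commit, the hint constant is made up for the example, and it assumes an arch/config with a larger-than-default address space plus a reserved hugetlb pool:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Hint above the default userspace window (illustrative value only). */
	void *hint = (void *)(1UL << 48);
	size_t len = 2UL << 20;			/* one 2 MiB hugepage */

	/* Without a high hint the kernel stays below the default window;
	 * the patched hugetlb_get_unmapped_area() lets the hint opt in. */
	char *p = mmap(hint, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	printf("hugetlb mapping at %p\n", p);
	munmap(p, len);
	return 0;
}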
include/linux/hugetlb.h
@@ -169,6 +169,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags);
 void putback_active_hugepage(struct page *page);
 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
@@ -378,6 +379,11 @@ static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
 	return 0;
 }

+static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+	return 0;
+}
+
 static inline void putback_active_hugepage(struct page *page)
 {
 }
include/linux/memcontrol.h
@@ -1012,6 +1012,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 }

 void mem_cgroup_flush_stats(void);
+void mem_cgroup_flush_stats_delayed(void);

 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			      int val);
@@ -1455,6 +1456,10 @@ static inline void mem_cgroup_flush_stats(void)
 {
 }

+static inline void mem_cgroup_flush_stats_delayed(void)
+{
+}
+
 static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
 					    enum node_stat_item idx, int val)
 {
include/linux/mm.h
@@ -3197,6 +3197,14 @@ extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+#else
+static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+	return 0;
+}
+#endif

 #ifndef arch_memory_failure
 static inline int arch_memory_failure(unsigned long pfn, int flags)
include/linux/sched.h
@@ -1443,6 +1443,7 @@ struct task_struct {
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
+	struct timer_list		oom_reaper_timer;
 #endif
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
include/linux/sched/mm.h
@@ -136,6 +136,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 #endif /* CONFIG_MEMCG */

 #ifdef CONFIG_MMU
+#ifndef arch_get_mmap_end
+#define arch_get_mmap_end(addr) (TASK_SIZE)
+#endif
+
+#ifndef arch_get_mmap_base
+#define arch_get_mmap_base(addr, base) (base)
+#endif
+
 extern void arch_pick_mmap_layout(struct mm_struct *mm,
 				  struct rlimit *rlim_stack);
 extern unsigned long
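The block added to include/linux/sched/mm.h above only supplies generic fallbacks; an architecture that supports an opt-in extended address space overrides them. For reference (quoted from memory, so treat it as approximate and check arch/arm64/include/asm/processor.h for the authoritative text), the arm64 52-bit-VA definitions look roughly like this:

/* Rough paraphrase of the arm64 overrides: hand out addresses above the
 * default window only when userspace explicitly asked for one. */
#define arch_get_mmap_end(addr)	((addr > DEFAULT_MAP_WINDOW) ? \
					TASK_SIZE : DEFAULT_MAP_WINDOW)

#define arch_get_mmap_base(addr, base) ((addr > DEFAULT_MAP_WINDOW) ? \
					base + TASK_SIZE - DEFAULT_MAP_WINDOW : \
					base)

Moving the generic fallbacks out of mm/mmap.c and into this header (see the mm/mmap.c hunk further down) is what lets the hugetlbfs code above use them as well.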
kernel/kcov.c
@@ -475,8 +475,11 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 	vma->vm_flags |= VM_DONTEXPAND;
 	for (off = 0; off < size; off += PAGE_SIZE) {
 		page = vmalloc_to_page(kcov->area + off);
-		if (vm_insert_page(vma, vma->vm_start + off, page))
-			WARN_ONCE(1, "vm_insert_page() failed");
+		res = vm_insert_page(vma, vma->vm_start + off, page);
+		if (res) {
+			pr_warn_once("kcov: vm_insert_page() failed\n");
+			return res;
+		}
 	}
 	return 0;
 exit:
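The kcov_mmap() hunk above makes a failed vm_insert_page() propagate an error to the caller instead of emitting a kernel warning. The mmap() affected is the one a kcov user performs to map the coverage buffer. A condensed version of the usage pattern from Documentation/dev-tools/kcov.rst (error handling trimmed; not part of this commit) shows where that mmap() sits: with the fix it simply fails with an errno instead of triggering WARN_ONCE in the kernel log.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define KCOV_INIT_TRACE	_IOR('c', 1, unsigned long)
#define KCOV_ENABLE	_IO('c', 100)
#define KCOV_DISABLE	_IO('c', 101)
#define KCOV_TRACE_PC	0
#define COVER_SIZE	(64 << 10)

int main(void)
{
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);
	unsigned long *cover, n, i;

	if (fd == -1)
		return perror("open"), 1;
	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
		return perror("KCOV_INIT_TRACE"), 1;
	/* This is the call that lands in kcov_mmap(). */
	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (cover == MAP_FAILED)
		return perror("mmap"), 1;
	if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC))
		return perror("KCOV_ENABLE"), 1;
	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);

	read(-1, NULL, 0);		/* the syscall being traced */

	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
	for (i = 0; i < n; i++)
		printf("0x%lx\n", cover[i + 1]);
	ioctl(fd, KCOV_DISABLE, 0);
	munmap(cover, COVER_SIZE * sizeof(unsigned long));
	close(fd);
	return 0;
}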
mm/hugetlb.c
@@ -6785,6 +6785,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
 	return ret;
 }

+int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+	int ret;
+
+	spin_lock_irq(&hugetlb_lock);
+	ret = __get_huge_page_for_hwpoison(pfn, flags);
+	spin_unlock_irq(&hugetlb_lock);
+	return ret;
+}
+
 void putback_active_hugepage(struct page *page)
 {
 	spin_lock_irq(&hugetlb_lock);
mm/memcontrol.c
@@ -587,6 +587,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
 static DEFINE_SPINLOCK(stats_flush_lock);
 static DEFINE_PER_CPU(unsigned int, stats_updates);
 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+static u64 flush_next_time;
+
+#define FLUSH_TIME (2UL*HZ)

 /*
  * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
@@ -637,6 +640,7 @@ static void __mem_cgroup_flush_stats(void)
 	if (!spin_trylock_irqsave(&stats_flush_lock, flag))
 		return;

+	flush_next_time = jiffies_64 + 2*FLUSH_TIME;
 	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
 	atomic_set(&stats_flush_threshold, 0);
 	spin_unlock_irqrestore(&stats_flush_lock, flag);
@@ -648,10 +652,16 @@ void mem_cgroup_flush_stats(void)
 		__mem_cgroup_flush_stats();
 }

+void mem_cgroup_flush_stats_delayed(void)
+{
+	if (time_after64(jiffies_64, flush_next_time))
+		mem_cgroup_flush_stats();
+}
+
 static void flush_memcg_stats_dwork(struct work_struct *w)
 {
 	__mem_cgroup_flush_stats();
-	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }

 /**
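The mm/memcontrol.c hunks above implement "memcg: sync flush only if periodic flush is delayed": __mem_cgroup_flush_stats() records when the next periodic flush is due (flush_next_time), and the new mem_cgroup_flush_stats_delayed() only forces a synchronous flush once that deadline has passed, so hot readers (such as the workingset refault path changed further down) skip the expensive rstat flush while the stats are still reasonably fresh. The same throttling idea in a stand-alone sketch, with made-up names and wall-clock time instead of jiffies (illustrative only, not kernel API):

#include <stdio.h>
#include <time.h>

#define FLUSH_PERIOD 2			/* mirrors FLUSH_TIME = 2*HZ */

static time_t flush_next_time;

static void flush_stats(void)		/* stands in for the rstat flush */
{
	printf("flushing stats\n");
	flush_next_time = time(NULL) + 2 * FLUSH_PERIOD;
}

/* Latency-sensitive callers: only flush if the periodic worker is late. */
static void flush_stats_delayed(void)
{
	if (time(NULL) > flush_next_time)
		flush_stats();
}

int main(void)
{
	flush_stats();			/* periodic worker just ran */
	flush_stats_delayed();		/* skipped: data is fresh enough */
	return 0;
}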
mm/memory-failure.c
@@ -1498,50 +1498,113 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
 	return 0;
 }

-static int memory_failure_hugetlb(unsigned long pfn, int flags)
+/*
+ * Called from hugetlb code with hugetlb_lock held.
+ *
+ * Return values:
+ *   0 - free hugepage
+ *   1 - in-use hugepage
+ *   2 - not a hugepage
+ *   -EBUSY - the hugepage is busy (try to retry)
+ *   -EHWPOISON - the hugepage is already hwpoisoned
+ */
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
 {
-	struct page *p = pfn_to_page(pfn);
-	struct page *head = compound_head(p);
-	int res;
-	unsigned long page_flags;
+	struct page *page = pfn_to_page(pfn);
+	struct page *head = compound_head(page);
+	int ret = 2;	/* fallback to normal page handling */
+	bool count_increased = false;

+	if (!PageHeadHuge(head))
+		goto out;
+
+	if (flags & MF_COUNT_INCREASED) {
+		ret = 1;
+		count_increased = true;
+	} else if (HPageFreed(head) || HPageMigratable(head)) {
+		ret = get_page_unless_zero(head);
+		if (ret)
+			count_increased = true;
+	} else {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	if (TestSetPageHWPoison(head)) {
-		pr_err("Memory failure: %#lx: already hardware poisoned\n",
-		       pfn);
-		res = -EHWPOISON;
-		if (flags & MF_ACTION_REQUIRED)
-			res = kill_accessing_process(current, page_to_pfn(head), flags);
-		return res;
+		ret = -EHWPOISON;
+		goto out;
 	}

+	return ret;
+out:
+	if (count_increased)
+		put_page(head);
+	return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Taking refcount of hugetlb pages needs extra care about race conditions
+ * with basic operations like hugepage allocation/free/demotion.
+ * So some of prechecks for hwpoison (pinning, and testing/setting
+ * PageHWPoison) should be done in single hugetlb_lock range.
+ */
+static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+	int res;
+	struct page *p = pfn_to_page(pfn);
+	struct page *head;
+	unsigned long page_flags;
+	bool retry = true;
+
+	*hugetlb = 1;
+retry:
+	res = get_huge_page_for_hwpoison(pfn, flags);
+	if (res == 2) { /* fallback to normal page handling */
+		*hugetlb = 0;
+		return 0;
+	} else if (res == -EHWPOISON) {
+		pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+		if (flags & MF_ACTION_REQUIRED) {
+			head = compound_head(p);
+			res = kill_accessing_process(current, page_to_pfn(head), flags);
+		}
+		return res;
+	} else if (res == -EBUSY) {
+		if (retry) {
+			retry = false;
+			goto retry;
+		}
+		action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+		return res;
+	}
+
+	head = compound_head(p);
+	lock_page(head);
+
+	if (hwpoison_filter(p)) {
+		ClearPageHWPoison(head);
+		res = -EOPNOTSUPP;
+		goto out;
+	}
+
 	num_poisoned_pages_inc();

-	if (!(flags & MF_COUNT_INCREASED)) {
-		res = get_hwpoison_page(p, flags);
-		if (!res) {
-			lock_page(head);
-			if (hwpoison_filter(p)) {
-				if (TestClearPageHWPoison(head))
-					num_poisoned_pages_dec();
-				unlock_page(head);
-				return -EOPNOTSUPP;
-			}
-			unlock_page(head);
-			res = MF_FAILED;
-			if (__page_handle_poison(p)) {
-				page_ref_inc(p);
-				res = MF_RECOVERED;
-			}
-			action_result(pfn, MF_MSG_FREE_HUGE, res);
-			return res == MF_RECOVERED ? 0 : -EBUSY;
-		} else if (res < 0) {
-			action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
-			return -EBUSY;
+	/*
+	 * Handling free hugepage.  The possible race with hugepage allocation
+	 * or demotion can be prevented by PageHWPoison flag.
+	 */
+	if (res == 0) {
+		unlock_page(head);
+		res = MF_FAILED;
+		if (__page_handle_poison(p)) {
+			page_ref_inc(p);
+			res = MF_RECOVERED;
 		}
+		action_result(pfn, MF_MSG_FREE_HUGE, res);
+		return res == MF_RECOVERED ? 0 : -EBUSY;
 	}

-	lock_page(head);
-
 	/*
 	 * The page could have changed compound pages due to race window.
 	 * If this happens just bail out.
@@ -1554,14 +1617,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)

 	page_flags = head->flags;

-	if (hwpoison_filter(p)) {
-		if (TestClearPageHWPoison(head))
-			num_poisoned_pages_dec();
-		put_page(p);
-		res = -EOPNOTSUPP;
-		goto out;
-	}
-
 	/*
 	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
 	 * simply disable it. In order to make it work properly, we need
@@ -1588,6 +1643,12 @@ out:
 	unlock_page(head);
 	return res;
 }
+#else
+static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+	return 0;
+}
+#endif

 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		struct dev_pagemap *pgmap)
@@ -1712,6 +1773,7 @@ int memory_failure(unsigned long pfn, int flags)
 	int res = 0;
 	unsigned long page_flags;
 	bool retry = true;
+	int hugetlb = 0;

 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure on page %lx", pfn);
@@ -1739,10 +1801,9 @@ int memory_failure(unsigned long pfn, int flags)
 	}

 try_again:
-	if (PageHuge(p)) {
-		res = memory_failure_hugetlb(pfn, flags);
+	res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
+	if (hugetlb)
 		goto unlock_mutex;
-	}

 	if (TestSetPageHWPoison(p)) {
 		pr_err("Memory failure: %#lx: already hardware poisoned\n",
@@ -1799,6 +1860,19 @@ try_again:
 	}

 	if (PageTransHuge(hpage)) {
+		/*
+		 * Bail out before SetPageHasHWPoisoned() if hpage is
+		 * huge_zero_page, although PG_has_hwpoisoned is not
+		 * checked in set_huge_zero_page().
+		 *
+		 * TODO: Handle memory failure of huge_zero_page thoroughly.
+		 */
+		if (is_huge_zero_page(hpage)) {
+			action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
+			res = -EBUSY;
+			goto unlock_mutex;
+		}
+
 		/*
 		 * The flag must be set after the refcount is bumped
 		 * otherwise it may race with THP split.
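The mm/memory-failure.c rework above ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()") moves the hwpoison prechecks for hugetlb pages (the refcount grab and the PageHWPoison test-and-set) under hugetlb_lock via __get_huge_page_for_hwpoison(), and memory_failure() now always goes through try_memory_failure_hugetlb(). One common way to exercise this path from userspace is software error injection with madvise(MADV_HWPOISON); a hedged sketch follows (not part of the commit; it needs CAP_SYS_ADMIN, a kernel with CONFIG_MEMORY_FAILURE, and a reserved hugetlb pool, and a later access to the poisoned page would normally get SIGBUS):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100		/* value from the uapi mman-common.h */
#endif

int main(void)
{
	size_t len = 2UL << 20;		/* one 2 MiB hugepage */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED)
		return perror("mmap(MAP_HUGETLB)"), 1;
	memset(p, 0, len);		/* fault the hugepage in */

	/* Injects a memory failure on the backing page; the kernel path is
	 * memory_failure() -> try_memory_failure_hugetlb(). */
	if (madvise(p, getpagesize(), MADV_HWPOISON))
		perror("madvise(MADV_HWPOISON)");

	munmap(p, len);
	return 0;
}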
mm/mmap.c
@@ -2117,14 +2117,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
 	return addr;
 }

-#ifndef arch_get_mmap_end
-#define arch_get_mmap_end(addr) (TASK_SIZE)
-#endif
-
-#ifndef arch_get_mmap_base
-#define arch_get_mmap_base(addr, base) (base)
-#endif
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
mm/mmu_notifier.c
@@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked(
 }
 EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);

+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+			  unsigned long seq)
+{
+	bool ret;
+
+	spin_lock(&subscriptions->lock);
+	ret = subscriptions->invalidate_seq != seq;
+	spin_unlock(&subscriptions->lock);
+	return ret;
+}
+
 /**
  * mmu_interval_notifier_remove - Remove a interval notifier
  * @interval_sub: Interval subscription to unregister
@@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 	if (seq)
 		wait_event(subscriptions->wq,
-			   READ_ONCE(subscriptions->invalidate_seq) != seq);
+			   mmu_interval_seq_released(subscriptions, seq));

 	/* pairs with mmgrab in mmu_interval_notifier_insert() */
 	mmdrop(mm);
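The mm/mmu_notifier.c change above fixes a race in mmu_interval_notifier_remove(): the wait_event() condition is now evaluated by mmu_interval_seq_released(), which samples invalidate_seq under subscriptions->lock (the same lock the invalidation path holds when it updates the sequence number and wakes the queue) instead of with a bare READ_ONCE(). The general pattern, always evaluate a wait predicate under the lock the updater uses, has a familiar userspace analog in condition variables. A small illustrative sketch follows (an analogy only, not the kernel mechanism; build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
static unsigned long invalidate_seq;

/* Waiter: the predicate is only checked while holding 'lock', so the
 * check cannot race with the updater's bump-and-wake below. */
static void *waiter(void *arg)
{
	unsigned long seq = *(unsigned long *)arg;

	pthread_mutex_lock(&lock);
	while (invalidate_seq == seq)
		pthread_cond_wait(&wq, &lock);
	pthread_mutex_unlock(&lock);
	printf("sequence released\n");
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned long seq = invalidate_seq;

	pthread_create(&t, NULL, waiter, &seq);

	/* Updater: modify the shared state and wake under the same lock. */
	pthread_mutex_lock(&lock);
	invalidate_seq++;
	pthread_cond_broadcast(&wq);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}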
mm/oom_kill.c
@@ -632,7 +632,7 @@ done:
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);

-	/* Drop a reference taken by wake_oom_reaper */
+	/* Drop a reference taken by queue_oom_reaper */
 	put_task_struct(tsk);
 }

@@ -644,12 +644,12 @@ static int oom_reaper(void *unused)
 		struct task_struct *tsk = NULL;

 		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-		spin_lock(&oom_reaper_lock);
+		spin_lock_irq(&oom_reaper_lock);
 		if (oom_reaper_list != NULL) {
 			tsk = oom_reaper_list;
 			oom_reaper_list = tsk->oom_reaper_list;
 		}
-		spin_unlock(&oom_reaper_lock);
+		spin_unlock_irq(&oom_reaper_lock);

 		if (tsk)
 			oom_reap_task(tsk);
@@ -658,20 +658,46 @@ static int oom_reaper(void *unused)
 	return 0;
 }

-static void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct timer_list *timer)
 {
+	struct task_struct *tsk = container_of(timer, struct task_struct,
+			oom_reaper_timer);
+	struct mm_struct *mm = tsk->signal->oom_mm;
+	unsigned long flags;
+
+	/* The victim managed to terminate on its own - see exit_mmap */
+	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+		put_task_struct(tsk);
+		return;
+	}
+
+	spin_lock_irqsave(&oom_reaper_lock, flags);
+	tsk->oom_reaper_list = oom_reaper_list;
+	oom_reaper_list = tsk;
+	spin_unlock_irqrestore(&oom_reaper_lock, flags);
+	trace_wake_reaper(tsk->pid);
+	wake_up(&oom_reaper_wait);
+}
+
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
+ * The timers timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too small, the oom_reaper can
+ * get in the way and release resources needed by the process exit path.
+ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
+{
 	/* mm is already queued? */
 	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
 		return;

 	get_task_struct(tsk);
-
-	spin_lock(&oom_reaper_lock);
-	tsk->oom_reaper_list = oom_reaper_list;
-	oom_reaper_list = tsk;
-	spin_unlock(&oom_reaper_lock);
-	trace_wake_reaper(tsk->pid);
-	wake_up(&oom_reaper_wait);
+	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+	add_timer(&tsk->oom_reaper_timer);
 }

 static int __init oom_init(void)
@@ -681,7 +707,7 @@ static int __init oom_init(void)
 }
 subsys_initcall(oom_init)
 #else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk)
 {
 }
 #endif /* CONFIG_MMU */
@@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 	rcu_read_unlock();

 	if (can_oom_reap)
-		wake_oom_reaper(victim);
+		queue_oom_reaper(victim);

 	mmdrop(mm);
 	put_task_struct(victim);
@@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	task_lock(victim);
 	if (task_will_free_mem(victim)) {
 		mark_oom_victim(victim);
-		wake_oom_reaper(victim);
+		queue_oom_reaper(victim);
 		task_unlock(victim);
 		put_task_struct(victim);
 		return;
@@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc)
 	 */
 	if (task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		wake_oom_reaper(current);
+		queue_oom_reaper(current);
 		return true;
 	}

mm/userfaultfd.c
@@ -72,12 +72,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 	_dst_pte = pte_mkdirty(_dst_pte);
 	if (page_in_cache && !vm_shared)
 		writable = false;
-	if (writable) {
-		if (wp_copy)
-			_dst_pte = pte_mkuffd_wp(_dst_pte);
-		else
-			_dst_pte = pte_mkwrite(_dst_pte);
-	}
+
+	/*
+	 * Always mark a PTE as write-protected when needed, regardless of
+	 * VM_WRITE, which the user might change.
+	 */
+	if (wp_copy)
+		_dst_pte = pte_mkuffd_wp(_dst_pte);
+	else if (writable)
+		_dst_pte = pte_mkwrite(_dst_pte);

 	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);

mm/workingset.c
@@ -355,7 +355,7 @@ void workingset_refault(struct folio *folio, void *shadow)

 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);

-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats_delayed();
 	/*
 	 * Compare the distance to the existing workingset size. We
 	 * don't activate pages that couldn't stay resident even if
tools/testing/selftests/vm/mremap_test.c
@@ -6,9 +6,11 @@

 #include <errno.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include <string.h>
 #include <sys/mman.h>
 #include <time.h>
+#include <stdbool.h>

 #include "../kselftest.h"

@@ -63,6 +65,59 @@ enum {
 	.expect_failure = should_fail				\
 }

+/*
+ * Returns false if the requested remap region overlaps with an
+ * existing mapping (e.g text, stack) else returns true.
+ */
+static bool is_remap_region_valid(void *addr, unsigned long long size)
+{
+	void *remap_addr = NULL;
+	bool ret = true;
+
+	/* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */
+	remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE,
+					  MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+					  -1, 0);
+
+	if (remap_addr == MAP_FAILED) {
+		if (errno == EEXIST)
+			ret = false;
+	} else {
+		munmap(remap_addr, size);
+	}
+
+	return ret;
+}
+
+/* Returns mmap_min_addr sysctl tunable from procfs */
+static unsigned long long get_mmap_min_addr(void)
+{
+	FILE *fp;
+	int n_matched;
+	static unsigned long long addr;
+
+	if (addr)
+		return addr;
+
+	fp = fopen("/proc/sys/vm/mmap_min_addr", "r");
+	if (fp == NULL) {
+		ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n",
+			strerror(errno));
+		exit(KSFT_SKIP);
+	}
+
+	n_matched = fscanf(fp, "%llu", &addr);
+	if (n_matched != 1) {
+		ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n",
+			strerror(errno));
+		fclose(fp);
+		exit(KSFT_SKIP);
+	}
+
+	fclose(fp);
+	return addr;
+}
+
 /*
  * Returns the start address of the mapping on success, else returns
  * NULL on failure.
@@ -71,11 +126,18 @@ static void *get_source_mapping(struct config c)
 {
 	unsigned long long addr = 0ULL;
 	void *src_addr = NULL;
+	unsigned long long mmap_min_addr;
+
+	mmap_min_addr = get_mmap_min_addr();
+
 retry:
 	addr += c.src_alignment;
+	if (addr < mmap_min_addr)
+		goto retry;
+
 	src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
 			MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
 			-1, 0);
 	if (src_addr == MAP_FAILED) {
 		if (errno == EPERM || errno == EEXIST)
 			goto retry;
@@ -90,8 +152,10 @@ retry:
 	 * alignment in the tests.
 	 */
 	if (((unsigned long long) src_addr & (c.src_alignment - 1)) ||
-	    !((unsigned long long) src_addr & c.src_alignment))
+	    !((unsigned long long) src_addr & c.src_alignment)) {
+		munmap(src_addr, c.region_size);
 		goto retry;
+	}

 	if (!src_addr)
 		goto error;
@@ -140,9 +204,20 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
 	if (!((unsigned long long) addr & c.dest_alignment))
 		addr = (void *) ((unsigned long long) addr | c.dest_alignment);

+	/* Don't destroy existing mappings unless expected to overlap */
+	while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
+		/* Check for unsigned overflow */
+		if (addr + c.dest_alignment < addr) {
+			ksft_print_msg("Couldn't find a valid region to remap to\n");
+			ret = -1;
+			goto out;
+		}
+		addr += c.dest_alignment;
+	}
+
 	clock_gettime(CLOCK_MONOTONIC, &t_start);
 	dest_addr = mremap(src_addr, c.region_size, c.region_size,
 			MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
 	clock_gettime(CLOCK_MONOTONIC, &t_end);

 	if (dest_addr == MAP_FAILED) {
@@ -193,7 +268,7 @@ static void run_mremap_test_case(struct test test_case, int *failures,

 	if (remap_time < 0) {
 		if (test_case.expect_failure)
-			ksft_test_result_pass("%s\n\tExpected mremap failure\n",
+			ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
 					      test_case.name);
 		else {
 			ksft_test_result_fail("%s\n", test_case.name);
tools/testing/selftests/vm/run_vmtests.sh
@@ -291,11 +291,16 @@ echo "-------------------"
 echo "running mremap_test"
 echo "-------------------"
 ./mremap_test
-if [ $? -ne 0 ]; then
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+	echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+	echo "[SKIP]"
+	exitcode=$ksft_skip
+else
 	echo "[FAIL]"
 	exitcode=1
-else
-	echo "[PASS]"
 fi

 echo "-----------------"