Mirror of https://github.com/torvalds/linux.git (synced 2024-11-21 19:41:42 +00:00)
Merge tag 'mm-hotfixes-stable-2024-11-09-22-40' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "20 hotfixes, 14 of which are cc:stable. Three affect DAMON. Lorenzo's
  five-patch series to address the mmap_region error handling is here
  also. Apart from that, various singletons"

* tag 'mm-hotfixes-stable-2024-11-09-22-40' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  mailmap: add entry for Thorsten Blum
  ocfs2: remove entry once instead of null-ptr-dereference in ocfs2_xa_remove()
  signal: restore the override_rlimit logic
  fs/proc: fix compile warning about variable 'vmcore_mmap_ops'
  ucounts: fix counter leak in inc_rlimit_get_ucounts()
  selftests: hugetlb_dio: check for initial conditions to skip in the start
  mm: fix docs for the kernel parameter ``thp_anon=``
  mm/damon/core: avoid overflow in damon_feed_loop_next_input()
  mm/damon/core: handle zero schemes apply interval
  mm/damon/core: handle zero {aggregation,ops_update} intervals
  mm/mlock: set the correct prev on failure
  objpool: fix to make percpu slot allocation more robust
  mm/page_alloc: keep track of free highatomic
  mm: resolve faulty mmap_region() error path behaviour
  mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling
  mm: refactor map_deny_write_exec()
  mm: unconditionally close VMAs on error
  mm: avoid unsafe VMA hook invocation when error arises on mmap hook
  mm/thp: fix deferred split unqueue naming and locking
  mm/thp: fix deferred split queue not partially_mapped
commit 28e43197c4

.mailmap (1 changed line)
@@ -665,6 +665,7 @@ Tomeu Vizoso <tomeu@tomeuvizoso.net> <tomeu.vizoso@collabora.com>
 Thomas Graf <tgraf@suug.ch>
 Thomas Körper <socketcan@esd.eu> <thomas.koerper@esd.eu>
 Thomas Pedersen <twp@codeaurora.org>
+Thorsten Blum <thorsten.blum@linux.dev> <thorsten.blum@toblux.com>
 Tiezhu Yang <yangtiezhu@loongson.cn> <kernelpatch@126.com>
 Tingwei Zhang <quic_tingwei@quicinc.com> <tingwei@codeaurora.org>
 Tirupathi Reddy <quic_tirupath@quicinc.com> <tirupath@codeaurora.org>
@@ -6688,7 +6688,7 @@
                         0: no polling (default)
 
         thp_anon=       [KNL]
-                        Format: <size>,<size>[KMG]:<state>;<size>-<size>[KMG]:<state>
+                        Format: <size>[KMG],<size>[KMG]:<state>;<size>[KMG]-<size>[KMG]:<state>
                         state is one of "always", "madvise", "never" or "inherit".
                         Control the default behavior of the system with respect
                         to anonymous transparent hugepages.

@@ -303,7 +303,7 @@ control by passing the parameter ``transparent_hugepage=always`` or
 kernel command line.
 
 Alternatively, each supported anonymous THP size can be controlled by
-passing ``thp_anon=<size>,<size>[KMG]:<state>;<size>-<size>[KMG]:<state>``,
+passing ``thp_anon=<size>[KMG],<size>[KMG]:<state>;<size>[KMG]-<size>[KMG]:<state>``,
 where ``<size>`` is the THP size (must be a power of 2 of PAGE_SIZE and
 supported anonymous THP) and ``<state>`` is one of ``always``, ``madvise``,
 ``never`` or ``inherit``.
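For illustration only (this example is not part of the patch, but it follows the corrected format): a boot command line such as ``thp_anon=16K-64K:always;128K,512K:inherit;1M-2M:never`` maps dash-delimited size ranges and comma-separated individual sizes to a state, with every size carrying its own K/M/G suffix.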
@@ -6,6 +6,8 @@
 #ifndef BUILD_VDSO
 #include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/shmem_fs.h>
 #include <linux/types.h>
 
 static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,

@@ -31,19 +33,21 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
 }
 #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
+                                                   unsigned long flags)
 {
         /*
          * Only allow MTE on anonymous mappings as these are guaranteed to be
          * backed by tags-capable memory. The vm_flags may be overridden by a
          * filesystem supporting MTE (RAM-based).
          */
-        if (system_supports_mte() && (flags & MAP_ANONYMOUS))
+        if (system_supports_mte() &&
+            ((flags & MAP_ANONYMOUS) || shmem_file(file)))
                 return VM_MTE_ALLOWED;
 
         return 0;
 }
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
 
 static inline bool arch_validate_prot(unsigned long prot,
         unsigned long addr __always_unused)
@@ -2,6 +2,7 @@
 #ifndef __ASM_MMAN_H__
 #define __ASM_MMAN_H__
 
+#include <linux/fs.h>
 #include <uapi/asm/mman.h>
 
 /* PARISC cannot allow mdwe as it needs writable stacks */

@@ -11,7 +12,7 @@ static inline bool arch_memory_deny_write_exec_supported(void)
 }
 #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
 
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file, unsigned long flags)
 {
         /*
          * The stack on parisc grows upwards, so if userspace requests memory

@@ -23,6 +24,6 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
 
         return 0;
 }
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
 
 #endif /* __ASM_MMAN_H__ */
@@ -2036,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
                         rc = 0;
                         ocfs2_xa_cleanup_value_truncate(loc, "removing",
                                                         orig_clusters);
-                        if (rc)
-                                goto out;
+                        goto out;
                 }
         }
 
@@ -457,10 +457,6 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
 #endif
 }
 
-static const struct vm_operations_struct vmcore_mmap_ops = {
-        .fault = mmap_vmcore_fault,
-};
-
 /**
  * vmcore_alloc_buf - allocate buffer in vmalloc memory
  * @size: size of buffer

@@ -488,6 +484,11 @@ static inline char *vmcore_alloc_buf(size_t size)
  * virtually contiguous user-space in ELF layout.
  */
 #ifdef CONFIG_MMU
+
+static const struct vm_operations_struct vmcore_mmap_ops = {
+        .fault = mmap_vmcore_fault,
+};
+
 /*
  * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
  * reported as not being ram with the zero page.
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MMAN_H
 #define _LINUX_MMAN_H
 
+#include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/percpu_counter.h>
 

@@ -94,7 +95,7 @@ static inline void vm_unacct_memory(long pages)
 #endif
 
 #ifndef arch_calc_vm_flag_bits
-#define arch_calc_vm_flag_bits(flags) 0
+#define arch_calc_vm_flag_bits(file, flags) 0
 #endif
 
 #ifndef arch_validate_prot

@@ -151,13 +152,13 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
  * Combine the mmap "flags" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_flag_bits(unsigned long flags)
+calc_vm_flag_bits(struct file *file, unsigned long flags)
 {
         return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
                _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
                _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      ) |
                _calc_vm_trans(flags, MAP_STACK,      VM_NOHUGEPAGE) |
-               arch_calc_vm_flag_bits(flags);
+               arch_calc_vm_flag_bits(file, flags);
 }
 
 unsigned long vm_commit_limit(void);

@@ -188,16 +189,31 @@ static inline bool arch_memory_deny_write_exec_supported(void)
  *
  *      d)      mmap(PROT_READ | PROT_EXEC)
  *              mmap(PROT_READ | PROT_EXEC | PROT_BTI)
+ *
+ * This is only applicable if the user has set the Memory-Deny-Write-Execute
+ * (MDWE) protection mask for the current process.
+ *
+ * @old specifies the VMA flags the VMA originally possessed, and @new the ones
+ * we propose to set.
+ *
+ * Return: false if proposed change is OK, true if not ok and should be denied.
  */
-static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
 {
+        /* If MDWE is disabled, we have nothing to deny. */
         if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
                 return false;
 
-        if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+        /* If the new VMA is not executable, we have nothing to deny. */
+        if (!(new & VM_EXEC))
+                return false;
+
+        /* Under MDWE we do not accept newly writably executable VMAs... */
+        if (new & VM_WRITE)
                 return true;
 
-        if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+        /* ...nor previously non-executable VMAs becoming executable. */
+        if (!(old & VM_EXEC))
                 return true;
 
         return false;
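map_deny_write_exec() is now a pure predicate over the old and new VMA flag sets. A minimal userspace sketch of that decision table (hypothetical flag values and helper name, not the kernel code) reproduces the a)-d) cases listed in the comment:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in flag bits for illustration; the real VM_* values live in the
 * kernel headers. */
#define VM_WRITE 0x2UL
#define VM_EXEC  0x4UL

/* Same decision logic as the refactored helper, minus the MMF_HAS_MDWE
 * check: deny newly writable+executable mappings, and deny making a
 * previously non-executable mapping executable. */
static bool mdwe_denies(unsigned long old, unsigned long new)
{
        if (!(new & VM_EXEC))
                return false;
        if (new & VM_WRITE)
                return true;
        return !(old & VM_EXEC);
}

int main(void)
{
        printf("a) %d\n", mdwe_denies(0, VM_WRITE | VM_EXEC));        /* 1: denied  */
        printf("b) %d\n", mdwe_denies(VM_WRITE, VM_WRITE | VM_EXEC)); /* 1: denied  */
        printf("c) %d\n", mdwe_denies(0, VM_EXEC));                   /* 1: denied  */
        printf("d) %d\n", mdwe_denies(VM_EXEC, VM_EXEC));             /* 0: allowed */
        return 0;
}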
@@ -823,6 +823,7 @@ struct zone {
         unsigned long watermark_boost;
 
         unsigned long nr_reserved_highatomic;
+        unsigned long nr_free_highatomic;
 
         /*
          * We don't know if the memory that we're going to allocate will be
@@ -141,7 +141,8 @@ static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type ty
 
 long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
 bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
-long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type);
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
+                            bool override_rlimit);
 void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
 bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);
 
@@ -419,7 +419,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
          */
         rcu_read_lock();
         ucounts = task_ucounts(t);
-        sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
+        sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
+                                            override_rlimit);
         rcu_read_unlock();
         if (!sigpending)
                 return NULL;
@@ -307,7 +307,8 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
         do_dec_rlimit_put_ucounts(ucounts, NULL, type);
 }
 
-long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
+                            bool override_rlimit)
 {
         /* Caller must hold a reference to ucounts */
         struct ucounts *iter;

@@ -317,10 +318,11 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
         for (iter = ucounts; iter; iter = iter->ns->ucounts) {
                 long new = atomic_long_add_return(1, &iter->rlimit[type]);
                 if (new < 0 || new > max)
-                        goto unwind;
+                        goto dec_unwind;
                 if (iter == ucounts)
                         ret = new;
-                max = get_userns_rlimit_max(iter->ns, type);
+                if (!override_rlimit)
+                        max = get_userns_rlimit_max(iter->ns, type);
                 /*
                  * Grab an extra ucount reference for the caller when
                  * the rlimit count was previously 0.

@@ -334,7 +336,6 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
 dec_unwind:
         dec = atomic_long_sub_return(1, &iter->rlimit[type]);
         WARN_ON_ONCE(dec < 0);
-unwind:
         do_dec_rlimit_put_ucounts(ucounts, iter, type);
         return 0;
 }
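The leak fixed above came from the overflow path jumping to the label that skipped decrementing the counter it had just incremented. A small userspace sketch of the unwind rule the patch restores (hypothetical names and limits, not the kernel's ucounts code):

#include <stdio.h>

#define LEVELS 3

static long counter[LEVELS];
static long limit[LEVELS] = { 10, 10, 10 };

/* Increment every level of a counter chain; on overflow, roll back every
 * level already touched, including the one that just exceeded its limit. */
static int inc_chain(void)
{
        int i;

        for (i = 0; i < LEVELS; i++) {
                if (++counter[i] > limit[i])
                        goto unwind;
        }
        return 0;

unwind:
        /* The buggy variant started the rollback at i - 1, leaking one count. */
        for (; i >= 0; i--)
                counter[i]--;
        return -1;
}

int main(void)
{
        counter[1] = 10;        /* force an overflow at level 1 */
        inc_chain();
        printf("level0=%ld level1=%ld\n", counter[0], counter[1]); /* 0 and 10 */
        return 0;
}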
@@ -74,15 +74,21 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
          * warm caches and TLB hits. in default vmalloc is used to
          * reduce the pressure of kernel slab system. as we know,
          * mimimal size of vmalloc is one page since vmalloc would
-         * always align the requested size to page size
+         * always align the requested size to page size.
+         * but if vmalloc fails or it is not available (e.g. GFP_ATOMIC)
+         * allocate percpu slot with kmalloc.
          */
-        if ((pool->gfp & GFP_ATOMIC) == GFP_ATOMIC)
-                slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
-        else
+        slot = NULL;
+
+        if ((pool->gfp & (GFP_ATOMIC | GFP_KERNEL)) != GFP_ATOMIC)
                 slot = __vmalloc_node(size, sizeof(void *), pool->gfp,
                         cpu_to_node(i), __builtin_return_address(0));
-        if (!slot)
-                return -ENOMEM;
+
+        if (!slot) {
+                slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
+                if (!slot)
+                        return -ENOMEM;
+        }
         memset(slot, 0, size);
         pool->cpu_slots[i] = slot;
 
@@ -1412,7 +1412,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
         damon_for_each_scheme(s, c) {
                 struct damos_quota *quota = &s->quota;
 
-                if (c->passed_sample_intervals != s->next_apply_sis)
+                if (c->passed_sample_intervals < s->next_apply_sis)
                         continue;
 
                 if (!s->wmarks.activated)

@@ -1456,17 +1456,31 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input,
                 unsigned long score)
 {
         const unsigned long goal = 10000;
-        unsigned long score_goal_diff = max(goal, score) - min(goal, score);
-        unsigned long score_goal_diff_bp = score_goal_diff * 10000 / goal;
-        unsigned long compensation = last_input * score_goal_diff_bp / 10000;
         /* Set minimum input as 10000 to avoid compensation be zero */
         const unsigned long min_input = 10000;
+        unsigned long score_goal_diff, compensation;
+        bool over_achieving = score > goal;
 
-        if (goal > score)
+        if (score == goal)
+                return last_input;
+        if (score >= goal * 2)
+                return min_input;
+
+        if (over_achieving)
+                score_goal_diff = score - goal;
+        else
+                score_goal_diff = goal - score;
+
+        if (last_input < ULONG_MAX / score_goal_diff)
+                compensation = last_input * score_goal_diff / goal;
+        else
+                compensation = last_input / goal * score_goal_diff;
+
+        if (over_achieving)
+                return max(last_input - compensation, min_input);
+        if (last_input < ULONG_MAX - compensation)
                 return last_input + compensation;
-        if (last_input > compensation + min_input)
-                return last_input - compensation;
-        return min_input;
+        return ULONG_MAX;
 }
 
 #ifdef CONFIG_PSI

@@ -1622,7 +1636,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
         bool has_schemes_to_apply = false;
 
         damon_for_each_scheme(s, c) {
-                if (c->passed_sample_intervals != s->next_apply_sis)
+                if (c->passed_sample_intervals < s->next_apply_sis)
                         continue;
 
                 if (!s->wmarks.activated)

@@ -1642,9 +1656,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
         }
 
         damon_for_each_scheme(s, c) {
-                if (c->passed_sample_intervals != s->next_apply_sis)
+                if (c->passed_sample_intervals < s->next_apply_sis)
                         continue;
-                s->next_apply_sis +=
+                s->next_apply_sis = c->passed_sample_intervals +
                         (s->apply_interval_us ? s->apply_interval_us :
                          c->attrs.aggr_interval) / sample_interval;
         }

@@ -2000,7 +2014,7 @@ static int kdamond_fn(void *data)
                 if (ctx->ops.check_accesses)
                         max_nr_accesses = ctx->ops.check_accesses(ctx);
 
-                if (ctx->passed_sample_intervals == next_aggregation_sis) {
+                if (ctx->passed_sample_intervals >= next_aggregation_sis) {
                         kdamond_merge_regions(ctx,
                                         max_nr_accesses / 10,
                                         sz_limit);

@@ -2018,7 +2032,7 @@ static int kdamond_fn(void *data)
 
                 sample_interval = ctx->attrs.sample_interval ?
                         ctx->attrs.sample_interval : 1;
-                if (ctx->passed_sample_intervals == next_aggregation_sis) {
+                if (ctx->passed_sample_intervals >= next_aggregation_sis) {
                         ctx->next_aggregation_sis = next_aggregation_sis +
                                 ctx->attrs.aggr_interval / sample_interval;
 

@@ -2028,7 +2042,7 @@ static int kdamond_fn(void *data)
                         ctx->ops.reset_aggregated(ctx);
                 }
 
-                if (ctx->passed_sample_intervals == next_ops_update_sis) {
+                if (ctx->passed_sample_intervals >= next_ops_update_sis) {
                         ctx->next_ops_update_sis = next_ops_update_sis +
                                 ctx->attrs.ops_update_interval /
                                 sample_interval;
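The heart of the overflow fix is picking the order of the multiply and the divide from the operand sizes. A standalone sketch of that idea (illustrative values; only the 10000 goal and the ULONG_MAX guard come from the patch):

#include <limits.h>
#include <stdio.h>

/* Scale last_input by score_goal_diff / goal without overflowing:
 * multiply first when that is safe (more precise), divide first otherwise. */
static unsigned long scale(unsigned long last_input,
                           unsigned long score_goal_diff,
                           unsigned long goal)
{
        if (last_input < ULONG_MAX / score_goal_diff)
                return last_input * score_goal_diff / goal;
        return last_input / goal * score_goal_diff;
}

int main(void)
{
        unsigned long huge = ULONG_MAX / 3;   /* would wrap if multiplied first */

        printf("%lu\n", scale(100000, 2500, 10000)); /* 25000 */
        printf("%lu\n", scale(huge, 2500, 10000));   /* about huge / 4, no wraparound */
        return 0;
}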
@@ -3588,10 +3588,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
         return split_huge_page_to_list_to_order(&folio->page, list, ret);
 }
 
-void __folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
 {
         struct deferred_split *ds_queue;
         unsigned long flags;
+        bool unqueued = false;
+
+        WARN_ON_ONCE(folio_ref_count(folio));
+        WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
 
         ds_queue = get_deferred_split_queue(folio);
         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);

@@ -3603,8 +3620,11 @@ void __folio_undo_large_rmappable(struct folio *folio)
                         MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
                 }
                 list_del_init(&folio->_deferred_list);
+                unqueued = true;
         }
         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+        return unqueued;        /* useful for debug warnings */
 }
 
 /* partially_mapped=false won't clear PG_partially_mapped folio flag */

@@ -3627,14 +3647,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
                 return;
 
         /*
-         * The try_to_unmap() in page reclaim path might reach here too,
-         * this may cause a race condition to corrupt deferred split queue.
-         * And, if page reclaim is already handling the same folio, it is
-         * unnecessary to handle it again in shrinker.
-         *
-         * Check the swapcache flag to determine if the folio is being
-         * handled by page reclaim since THP swap would add the folio into
-         * swap cache before calling try_to_unmap().
+         * Exclude swapcache: originally to avoid a corrupt deferred split
+         * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+         * but if page reclaim is already handling the same folio, it is
+         * unnecessary to handle it again in the shrinker, so excluding
+         * swapcache here may still be a useful optimization.
          */
         if (folio_test_swapcache(folio))
                 return;

@@ -3718,8 +3735,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
         unsigned long flags;
         LIST_HEAD(list);
-        struct folio *folio, *next;
-        int split = 0;
+        struct folio *folio, *next, *prev = NULL;
+        int split = 0, removed = 0;
 
 #ifdef CONFIG_MEMCG
         if (sc->memcg)

@@ -3775,15 +3792,28 @@ next:
                  */
                 if (!did_split && !folio_test_partially_mapped(folio)) {
                         list_del_init(&folio->_deferred_list);
-                        ds_queue->split_queue_len--;
+                        removed++;
+                } else {
+                        /*
+                         * That unlocked list_del_init() above would be unsafe,
+                         * unless its folio is separated from any earlier folios
+                         * left on the list (which may be concurrently unqueued)
+                         * by one safe folio with refcount still raised.
+                         */
+                        swap(folio, prev);
                 }
-                folio_put(folio);
+                if (folio)
+                        folio_put(folio);
         }
 
         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
         list_splice_tail(&list, &ds_queue->split_queue);
+        ds_queue->split_queue_len -= removed;
         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
+        if (prev)
+                folio_put(prev);
+
         /*
          * Stop shrinker if we didn't split any page, but the queue is empty.
          * This can happen if pages were freed under us.
@@ -108,6 +108,51 @@ static inline void *folio_raw_mapping(const struct folio *folio)
         return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
 }
 
+/*
+ * This is a file-backed mapping, and is about to be memory mapped - invoke its
+ * mmap hook and safely handle error conditions. On error, VMA hooks will be
+ * mutated.
+ *
+ * @file: File which backs the mapping.
+ * @vma:  VMA which we are mapping.
+ *
+ * Returns: 0 if success, error otherwise.
+ */
+static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
+{
+        int err = call_mmap(file, vma);
+
+        if (likely(!err))
+                return 0;
+
+        /*
+         * OK, we tried to call the file hook for mmap(), but an error
+         * arose. The mapping is in an inconsistent state and we most not invoke
+         * any further hooks on it.
+         */
+        vma->vm_ops = &vma_dummy_vm_ops;
+
+        return err;
+}
+
+/*
+ * If the VMA has a close hook then close it, and since closing it might leave
+ * it in an inconsistent state which makes the use of any hooks suspect, clear
+ * them down by installing dummy empty hooks.
+ */
+static inline void vma_close(struct vm_area_struct *vma)
+{
+        if (vma->vm_ops && vma->vm_ops->close) {
+                vma->vm_ops->close(vma);
+
+                /*
+                 * The mapping is in an inconsistent state, and no further hooks
+                 * may be invoked upon it.
+                 */
+                vma->vm_ops = &vma_dummy_vm_ops;
+        }
+}
+
 #ifdef CONFIG_MMU
 
 /* Flags for folio_pte_batch(). */

@@ -639,11 +684,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
 #endif
 }
 
-void __folio_undo_large_rmappable(struct folio *folio);
-static inline void folio_undo_large_rmappable(struct folio *folio)
+bool __folio_unqueue_deferred_split(struct folio *folio);
+static inline bool folio_unqueue_deferred_split(struct folio *folio)
 {
         if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
-                return;
+                return false;
 
         /*
          * At this point, there is no one trying to add the folio to

@@ -651,9 +696,9 @@ static inline void folio_undo_large_rmappable(struct folio *folio)
          * to check without acquiring the split_queue_lock.
          */
         if (data_race(list_empty(&folio->_deferred_list)))
-                return;
+                return false;
 
-        __folio_undo_large_rmappable(folio);
+        return __folio_unqueue_deferred_split(folio);
 }
 
 static inline struct folio *page_rmappable_folio(struct page *page)
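mmap_file() and vma_close() above both neutralise a VMA's hooks once invoking them again would be unsafe. A minimal userspace sketch of that pattern (hypothetical types and names, not the kernel API):

#include <stdio.h>

struct ops {
        int  (*mmap)(void *obj);
        void (*close)(void *obj);
};

struct mapping {
        const struct ops *ops;
};

/* Hooks that do nothing: installed once the real hooks must not run again. */
static int  dummy_mmap(void *obj)  { (void)obj; return 0; }
static void dummy_close(void *obj) { (void)obj; }
static const struct ops dummy_ops = { dummy_mmap, dummy_close };

/* Call the mmap hook; on failure, disable all further hooks. */
static int mapping_mmap(struct mapping *m)
{
        int err = m->ops->mmap(m);

        if (err)
                m->ops = &dummy_ops;
        return err;
}

/* Close once, then disable, so an error path cannot close twice. */
static void mapping_close(struct mapping *m)
{
        m->ops->close(m);
        m->ops = &dummy_ops;
}

static int  failing_mmap(void *obj) { (void)obj; return -1; }
static void real_close(void *obj)   { (void)obj; puts("close hook ran"); }
static const struct ops real_ops = { failing_mmap, real_close };

int main(void)
{
        struct mapping m = { &real_ops };

        if (mapping_mmap(&m))      /* hook fails and installs dummy_ops ...   */
                mapping_close(&m); /* ... so the real close hook never runs   */
        return 0;
}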
@@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struct folio *folio,
         css_get(&to->css);
         css_put(&from->css);
 
+        /* Warning should never happen, so don't worry about refcount non-0 */
+        WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
         folio->memcg_data = (unsigned long)to;
 
         __folio_memcg_unlock(from);

@@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
         enum mc_target_type target_type;
         union mc_target target;
         struct folio *folio;
+        bool tried_split_before = false;
 
+retry_pmd:
         ptl = pmd_trans_huge_lock(pmd, vma);
         if (ptl) {
                 if (mc.precharge < HPAGE_PMD_NR) {

@@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
                 if (target_type == MC_TARGET_PAGE) {
                         folio = target.folio;
+                        /*
+                         * Deferred split queue locking depends on memcg,
+                         * and unqueue is unsafe unless folio refcount is 0:
+                         * split or skip if on the queue? first try to split.
+                         */
+                        if (!list_empty(&folio->_deferred_list)) {
+                                spin_unlock(ptl);
+                                if (!tried_split_before)
+                                        split_folio(folio);
+                                folio_unlock(folio);
+                                folio_put(folio);
+                                if (tried_split_before)
+                                        return 0;
+                                tried_split_before = true;
+                                goto retry_pmd;
+                        }
+                        /*
+                         * So long as that pmd lock is held, the folio cannot
+                         * be racily added to the _deferred_list, because
+                         * __folio_remove_rmap() will find !partially_mapped.
+                         */
                         if (folio_isolate_lru(folio)) {
                                 if (!mem_cgroup_move_account(folio, true,
                                                              mc.from, mc.to)) {

@@ -4629,10 +4629,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
         struct obj_cgroup *objcg;
 
         VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
-        VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
-                        !folio_test_hugetlb(folio) &&
-                        !list_empty(&folio->_deferred_list) &&
-                        folio_test_partially_mapped(folio), folio);
 
         /*
          * Nobody should be changing or seriously looking at

@@ -4679,6 +4675,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
                 ug->nr_memory += nr_pages;
         ug->pgpgout++;
 
+        WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
         folio->memcg_data = 0;
 }
 

@@ -4790,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
 
         /* Transfer the charge and the css ref */
         commit_charge(new, memcg);
+
+        /* Warning should never happen, so don't worry about refcount non-0 */
+        WARN_ON_ONCE(folio_unqueue_deferred_split(old));
         old->memcg_data = 0;
 }
 

@@ -4976,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
         VM_BUG_ON_FOLIO(oldid, folio);
         mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
 
+        folio_unqueue_deferred_split(folio);
         folio->memcg_data = 0;
 
         if (!mem_cgroup_is_root(memcg))
@@ -490,7 +490,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
             folio_test_large_rmappable(folio)) {
                 if (!folio_ref_freeze(folio, expected_count))
                         return -EAGAIN;
-                folio_undo_large_rmappable(folio);
+                folio_unqueue_deferred_split(folio);
                 folio_ref_unfreeze(folio, expected_count);
         }
 

@@ -515,7 +515,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
         }
 
         /* Take off deferred split queue while frozen and memcg set */
-        folio_undo_large_rmappable(folio);
+        folio_unqueue_deferred_split(folio);
 
         /*
          * Now we know that no one else is looking at the folio:
@@ -725,14 +725,17 @@ static int apply_mlockall_flags(int flags)
         }
 
         for_each_vma(vmi, vma) {
+                int error;
                 vm_flags_t newflags;
 
                 newflags = vma->vm_flags & ~VM_LOCKED_MASK;
                 newflags |= to_add;
 
-                /* Ignore errors */
-                mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
-                            newflags);
+                error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
+                                    newflags);
+                /* Ignore errors, but prev needs fixing up. */
+                if (error)
+                        prev = vma;
                 cond_resched();
         }
 out:
mm/mmap.c (130 changed lines)

@@ -344,7 +344,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
          * to. we assume access permissions have been handled by the open
          * of the memory object, so we don't do any here.
          */
-        vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+        vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
                         mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
         /* Obtain the address to map to. we verify (or select) it and ensure

@@ -1358,20 +1358,18 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
         return do_vmi_munmap(&vmi, mm, start, len, uf, false);
 }
 
-unsigned long mmap_region(struct file *file, unsigned long addr,
+static unsigned long __mmap_region(struct file *file, unsigned long addr,
                 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                 struct list_head *uf)
 {
         struct mm_struct *mm = current->mm;
         struct vm_area_struct *vma = NULL;
         pgoff_t pglen = PHYS_PFN(len);
-        struct vm_area_struct *merge;
         unsigned long charged = 0;
         struct vma_munmap_struct vms;
         struct ma_state mas_detach;
         struct maple_tree mt_detach;
         unsigned long end = addr + len;
-        bool writable_file_mapping = false;
         int error;
         VMA_ITERATOR(vmi, mm, addr);
         VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);

@@ -1422,7 +1420,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
                 /*
                  * clear PTEs while the vma is still in the tree so that rmap
                  * cannot race with the freeing later in the truncate scenario.
-                 * This is also needed for call_mmap(), which is why vm_ops
+                 * This is also needed for mmap_file(), which is why vm_ops
                  * close function is called.
                  */
                 vms_clean_up_area(&vms, &mas_detach);

@@ -1445,35 +1443,35 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
         vm_flags_init(vma, vm_flags);
         vma->vm_page_prot = vm_get_page_prot(vm_flags);
 
+        if (vma_iter_prealloc(&vmi, vma)) {
+                error = -ENOMEM;
+                goto free_vma;
+        }
+
         if (file) {
                 vma->vm_file = get_file(file);
-                error = call_mmap(file, vma);
+                error = mmap_file(file, vma);
                 if (error)
-                        goto unmap_and_free_vma;
-
-                if (vma_is_shared_maywrite(vma)) {
-                        error = mapping_map_writable(file->f_mapping);
-                        if (error)
-                                goto close_and_free_vma;
-
-                        writable_file_mapping = true;
-                }
+                        goto unmap_and_free_file_vma;
 
+                /* Drivers cannot alter the address of the VMA. */
+                WARN_ON_ONCE(addr != vma->vm_start);
                 /*
-                 * Expansion is handled above, merging is handled below.
-                 * Drivers should not alter the address of the VMA.
+                 * Drivers should not permit writability when previously it was
+                 * disallowed.
                  */
-                if (WARN_ON((addr != vma->vm_start))) {
-                        error = -EINVAL;
-                        goto close_and_free_vma;
-                }
+                VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
+                                !(vm_flags & VM_MAYWRITE) &&
+                                (vma->vm_flags & VM_MAYWRITE));
 
                 vma_iter_config(&vmi, addr, end);
                 /*
-                 * If vm_flags changed after call_mmap(), we should try merge
+                 * If vm_flags changed after mmap_file(), we should try merge
                  * vma again as we may succeed this time.
                  */
                 if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
+                        struct vm_area_struct *merge;
+
                         vmg.flags = vma->vm_flags;
                         /* If this fails, state is reset ready for a reattempt. */
                         merge = vma_merge_new_range(&vmg);

@@ -1491,7 +1489,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
                         vma = merge;
                         /* Update vm_flags to pick up the change. */
                         vm_flags = vma->vm_flags;
-                        goto unmap_writable;
+                        goto file_expanded;
                 }
                 vma_iter_config(&vmi, addr, end);
         }

@@ -1500,26 +1498,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
         } else if (vm_flags & VM_SHARED) {
                 error = shmem_zero_setup(vma);
                 if (error)
-                        goto free_vma;
+                        goto free_iter_vma;
         } else {
                 vma_set_anonymous(vma);
         }
 
-        if (map_deny_write_exec(vma, vma->vm_flags)) {
-                error = -EACCES;
-                goto close_and_free_vma;
-        }
-
-        /* Allow architectures to sanity-check the vm_flags */
-        if (!arch_validate_flags(vma->vm_flags)) {
-                error = -EINVAL;
-                goto close_and_free_vma;
-        }
-
-        if (vma_iter_prealloc(&vmi, vma)) {
-                error = -ENOMEM;
-                goto close_and_free_vma;
-        }
+#ifdef CONFIG_SPARC64
+        /* TODO: Fix SPARC ADI! */
+        WARN_ON_ONCE(!arch_validate_flags(vm_flags));
+#endif
 
         /* Lock the VMA since it is modified after insertion into VMA tree */
         vma_start_write(vma);

@@ -1533,10 +1520,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
          */
         khugepaged_enter_vma(vma, vma->vm_flags);
 
-        /* Once vma denies write, undo our temporary denial count */
-unmap_writable:
-        if (writable_file_mapping)
-                mapping_unmap_writable(file->f_mapping);
+file_expanded:
         file = vma->vm_file;
         ksm_add_vma(vma);
 expanded:

@@ -1569,24 +1553,17 @@ expanded:
 
         vma_set_page_prot(vma);
 
-        validate_mm(mm);
         return addr;
 
-close_and_free_vma:
-        if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close)
-                vma->vm_ops->close(vma);
+unmap_and_free_file_vma:
+        fput(vma->vm_file);
+        vma->vm_file = NULL;
 
-        if (file || vma->vm_file) {
-unmap_and_free_vma:
-                fput(vma->vm_file);
-                vma->vm_file = NULL;
-
-                vma_iter_set(&vmi, vma->vm_end);
-                /* Undo any partial mapping done by a device driver. */
-                unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
-        }
-        if (writable_file_mapping)
-                mapping_unmap_writable(file->f_mapping);
+        vma_iter_set(&vmi, vma->vm_end);
+        /* Undo any partial mapping done by a device driver. */
+        unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
+free_iter_vma:
+        vma_iter_free(&vmi);
 free_vma:
         vm_area_free(vma);
 unacct_error:

@@ -1596,10 +1573,43 @@ unacct_error:
 abort_munmap:
         vms_abort_munmap_vmas(&vms, &mas_detach);
 gather_failed:
-        validate_mm(mm);
         return error;
 }
 
+unsigned long mmap_region(struct file *file, unsigned long addr,
+        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+        struct list_head *uf)
+{
+        unsigned long ret;
+        bool writable_file_mapping = false;
+
+        /* Check to see if MDWE is applicable. */
+        if (map_deny_write_exec(vm_flags, vm_flags))
+                return -EACCES;
+
+        /* Allow architectures to sanity-check the vm_flags. */
+        if (!arch_validate_flags(vm_flags))
+                return -EINVAL;
+
+        /* Map writable and ensure this isn't a sealed memfd. */
+        if (file && is_shared_maywrite(vm_flags)) {
+                int error = mapping_map_writable(file->f_mapping);
+
+                if (error)
+                        return error;
+                writable_file_mapping = true;
+        }
+
+        ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
+
+        /* Clear our write mapping regardless of error. */
+        if (writable_file_mapping)
+                mapping_unmap_writable(file->f_mapping);
+
+        validate_mm(current->mm);
+        return ret;
+}
+
 static int __vm_munmap(unsigned long start, size_t len, bool unlock)
 {
         int ret;

@@ -1934,7 +1944,7 @@ void exit_mmap(struct mm_struct *mm)
         do {
                 if (vma->vm_flags & VM_ACCOUNT)
                         nr_accounted += vma_pages(vma);
-                remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
+                remove_vma(vma, /* unreachable = */ true);
                 count++;
                 cond_resched();
                 vma = vma_next(&vmi);
@@ -810,7 +810,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
                         break;
                 }
 
-                if (map_deny_write_exec(vma, newflags)) {
+                if (map_deny_write_exec(vma->vm_flags, newflags)) {
                         error = -EACCES;
                         break;
                 }
@@ -589,8 +589,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
  */
 static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-        if (vma->vm_ops && vma->vm_ops->close)
-                vma->vm_ops->close(vma);
+        vma_close(vma);
         if (vma->vm_file)
                 fput(vma->vm_file);
         put_nommu_region(vma->vm_region);

@@ -843,7 +842,7 @@ static unsigned long determine_vm_flags(struct file *file,
 {
         unsigned long vm_flags;
 
-        vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
+        vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
 
         if (!file) {
                 /*

@@ -885,7 +884,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
 {
         int ret;
 
-        ret = call_mmap(vma->vm_file, vma);
+        ret = mmap_file(vma->vm_file, vma);
         if (ret == 0) {
                 vma->vm_region->vm_top = vma->vm_region->vm_end;
                 return 0;

@@ -918,7 +917,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
          * happy.
          */
         if (capabilities & NOMMU_MAP_DIRECT) {
-                ret = call_mmap(vma->vm_file, vma);
+                ret = mmap_file(vma->vm_file, vma);
                 /* shouldn't return success if we're not sharing */
                 if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
                         ret = -ENOSYS;
@@ -635,6 +635,8 @@ compaction_capture(struct capture_control *capc, struct page *page,
 static inline void account_freepages(struct zone *zone, int nr_pages,
                                      int migratetype)
 {
+        lockdep_assert_held(&zone->lock);
+
         if (is_migrate_isolate(migratetype))
                 return;
 

@@ -642,6 +644,9 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
 
         if (is_migrate_cma(migratetype))
                 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
+        else if (is_migrate_highatomic(migratetype))
+                WRITE_ONCE(zone->nr_free_highatomic,
+                           zone->nr_free_highatomic + nr_pages);
 }
 
 /* Used for pages not on another list */

@@ -961,9 +966,8 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
                 break;
         case 2:
                 /* the second tail page: deferred_list overlaps ->mapping */
-                if (unlikely(!list_empty(&folio->_deferred_list) &&
-                             folio_test_partially_mapped(folio))) {
-                        bad_page(page, "partially mapped folio on deferred list");
+                if (unlikely(!list_empty(&folio->_deferred_list))) {
+                        bad_page(page, "on deferred list");
                         goto out;
                 }
                 break;

@@ -2682,7 +2686,6 @@ void free_unref_folios(struct folio_batch *folios)
                 unsigned long pfn = folio_pfn(folio);
                 unsigned int order = folio_order(folio);
 
-                folio_undo_large_rmappable(folio);
                 if (!free_pages_prepare(&folio->page, order))
                         continue;
                 /*

@@ -3081,11 +3084,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
 
         /*
          * If the caller does not have rights to reserves below the min
-         * watermark then subtract the high-atomic reserves. This will
-         * over-estimate the size of the atomic reserve but it avoids a search.
+         * watermark then subtract the free pages reserved for highatomic.
          */
         if (likely(!(alloc_flags & ALLOC_RESERVES)))
-                unusable_free += z->nr_reserved_highatomic;
+                unusable_free += READ_ONCE(z->nr_free_highatomic);
 
 #ifdef CONFIG_CMA
         /* If allocation can't use CMA areas don't use free CMA pages */
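A worked example of why the watermark check now uses the free count (numbers are illustrative): if a zone has reserved 2048 highatomic pages but 1536 of them are currently allocated, only 512 free pages actually sit in the reserve. The old check still subtracted the full 2048 from the zone's free pages, over-estimating the unusable amount by 1536 pages and letting the watermark test fail even though enough usable memory remained; subtracting READ_ONCE(z->nr_free_highatomic), here 512, removes that gap.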
@@ -2733,9 +2733,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
         if (ret)
                 return ret;
 
-        /* arm64 - allow memory tagging on RAM-based files */
-        vm_flags_set(vma, VM_MTE_ALLOWED);
-
         file_accessed(file);
         /* This is anonymous shared memory if it is unlinked at the time of mmap */
         if (inode->i_nlink)
@@ -121,7 +121,7 @@ void __folio_put(struct folio *folio)
         }
 
         page_cache_release(folio);
-        folio_undo_large_rmappable(folio);
+        folio_unqueue_deferred_split(folio);
         mem_cgroup_uncharge(folio);
         free_unref_page(&folio->page, folio_order(folio));
 }

@@ -988,7 +988,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
                         free_huge_folio(folio);
                         continue;
                 }
-                folio_undo_large_rmappable(folio);
+                folio_unqueue_deferred_split(folio);
                 __page_cache_release(folio, &lruvec, &flags);
 
                 if (j != i)
mm/vma.c (14 changed lines)

@@ -323,11 +323,10 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
 /*
  * Close a vm structure and free it.
  */
-void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
+void remove_vma(struct vm_area_struct *vma, bool unreachable)
 {
         might_sleep();
-        if (!closed && vma->vm_ops && vma->vm_ops->close)
-                vma->vm_ops->close(vma);
+        vma_close(vma);
         if (vma->vm_file)
                 fput(vma->vm_file);
         mpol_put(vma_policy(vma));

@@ -1115,9 +1114,7 @@ void vms_clean_up_area(struct vma_munmap_struct *vms,
         vms_clear_ptes(vms, mas_detach, true);
         mas_set(mas_detach, 0);
         mas_for_each(mas_detach, vma, ULONG_MAX)
-                if (vma->vm_ops && vma->vm_ops->close)
-                        vma->vm_ops->close(vma);
-        vms->closed_vm_ops = true;
+                vma_close(vma);
 }
 
 /*

@@ -1160,7 +1157,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
         /* Remove and clean up vmas */
         mas_set(mas_detach, 0);
         mas_for_each(mas_detach, vma, ULONG_MAX)
-                remove_vma(vma, /* = */ false, vms->closed_vm_ops);
+                remove_vma(vma, /* unreachable = */ false);
 
         vm_unacct_memory(vms->nr_accounted);
         validate_mm(mm);

@@ -1684,8 +1681,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
         return new_vma;
 
 out_vma_link:
-        if (new_vma->vm_ops && new_vma->vm_ops->close)
-                new_vma->vm_ops->close(new_vma);
+        vma_close(new_vma);
 
         if (new_vma->vm_file)
                 fput(new_vma->vm_file);
mm/vma.h (6 changed lines)

@@ -42,8 +42,7 @@ struct vma_munmap_struct {
         int vma_count;                  /* Number of vmas that will be removed */
         bool unlock;                    /* Unlock after the munmap */
         bool clear_ptes;                /* If there are outstanding PTE to be cleared */
-        bool closed_vm_ops;             /* call_mmap() was encountered, so vmas may be closed */
-        /* 1 byte hole */
+        /* 2 byte hole */
         unsigned long nr_pages;         /* Number of pages being removed */
         unsigned long locked_vm;        /* Number of locked pages */
         unsigned long nr_accounted;     /* Number of VM_ACCOUNT pages */

@@ -198,7 +197,6 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
         vms->unmap_start = FIRST_USER_ADDRESS;
         vms->unmap_end = USER_PGTABLES_CEILING;
         vms->clear_ptes = false;
-        vms->closed_vm_ops = false;
 }
 #endif
 

@@ -269,7 +267,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                   unsigned long start, size_t len, struct list_head *uf,
                   bool unlock);
 
-void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);
+void remove_vma(struct vm_area_struct *vma, bool unreachable);
 
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
                 struct vm_area_struct *prev, struct vm_area_struct *next);
@@ -1476,7 +1476,7 @@ free_it:
                  */
                 nr_reclaimed += nr_pages;
 
-                folio_undo_large_rmappable(folio);
+                folio_unqueue_deferred_split(folio);
                 if (folio_batch_add(&free_folios, folio) == 0) {
                         mem_cgroup_uncharge_folios(&free_folios);
                         try_to_unmap_flush();

@@ -1864,7 +1864,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
                 if (unlikely(folio_put_testzero(folio))) {
                         __folio_clear_lru_flags(folio);
 
-                        folio_undo_large_rmappable(folio);
+                        folio_unqueue_deferred_split(folio);
                         if (folio_batch_add(&free_folios, folio) == 0) {
                                 spin_unlock_irq(&lruvec->lru_lock);
                                 mem_cgroup_uncharge_folios(&free_folios);
@@ -44,13 +44,6 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
         if (fd < 0)
                 ksft_exit_fail_perror("Error opening file\n");
 
-        /* Get the free huge pages before allocation */
-        free_hpage_b = get_free_hugepages();
-        if (free_hpage_b == 0) {
-                close(fd);
-                ksft_exit_skip("No free hugepage, exiting!\n");
-        }
-
         /* Allocate a hugetlb page */
         orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0);
         if (orig_buffer == MAP_FAILED) {

@@ -94,8 +87,20 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
 int main(void)
 {
         size_t pagesize = 0;
+        int fd;
 
         ksft_print_header();
+
+        /* Open the file to DIO */
+        fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
+        if (fd < 0)
+                ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno));
+        close(fd);
+
+        /* Check if huge pages are free */
+        if (!get_free_hugepages())
+                ksft_exit_skip("No free hugepage, exiting\n");
+
         ksft_set_plan(4);
 
         /* Get base page size */