Merge tag 'mm-hotfixes-stable-2024-11-09-22-40' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "20 hotfixes, 14 of which are cc:stable.

  Three affect DAMON. Lorenzo's five-patch series to address the
  mmap_region error handling is here also.

  Apart from that, various singletons"

* tag 'mm-hotfixes-stable-2024-11-09-22-40' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  mailmap: add entry for Thorsten Blum
  ocfs2: remove entry once instead of null-ptr-dereference in ocfs2_xa_remove()
  signal: restore the override_rlimit logic
  fs/proc: fix compile warning about variable 'vmcore_mmap_ops'
  ucounts: fix counter leak in inc_rlimit_get_ucounts()
  selftests: hugetlb_dio: check for initial conditions to skip in the start
  mm: fix docs for the kernel parameter ``thp_anon=``
  mm/damon/core: avoid overflow in damon_feed_loop_next_input()
  mm/damon/core: handle zero schemes apply interval
  mm/damon/core: handle zero {aggregation,ops_update} intervals
  mm/mlock: set the correct prev on failure
  objpool: fix to make percpu slot allocation more robust
  mm/page_alloc: keep track of free highatomic
  mm: resolve faulty mmap_region() error path behaviour
  mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling
  mm: refactor map_deny_write_exec()
  mm: unconditionally close VMAs on error
  mm: avoid unsafe VMA hook invocation when error arises on mmap hook
  mm/thp: fix deferred split unqueue naming and locking
  mm/thp: fix deferred split queue not partially_mapped
Commit 28e43197c4 by Linus Torvalds, 2024-11-10 09:04:27 -08:00
30 changed files with 329 additions and 172 deletions

@ -665,6 +665,7 @@ Tomeu Vizoso <tomeu@tomeuvizoso.net> <tomeu.vizoso@collabora.com>
Thomas Graf <tgraf@suug.ch>
Thomas Körper <socketcan@esd.eu> <thomas.koerper@esd.eu>
Thomas Pedersen <twp@codeaurora.org>
Thorsten Blum <thorsten.blum@linux.dev> <thorsten.blum@toblux.com>
Tiezhu Yang <yangtiezhu@loongson.cn> <kernelpatch@126.com>
Tingwei Zhang <quic_tingwei@quicinc.com> <tingwei@codeaurora.org>
Tirupathi Reddy <quic_tirupath@quicinc.com> <tirupath@codeaurora.org>


@ -6688,7 +6688,7 @@
0: no polling (default)
thp_anon= [KNL]
Format: <size>,<size>[KMG]:<state>;<size>-<size>[KMG]:<state>
Format: <size>[KMG],<size>[KMG]:<state>;<size>[KMG]-<size>[KMG]:<state>
state is one of "always", "madvise", "never" or "inherit".
Control the default behavior of the system with respect
to anonymous transparent hugepages.


@ -303,7 +303,7 @@ control by passing the parameter ``transparent_hugepage=always`` or
kernel command line.
Alternatively, each supported anonymous THP size can be controlled by
passing ``thp_anon=<size>,<size>[KMG]:<state>;<size>-<size>[KMG]:<state>``,
passing ``thp_anon=<size>[KMG],<size>[KMG]:<state>;<size>[KMG]-<size>[KMG]:<state>``,
where ``<size>`` is the THP size (must be a power of 2 of PAGE_SIZE and
supported anonymous THP) and ``<state>`` is one of ``always``, ``madvise``,
``never`` or ``inherit``.
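As a concrete illustration of the corrected format (hypothetical sizes and states; assumes a 4K base page and that the listed sizes are supported on the system), a boot line could carry:

  thp_anon=16K-64K:always;128K,512K:inherit;256K:madvise;1M-2M:never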


@ -6,6 +6,8 @@
#ifndef BUILD_VDSO
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/shmem_fs.h>
#include <linux/types.h>
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
@ -31,19 +33,21 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
}
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
unsigned long flags)
{
/*
* Only allow MTE on anonymous mappings as these are guaranteed to be
* backed by tags-capable memory. The vm_flags may be overridden by a
* filesystem supporting MTE (RAM-based).
*/
if (system_supports_mte() && (flags & MAP_ANONYMOUS))
if (system_supports_mte() &&
((flags & MAP_ANONYMOUS) || shmem_file(file)))
return VM_MTE_ALLOWED;
return 0;
}
#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
static inline bool arch_validate_prot(unsigned long prot,
unsigned long addr __always_unused)
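An illustrative userspace sketch of what the arm64 change preserves (not part of the patch; arm64-only, assumes an MTE-capable kernel and glibc >= 2.27 for memfd_create()): PROT_MTE on a shmem-backed mapping is now permitted via arch_calc_vm_flag_bits() rather than via shmem's mmap hook.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef PROT_MTE
#define PROT_MTE 0x20	/* arm64 uapi value */
#endif

int main(void)
{
	int fd = memfd_create("mte-shmem", 0);

	if (fd < 0 || ftruncate(fd, 4096)) {
		perror("memfd setup");
		return 1;
	}
	/* shmem_file(file) in arch_calc_vm_flag_bits() now sets
	 * VM_MTE_ALLOWED for this mapping, so PROT_MTE is accepted. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_MTE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap(PROT_MTE)");
		return 1;
	}
	printf("MTE-tagged shmem mapping at %p\n", p);
	return 0;
}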


@ -2,6 +2,7 @@
#ifndef __ASM_MMAN_H__
#define __ASM_MMAN_H__
#include <linux/fs.h>
#include <uapi/asm/mman.h>
/* PARISC cannot allow mdwe as it needs writable stacks */
@ -11,7 +12,7 @@ static inline bool arch_memory_deny_write_exec_supported(void)
}
#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
static inline unsigned long arch_calc_vm_flag_bits(struct file *file, unsigned long flags)
{
/*
* The stack on parisc grows upwards, so if userspace requests memory
@ -23,6 +24,6 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
return 0;
}
#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
#endif /* __ASM_MMAN_H__ */


@ -2036,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
rc = 0;
ocfs2_xa_cleanup_value_truncate(loc, "removing",
orig_clusters);
if (rc)
goto out;
goto out;
}
}


@ -457,10 +457,6 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
#endif
}
static const struct vm_operations_struct vmcore_mmap_ops = {
.fault = mmap_vmcore_fault,
};
/**
* vmcore_alloc_buf - allocate buffer in vmalloc memory
* @size: size of buffer
@ -488,6 +484,11 @@ static inline char *vmcore_alloc_buf(size_t size)
* virtually contiguous user-space in ELF layout.
*/
#ifdef CONFIG_MMU
static const struct vm_operations_struct vmcore_mmap_ops = {
.fault = mmap_vmcore_fault,
};
/*
* remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
* reported as not being ram with the zero page.


@ -2,6 +2,7 @@
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu_counter.h>
@ -94,7 +95,7 @@ static inline void vm_unacct_memory(long pages)
#endif
#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(flags) 0
#define arch_calc_vm_flag_bits(file, flags) 0
#endif
#ifndef arch_validate_prot
@ -151,13 +152,13 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
* Combine the mmap "flags" argument into "vm_flags" used internally.
*/
static inline unsigned long
calc_vm_flag_bits(unsigned long flags)
calc_vm_flag_bits(struct file *file, unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
_calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) |
arch_calc_vm_flag_bits(flags);
arch_calc_vm_flag_bits(file, flags);
}
unsigned long vm_commit_limit(void);
@ -188,16 +189,31 @@ static inline bool arch_memory_deny_write_exec_supported(void)
*
* d) mmap(PROT_READ | PROT_EXEC)
* mmap(PROT_READ | PROT_EXEC | PROT_BTI)
*
* This is only applicable if the user has set the Memory-Deny-Write-Execute
* (MDWE) protection mask for the current process.
*
* @old specifies the VMA flags the VMA originally possessed, and @new the ones
* we propose to set.
*
* Return: false if proposed change is OK, true if not ok and should be denied.
*/
static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
/* If MDWE is disabled, we have nothing to deny. */
if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
return false;
if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
/* If the new VMA is not executable, we have nothing to deny. */
if (!(new & VM_EXEC))
return false;
/* Under MDWE we do not accept newly writably executable VMAs... */
if (new & VM_WRITE)
return true;
if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
/* ...nor previously non-executable VMAs becoming executable. */
if (!(old & VM_EXEC))
return true;
return false;
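A small standalone sketch mirroring the new two-argument map_deny_write_exec() semantics documented above; MDWE is assumed enabled (the MMF_HAS_MDWE test is omitted) and the VM_* values are illustrative stand-ins, not the kernel definitions.

#include <assert.h>
#include <stdbool.h>

#define VM_WRITE 0x2UL
#define VM_EXEC  0x4UL

static bool map_deny_write_exec(unsigned long old, unsigned long new)
{
	if (!(new & VM_EXEC))	/* non-executable result: nothing to deny */
		return false;
	if (new & VM_WRITE)	/* newly writable and executable: denied */
		return true;
	if (!(old & VM_EXEC))	/* gaining exec it never had: denied */
		return true;
	return false;		/* e.g. PROT_EXEC -> PROT_EXEC | PROT_BTI */
}

int main(void)
{
	/* mmap() path: old == new, so a plain PROT_EXEC mapping is allowed... */
	assert(!map_deny_write_exec(VM_EXEC, VM_EXEC));
	/* ...while a fresh PROT_WRITE | PROT_EXEC mapping is denied. */
	assert(map_deny_write_exec(VM_WRITE | VM_EXEC, VM_WRITE | VM_EXEC));
	/* mprotect() path: making a previously non-exec VMA executable is denied. */
	assert(map_deny_write_exec(VM_WRITE, VM_EXEC));
	return 0;
}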


@ -823,6 +823,7 @@ struct zone {
unsigned long watermark_boost;
unsigned long nr_reserved_highatomic;
unsigned long nr_free_highatomic;
/*
* We don't know if the memory that we're going to allocate will be


@ -141,7 +141,8 @@ static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type ty
long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type);
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
bool override_rlimit);
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);


@ -419,7 +419,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
*/
rcu_read_lock();
ucounts = task_ucounts(t);
sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
override_rlimit);
rcu_read_unlock();
if (!sigpending)
return NULL;


@ -307,7 +307,8 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
do_dec_rlimit_put_ucounts(ucounts, NULL, type);
}
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
bool override_rlimit)
{
/* Caller must hold a reference to ucounts */
struct ucounts *iter;
@ -317,10 +318,11 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
long new = atomic_long_add_return(1, &iter->rlimit[type]);
if (new < 0 || new > max)
goto unwind;
goto dec_unwind;
if (iter == ucounts)
ret = new;
max = get_userns_rlimit_max(iter->ns, type);
if (!override_rlimit)
max = get_userns_rlimit_max(iter->ns, type);
/*
* Grab an extra ucount reference for the caller when
* the rlimit count was previously 0.
@ -334,7 +336,6 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
dec_unwind:
dec = atomic_long_sub_return(1, &iter->rlimit[type]);
WARN_ON_ONCE(dec < 0);
unwind:
do_dec_rlimit_put_ucounts(ucounts, iter, type);
return 0;
}


@ -74,15 +74,21 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
* warm caches and TLB hits. in default vmalloc is used to
* reduce the pressure of kernel slab system. as we know,
* mimimal size of vmalloc is one page since vmalloc would
* always align the requested size to page size
* always align the requested size to page size.
* but if vmalloc fails or it is not available (e.g. GFP_ATOMIC)
* allocate percpu slot with kmalloc.
*/
if ((pool->gfp & GFP_ATOMIC) == GFP_ATOMIC)
slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
else
slot = NULL;
if ((pool->gfp & (GFP_ATOMIC | GFP_KERNEL)) != GFP_ATOMIC)
slot = __vmalloc_node(size, sizeof(void *), pool->gfp,
cpu_to_node(i), __builtin_return_address(0));
if (!slot)
return -ENOMEM;
if (!slot) {
slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
if (!slot)
return -ENOMEM;
}
memset(slot, 0, size);
pool->cpu_slots[i] = slot;


@ -1412,7 +1412,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
if (c->passed_sample_intervals != s->next_apply_sis)
if (c->passed_sample_intervals < s->next_apply_sis)
continue;
if (!s->wmarks.activated)
@ -1456,17 +1456,31 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input,
unsigned long score)
{
const unsigned long goal = 10000;
unsigned long score_goal_diff = max(goal, score) - min(goal, score);
unsigned long score_goal_diff_bp = score_goal_diff * 10000 / goal;
unsigned long compensation = last_input * score_goal_diff_bp / 10000;
/* Set minimum input as 10000 to avoid compensation be zero */
const unsigned long min_input = 10000;
unsigned long score_goal_diff, compensation;
bool over_achieving = score > goal;
if (goal > score)
if (score == goal)
return last_input;
if (score >= goal * 2)
return min_input;
if (over_achieving)
score_goal_diff = score - goal;
else
score_goal_diff = goal - score;
if (last_input < ULONG_MAX / score_goal_diff)
compensation = last_input * score_goal_diff / goal;
else
compensation = last_input / goal * score_goal_diff;
if (over_achieving)
return max(last_input - compensation, min_input);
if (last_input < ULONG_MAX - compensation)
return last_input + compensation;
if (last_input > compensation + min_input)
return last_input - compensation;
return min_input;
return ULONG_MAX;
}
#ifdef CONFIG_PSI
@ -1622,7 +1636,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
bool has_schemes_to_apply = false;
damon_for_each_scheme(s, c) {
if (c->passed_sample_intervals != s->next_apply_sis)
if (c->passed_sample_intervals < s->next_apply_sis)
continue;
if (!s->wmarks.activated)
@ -1642,9 +1656,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
}
damon_for_each_scheme(s, c) {
if (c->passed_sample_intervals != s->next_apply_sis)
if (c->passed_sample_intervals < s->next_apply_sis)
continue;
s->next_apply_sis +=
s->next_apply_sis = c->passed_sample_intervals +
(s->apply_interval_us ? s->apply_interval_us :
c->attrs.aggr_interval) / sample_interval;
}
@ -2000,7 +2014,7 @@ static int kdamond_fn(void *data)
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
if (ctx->passed_sample_intervals == next_aggregation_sis) {
if (ctx->passed_sample_intervals >= next_aggregation_sis) {
kdamond_merge_regions(ctx,
max_nr_accesses / 10,
sz_limit);
@ -2018,7 +2032,7 @@ static int kdamond_fn(void *data)
sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
if (ctx->passed_sample_intervals == next_aggregation_sis) {
if (ctx->passed_sample_intervals >= next_aggregation_sis) {
ctx->next_aggregation_sis = next_aggregation_sis +
ctx->attrs.aggr_interval / sample_interval;
@ -2028,7 +2042,7 @@ static int kdamond_fn(void *data)
ctx->ops.reset_aggregated(ctx);
}
if (ctx->passed_sample_intervals == next_ops_update_sis) {
if (ctx->passed_sample_intervals >= next_ops_update_sis) {
ctx->next_ops_update_sis = next_ops_update_sis +
ctx->attrs.ops_update_interval /
sample_interval;
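For reference, a standalone copy of the reworked damon_feed_loop_next_input() arithmetic, so the overflow handling can be exercised outside the kernel; the sample inputs are illustrative only and max() is open-coded.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned long feed_loop_next_input(unsigned long last_input,
					   unsigned long score)
{
	const unsigned long goal = 10000;
	const unsigned long min_input = 10000;
	unsigned long score_goal_diff, compensation;
	bool over_achieving = score > goal;

	if (score == goal)
		return last_input;
	if (score >= goal * 2)
		return min_input;

	score_goal_diff = over_achieving ? score - goal : goal - score;

	/* Pick the multiplication order that cannot overflow. */
	if (last_input < ULONG_MAX / score_goal_diff)
		compensation = last_input * score_goal_diff / goal;
	else
		compensation = last_input / goal * score_goal_diff;

	if (over_achieving) {
		unsigned long next = last_input - compensation;

		return next > min_input ? next : min_input;
	}
	if (last_input < ULONG_MAX - compensation)
		return last_input + compensation;
	return ULONG_MAX;
}

int main(void)
{
	/* 20% under the goal: input grows by ~20%. */
	printf("%lu\n", feed_loop_next_input(1000000, 8000));	/* 1200000 */
	/* 20% over the goal: input shrinks by ~20%. */
	printf("%lu\n", feed_loop_next_input(1000000, 12000));	/* 800000 */
	/* A huge last_input no longer overflows the multiplication. */
	printf("%lu\n", feed_loop_next_input(ULONG_MAX / 2, 8000));
	return 0;
}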


@ -3588,10 +3588,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
return split_huge_page_to_list_to_order(&folio->page, list, ret);
}
void __folio_undo_large_rmappable(struct folio *folio)
/*
* __folio_unqueue_deferred_split() is not to be called directly:
* the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
* limits its calls to those folios which may have a _deferred_list for
* queueing THP splits, and that list is (racily observed to be) non-empty.
*
* It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
* zero: because even when split_queue_lock is held, a non-empty _deferred_list
* might be in use on deferred_split_scan()'s unlocked on-stack list.
*
* If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
* therefore important to unqueue deferred split before changing folio memcg.
*/
bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
bool unqueued = false;
WARN_ON_ONCE(folio_ref_count(folio));
WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@ -3603,8 +3620,11 @@ void __folio_undo_large_rmappable(struct folio *folio)
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_del_init(&folio->_deferred_list);
unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
return unqueued; /* useful for debug warnings */
}
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
@ -3627,14 +3647,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
return;
/*
* The try_to_unmap() in page reclaim path might reach here too,
* this may cause a race condition to corrupt deferred split queue.
* And, if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in shrinker.
*
* Check the swapcache flag to determine if the folio is being
* handled by page reclaim since THP swap would add the folio into
* swap cache before calling try_to_unmap().
* Exclude swapcache: originally to avoid a corrupt deferred split
* queue. Nowadays that is fully prevented by mem_cgroup_swapout();
* but if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in the shrinker, so excluding
* swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
@ -3718,8 +3735,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
LIST_HEAD(list);
struct folio *folio, *next;
int split = 0;
struct folio *folio, *next, *prev = NULL;
int split = 0, removed = 0;
#ifdef CONFIG_MEMCG
if (sc->memcg)
@ -3775,15 +3792,28 @@ next:
*/
if (!did_split && !folio_test_partially_mapped(folio)) {
list_del_init(&folio->_deferred_list);
ds_queue->split_queue_len--;
removed++;
} else {
/*
* That unlocked list_del_init() above would be unsafe,
* unless its folio is separated from any earlier folios
* left on the list (which may be concurrently unqueued)
* by one safe folio with refcount still raised.
*/
swap(folio, prev);
}
folio_put(folio);
if (folio)
folio_put(folio);
}
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
list_splice_tail(&list, &ds_queue->split_queue);
ds_queue->split_queue_len -= removed;
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
if (prev)
folio_put(prev);
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us.


@ -108,6 +108,51 @@ static inline void *folio_raw_mapping(const struct folio *folio)
return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
}
/*
* This is a file-backed mapping, and is about to be memory mapped - invoke its
* mmap hook and safely handle error conditions. On error, VMA hooks will be
* mutated.
*
* @file: File which backs the mapping.
* @vma: VMA which we are mapping.
*
* Returns: 0 if success, error otherwise.
*/
static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
{
int err = call_mmap(file, vma);
if (likely(!err))
return 0;
/*
* OK, we tried to call the file hook for mmap(), but an error
* arose. The mapping is in an inconsistent state and we most not invoke
* any further hooks on it.
*/
vma->vm_ops = &vma_dummy_vm_ops;
return err;
}
/*
* If the VMA has a close hook then close it, and since closing it might leave
* it in an inconsistent state which makes the use of any hooks suspect, clear
* them down by installing dummy empty hooks.
*/
static inline void vma_close(struct vm_area_struct *vma)
{
if (vma->vm_ops && vma->vm_ops->close) {
vma->vm_ops->close(vma);
/*
* The mapping is in an inconsistent state, and no further hooks
* may be invoked upon it.
*/
vma->vm_ops = &vma_dummy_vm_ops;
}
}
#ifdef CONFIG_MMU
/* Flags for folio_pte_batch(). */
@ -639,11 +684,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
#endif
}
void __folio_undo_large_rmappable(struct folio *folio);
static inline void folio_undo_large_rmappable(struct folio *folio)
bool __folio_unqueue_deferred_split(struct folio *folio);
static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
return;
return false;
/*
* At this point, there is no one trying to add the folio to
@ -651,9 +696,9 @@ static inline void folio_undo_large_rmappable(struct folio *folio)
* to check without acquiring the split_queue_lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
return;
return false;
__folio_undo_large_rmappable(folio);
return __folio_unqueue_deferred_split(folio);
}
static inline struct folio *page_rmappable_folio(struct page *page)


@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struct folio *folio,
css_get(&to->css);
css_put(&from->css);
/* Warning should never happen, so don't worry about refcount non-0 */
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to;
__folio_memcg_unlock(from);
@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct folio *folio;
bool tried_split_before = false;
retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
folio = target.folio;
/*
* Deferred split queue locking depends on memcg,
* and unqueue is unsafe unless folio refcount is 0:
* split or skip if on the queue? first try to split.
*/
if (!list_empty(&folio->_deferred_list)) {
spin_unlock(ptl);
if (!tried_split_before)
split_folio(folio);
folio_unlock(folio);
folio_put(folio);
if (tried_split_before)
return 0;
tried_split_before = true;
goto retry_pmd;
}
/*
* So long as that pmd lock is held, the folio cannot
* be racily added to the _deferred_list, because
* __folio_remove_rmap() will find !partially_mapped.
*/
if (folio_isolate_lru(folio)) {
if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {


@ -4629,10 +4629,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
!folio_test_hugetlb(folio) &&
!list_empty(&folio->_deferred_list) &&
folio_test_partially_mapped(folio), folio);
/*
* Nobody should be changing or seriously looking at
@ -4679,6 +4675,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
ug->nr_memory += nr_pages;
ug->pgpgout++;
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = 0;
}
@ -4790,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
/* Transfer the charge and the css ref */
commit_charge(new, memcg);
/* Warning should never happen, so don't worry about refcount non-0 */
WARN_ON_ONCE(folio_unqueue_deferred_split(old));
old->memcg_data = 0;
}
@ -4976,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))


@ -490,7 +490,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
folio_test_large_rmappable(folio)) {
if (!folio_ref_freeze(folio, expected_count))
return -EAGAIN;
folio_undo_large_rmappable(folio);
folio_unqueue_deferred_split(folio);
folio_ref_unfreeze(folio, expected_count);
}
@ -515,7 +515,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
/* Take off deferred split queue while frozen and memcg set */
folio_undo_large_rmappable(folio);
folio_unqueue_deferred_split(folio);
/*
* Now we know that no one else is looking at the folio:


@ -725,14 +725,17 @@ static int apply_mlockall_flags(int flags)
}
for_each_vma(vmi, vma) {
int error;
vm_flags_t newflags;
newflags = vma->vm_flags & ~VM_LOCKED_MASK;
newflags |= to_add;
/* Ignore errors */
mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
newflags);
error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
newflags);
/* Ignore errors, but prev needs fixing up. */
if (error)
prev = vma;
cond_resched();
}
out:

mm/mmap.c

@ -344,7 +344,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* Obtain the address to map to. we verify (or select) it and ensure
@ -1358,20 +1358,18 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}
unsigned long mmap_region(struct file *file, unsigned long addr,
static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
pgoff_t pglen = PHYS_PFN(len);
struct vm_area_struct *merge;
unsigned long charged = 0;
struct vma_munmap_struct vms;
struct ma_state mas_detach;
struct maple_tree mt_detach;
unsigned long end = addr + len;
bool writable_file_mapping = false;
int error;
VMA_ITERATOR(vmi, mm, addr);
VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
@ -1422,7 +1420,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/*
* clear PTEs while the vma is still in the tree so that rmap
* cannot race with the freeing later in the truncate scenario.
* This is also needed for call_mmap(), which is why vm_ops
* This is also needed for mmap_file(), which is why vm_ops
* close function is called.
*/
vms_clean_up_area(&vms, &mas_detach);
@ -1445,35 +1443,35 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags_init(vma, vm_flags);
vma->vm_page_prot = vm_get_page_prot(vm_flags);
if (vma_iter_prealloc(&vmi, vma)) {
error = -ENOMEM;
goto free_vma;
}
if (file) {
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
error = mmap_file(file, vma);
if (error)
goto unmap_and_free_vma;
if (vma_is_shared_maywrite(vma)) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto close_and_free_vma;
writable_file_mapping = true;
}
goto unmap_and_free_file_vma;
/* Drivers cannot alter the address of the VMA. */
WARN_ON_ONCE(addr != vma->vm_start);
/*
* Expansion is handled above, merging is handled below.
* Drivers should not alter the address of the VMA.
* Drivers should not permit writability when previously it was
* disallowed.
*/
if (WARN_ON((addr != vma->vm_start))) {
error = -EINVAL;
goto close_and_free_vma;
}
VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
!(vm_flags & VM_MAYWRITE) &&
(vma->vm_flags & VM_MAYWRITE));
vma_iter_config(&vmi, addr, end);
/*
* If vm_flags changed after call_mmap(), we should try merge
* If vm_flags changed after mmap_file(), we should try merge
* vma again as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
struct vm_area_struct *merge;
vmg.flags = vma->vm_flags;
/* If this fails, state is reset ready for a reattempt. */
merge = vma_merge_new_range(&vmg);
@ -1491,7 +1489,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma = merge;
/* Update vm_flags to pick up the change. */
vm_flags = vma->vm_flags;
goto unmap_writable;
goto file_expanded;
}
vma_iter_config(&vmi, addr, end);
}
@ -1500,26 +1498,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
goto free_iter_vma;
} else {
vma_set_anonymous(vma);
}
if (map_deny_write_exec(vma, vma->vm_flags)) {
error = -EACCES;
goto close_and_free_vma;
}
/* Allow architectures to sanity-check the vm_flags */
if (!arch_validate_flags(vma->vm_flags)) {
error = -EINVAL;
goto close_and_free_vma;
}
if (vma_iter_prealloc(&vmi, vma)) {
error = -ENOMEM;
goto close_and_free_vma;
}
#ifdef CONFIG_SPARC64
/* TODO: Fix SPARC ADI! */
WARN_ON_ONCE(!arch_validate_flags(vm_flags));
#endif
/* Lock the VMA since it is modified after insertion into VMA tree */
vma_start_write(vma);
@ -1533,10 +1520,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
*/
khugepaged_enter_vma(vma, vma->vm_flags);
/* Once vma denies write, undo our temporary denial count */
unmap_writable:
if (writable_file_mapping)
mapping_unmap_writable(file->f_mapping);
file_expanded:
file = vma->vm_file;
ksm_add_vma(vma);
expanded:
@ -1569,24 +1553,17 @@ expanded:
vma_set_page_prot(vma);
validate_mm(mm);
return addr;
close_and_free_vma:
if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
unmap_and_free_file_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
if (file || vma->vm_file) {
unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
vma_iter_set(&vmi, vma->vm_end);
/* Undo any partial mapping done by a device driver. */
unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
}
if (writable_file_mapping)
mapping_unmap_writable(file->f_mapping);
vma_iter_set(&vmi, vma->vm_end);
/* Undo any partial mapping done by a device driver. */
unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
free_iter_vma:
vma_iter_free(&vmi);
free_vma:
vm_area_free(vma);
unacct_error:
@ -1596,10 +1573,43 @@ unacct_error:
abort_munmap:
vms_abort_munmap_vmas(&vms, &mas_detach);
gather_failed:
validate_mm(mm);
return error;
}
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
unsigned long ret;
bool writable_file_mapping = false;
/* Check to see if MDWE is applicable. */
if (map_deny_write_exec(vm_flags, vm_flags))
return -EACCES;
/* Allow architectures to sanity-check the vm_flags. */
if (!arch_validate_flags(vm_flags))
return -EINVAL;
/* Map writable and ensure this isn't a sealed memfd. */
if (file && is_shared_maywrite(vm_flags)) {
int error = mapping_map_writable(file->f_mapping);
if (error)
return error;
writable_file_mapping = true;
}
ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
/* Clear our write mapping regardless of error. */
if (writable_file_mapping)
mapping_unmap_writable(file->f_mapping);
validate_mm(current->mm);
return ret;
}
static int __vm_munmap(unsigned long start, size_t len, bool unlock)
{
int ret;
@ -1934,7 +1944,7 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
remove_vma(vma, /* unreachable = */ true);
count++;
cond_resched();
vma = vma_next(&vmi);


@ -810,7 +810,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
break;
}
if (map_deny_write_exec(vma, newflags)) {
if (map_deny_write_exec(vma->vm_flags, newflags)) {
error = -EACCES;
break;
}


@ -589,8 +589,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
*/
static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
put_nommu_region(vma->vm_region);
@ -843,7 +842,7 @@ static unsigned long determine_vm_flags(struct file *file,
{
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
if (!file) {
/*
@ -885,7 +884,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
{
int ret;
ret = call_mmap(vma->vm_file, vma);
ret = mmap_file(vma->vm_file, vma);
if (ret == 0) {
vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0;
@ -918,7 +917,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
* happy.
*/
if (capabilities & NOMMU_MAP_DIRECT) {
ret = call_mmap(vma->vm_file, vma);
ret = mmap_file(vma->vm_file, vma);
/* shouldn't return success if we're not sharing */
if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
ret = -ENOSYS;


@ -635,6 +635,8 @@ compaction_capture(struct capture_control *capc, struct page *page,
static inline void account_freepages(struct zone *zone, int nr_pages,
int migratetype)
{
lockdep_assert_held(&zone->lock);
if (is_migrate_isolate(migratetype))
return;
@ -642,6 +644,9 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
else if (is_migrate_highatomic(migratetype))
WRITE_ONCE(zone->nr_free_highatomic,
zone->nr_free_highatomic + nr_pages);
}
/* Used for pages not on another list */
@ -961,9 +966,8 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
break;
case 2:
/* the second tail page: deferred_list overlaps ->mapping */
if (unlikely(!list_empty(&folio->_deferred_list) &&
folio_test_partially_mapped(folio))) {
bad_page(page, "partially mapped folio on deferred list");
if (unlikely(!list_empty(&folio->_deferred_list))) {
bad_page(page, "on deferred list");
goto out;
}
break;
@ -2682,7 +2686,6 @@ void free_unref_folios(struct folio_batch *folios)
unsigned long pfn = folio_pfn(folio);
unsigned int order = folio_order(folio);
folio_undo_large_rmappable(folio);
if (!free_pages_prepare(&folio->page, order))
continue;
/*
@ -3081,11 +3084,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
/*
* If the caller does not have rights to reserves below the min
* watermark then subtract the high-atomic reserves. This will
* over-estimate the size of the atomic reserve but it avoids a search.
* watermark then subtract the free pages reserved for highatomic.
*/
if (likely(!(alloc_flags & ALLOC_RESERVES)))
unusable_free += z->nr_reserved_highatomic;
unusable_free += READ_ONCE(z->nr_free_highatomic);
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */


@ -2733,9 +2733,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
if (ret)
return ret;
/* arm64 - allow memory tagging on RAM-based files */
vm_flags_set(vma, VM_MTE_ALLOWED);
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
if (inode->i_nlink)


@ -121,7 +121,7 @@ void __folio_put(struct folio *folio)
}
page_cache_release(folio);
folio_undo_large_rmappable(folio);
folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
free_unref_page(&folio->page, folio_order(folio));
}
@ -988,7 +988,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
free_huge_folio(folio);
continue;
}
folio_undo_large_rmappable(folio);
folio_unqueue_deferred_split(folio);
__page_cache_release(folio, &lruvec, &flags);
if (j != i)


@ -323,11 +323,10 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
/*
* Close a vm structure and free it.
*/
void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
might_sleep();
if (!closed && vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
@ -1115,9 +1114,7 @@ void vms_clean_up_area(struct vma_munmap_struct *vms,
vms_clear_ptes(vms, mas_detach, true);
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
vms->closed_vm_ops = true;
vma_close(vma);
}
/*
@ -1160,7 +1157,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
/* Remove and clean up vmas */
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
remove_vma(vma, /* = */ false, vms->closed_vm_ops);
remove_vma(vma, /* unreachable = */ false);
vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
@ -1684,8 +1681,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
if (new_vma->vm_ops && new_vma->vm_ops->close)
new_vma->vm_ops->close(new_vma);
vma_close(new_vma);
if (new_vma->vm_file)
fput(new_vma->vm_file);


@ -42,8 +42,7 @@ struct vma_munmap_struct {
int vma_count; /* Number of vmas that will be removed */
bool unlock; /* Unlock after the munmap */
bool clear_ptes; /* If there are outstanding PTE to be cleared */
bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */
/* 1 byte hole */
/* 2 byte hole */
unsigned long nr_pages; /* Number of pages being removed */
unsigned long locked_vm; /* Number of locked pages */
unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */
@ -198,7 +197,6 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
vms->unmap_start = FIRST_USER_ADDRESS;
vms->unmap_end = USER_PGTABLES_CEILING;
vms->clear_ptes = false;
vms->closed_vm_ops = false;
}
#endif
@ -269,7 +267,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);
void remove_vma(struct vm_area_struct *vma, bool unreachable);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);


@ -1476,7 +1476,7 @@ free_it:
*/
nr_reclaimed += nr_pages;
folio_undo_large_rmappable(folio);
folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
mem_cgroup_uncharge_folios(&free_folios);
try_to_unmap_flush();
@ -1864,7 +1864,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
folio_undo_large_rmappable(folio);
folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);


@ -44,13 +44,6 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
if (fd < 0)
ksft_exit_fail_perror("Error opening file\n");
/* Get the free huge pages before allocation */
free_hpage_b = get_free_hugepages();
if (free_hpage_b == 0) {
close(fd);
ksft_exit_skip("No free hugepage, exiting!\n");
}
/* Allocate a hugetlb page */
orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0);
if (orig_buffer == MAP_FAILED) {
@ -94,8 +87,20 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
int main(void)
{
size_t pagesize = 0;
int fd;
ksft_print_header();
/* Open the file to DIO */
fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
if (fd < 0)
ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno));
close(fd);
/* Check if huge pages are free */
if (!get_free_hugepages())
ksft_exit_skip("No free hugepage, exiting\n");
ksft_set_plan(4);
/* Get base page size */