diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a2461f9a8738..9b8ab121d948 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -445,6 +445,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. @@ -506,6 +507,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem page a caller should hold an rcu read lock to protect memcg * associated with a kmem page from being released. @@ -960,6 +962,23 @@ void unlock_page_memcg(struct page *page); void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); +/* try to stablize folio_memcg() for all the pages in a memcg */ +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + rcu_read_lock(); + + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) + return true; + + rcu_read_unlock(); + return false; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) @@ -1434,6 +1453,18 @@ static inline void folio_memcg_unlock(struct folio *folio) { } +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + /* to match folio_memcg_rcu() */ + rcu_read_lock(); + return true; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + static inline void mem_cgroup_handle_over_high(void) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a5ad9d050bf..7cc9ffc19e7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1490,6 +1490,11 @@ static inline unsigned long folio_pfn(struct folio *folio) return page_to_pfn(&folio->page); } +static inline struct folio *pfn_folio(unsigned long pfn) +{ + return page_folio(pfn_to_page(pfn)); +} + static inline atomic_t *folio_pincount_ptr(struct folio *folio) { return &folio_page(folio, 1)->compound_pincount; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e343420bfb1..9ef5aa37c60c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -375,6 +375,7 @@ enum lruvec_flags { #ifndef __GENERATING_BOUNDS_H struct lruvec; +struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) @@ -430,6 +431,7 @@ struct lru_gen_struct { }; void lru_gen_init_lruvec(struct lruvec *lruvec); +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG void lru_gen_init_memcg(struct mem_cgroup *memcg); @@ -442,6 +444,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ +} + #ifdef CONFIG_MEMCG static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { diff --git a/mm/internal.h b/mm/internal.h index 55ce10e4d0c0..cf134d58fd6d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -83,6 +83,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); +void folio_activate(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 937141d48221..4ea49113b0dd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2789,6 +2789,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() */ folio->memcg_data = (unsigned long)memcg; } diff --git a/mm/rmap.c b/mm/rmap.c index 131def40e4f0..2ff17b9aabd9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -825,6 +825,12 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { + if (lru_gen_enabled() && pte_young(*pvmw.pte) && + !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { + lru_gen_look_around(&pvmw); + referenced++; + } + if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { /* diff --git a/mm/swap.c b/mm/swap.c index f74fd51fa9e1..0a3871a70952 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -366,7 +366,7 @@ static void folio_activate_drain(int cpu) folio_batch_move_lru(fbatch, folio_activate_fn); } -static void folio_activate(struct folio *folio) +void folio_activate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_active(folio) && !folio_test_unevictable(folio)) { @@ -385,7 +385,7 @@ static inline void folio_activate_drain(int cpu) { } -static void folio_activate(struct folio *folio) +void folio_activate(struct folio *folio) { struct lruvec *lruvec; diff --git a/mm/vmscan.c b/mm/vmscan.c index 674d336dfe00..986916c15bec 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1635,6 +1635,11 @@ retry: if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; + /* folio_update_gen() tried to promote this page? */ + if (lru_gen_enabled() && !ignore_references && + folio_mapped(folio) && folio_test_referenced(folio)) + goto keep_locked; + /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -3219,6 +3224,29 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) * the aging ******************************************************************************/ +/* promote pages accessed through page tables */ +static int folio_update_gen(struct folio *folio, int gen) +{ + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + /* lru_gen_del_folio() has isolated this page? */ + if (!(old_flags & LRU_GEN_MASK)) { + /* for shrink_page_list() */ + new_flags = old_flags | BIT(PG_referenced); + continue; + } + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + /* protect pages accessed multiple times through file descriptors */ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { @@ -3230,6 +3258,11 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); do { + new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + /* folio_update_gen() has promoted this page? */ + if (new_gen >= 0 && new_gen != old_gen) + return new_gen; + new_gen = (old_gen + 1) % MAX_NR_GENS; new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); @@ -3244,6 +3277,43 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai return new_gen; } +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pte_pfn(pte); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pte_present(pte) || is_zero_pfn(pfn)) + return -1; + + if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} + +static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, + struct pglist_data *pgdat) +{ + struct folio *folio; + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return NULL; + + folio = pfn_folio(pfn); + if (folio_nid(folio) != pgdat->node_id) + return NULL; + + if (folio_memcg_rcu(folio) != memcg) + return NULL; + + return folio; +} + static void inc_min_seq(struct lruvec *lruvec, int type) { struct lru_gen_struct *lrugen = &lruvec->lrugen; @@ -3443,6 +3513,114 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); } +/* + * This function exploits spatial locality when shrink_page_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. + */ +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ + int i; + pte_t *pte; + unsigned long start; + unsigned long end; + unsigned long addr; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + struct folio *folio = pfn_folio(pvmw->pfn); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = folio_pgdat(folio); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + DEFINE_MAX_SEQ(lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); + + lockdep_assert_held(pvmw->ptl); + VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + + if (spin_is_contended(pvmw->ptl)) + return; + + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); + end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; + else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + start = end - MIN_LRU_BATCH * PAGE_SIZE; + else { + start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + } + } + + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + + rcu_read_lock(); + arch_enter_lazy_mmu_mode(); + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + + pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) + continue; + + folio = get_pfn_folio(pfn, memcg, pgdat); + if (!folio) + continue; + + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + old_gen = folio_lru_gen(folio); + if (old_gen < 0) + folio_set_referenced(folio); + else if (old_gen != new_gen) + __set_bit(i, bitmap); + } + + arch_leave_lazy_mmu_mode(); + rcu_read_unlock(); + + if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + folio = pfn_folio(pte_pfn(pte[i])); + folio_activate(folio); + } + return; + } + + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; + + spin_lock_irq(&lruvec->lru_lock); + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); + + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + folio = pfn_folio(pte_pfn(pte[i])); + if (folio_memcg_rcu(folio) != memcg) + continue; + + old_gen = folio_update_gen(folio, new_gen); + if (old_gen < 0 || old_gen == new_gen) + continue; + + lru_gen_update_size(lruvec, folio, old_gen, new_gen); + } + + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_unlock_pages(); +} + /****************************************************************************** * the eviction ******************************************************************************/ @@ -3479,6 +3657,12 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) return true; } + /* promoted */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { + list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + return true; + } + /* protected */ if (tier > tier_idx) { int hist = lru_hist_from_seq(lrugen->min_seq[type]);