mm: swap: separate SSD allocation from scan_swap_map_slots()

Previously, the SSD and HDD cases shared the same swap_map scan loop in
scan_swap_map_slots().  This function is complex, and its execution flow is
hard to follow.

scan_swap_map_try_ssd_cluster() already does most of the heavy lifting of
locating a candidate swap range in the cluster.  However, it still has to
return to scan_swap_map_slots() to check for conflicts and then perform the
actual allocation.

When scan_swap_map_try_ssd_cluster() fails, it also falls back on
scan_swap_map_slots() to brute-force scan the swap_map.  When the swapfile
is large and almost full, walking the swap_map array that way costs a
noticeable amount of CPU time.

Get rid of the cluster allocation's dependency on the swap_map scan loop in
scan_swap_map_slots() and streamline the cluster allocation code path.  No
more conflict checks are needed.
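
With the scan loop gone, the cluster allocation order is roughly the
following, condensed (not verbatim) from cluster_alloc_swap_entry() added
below; locking asserts, the surrounding labels and the order-0 fallback are
trimmed:

  offset = cluster->next[order];          /* 1. this CPU's current cluster */
  if (offset)
          offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
  if (!found && !list_empty(&si->free_clusters)) {
          /* 2. take a whole free cluster */
          ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
          offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
  }
  if (!found && order < PMD_ORDER) {
          /* 3. walk the nonfull clusters of this order */
          list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
                  offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
                                                   &found, order, usage);
                  if (found)
                          break;
          }
  }
  if (!found && !list_empty(&si->discard_clusters)) {
          /* 4. reclaim discarded clusters, then retry from the top */
          swap_do_scheduled_discard(si);
          goto new_cluster;
  }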

For order 0 swap entries, when the free and nonfull lists run out,
allocation falls back to the higher-order nonfull cluster lists.
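
That order-0 fallback is the tail of cluster_alloc_swap_entry() in the diff
below:

  if (order)
          goto done;

  for (int o = 1; o < SWAP_NR_ORDERS; o++) {
          if (!list_empty(&si->nonfull_clusters[o])) {
                  ci = list_first_entry(&si->nonfull_clusters[o],
                                        struct swap_cluster_info, list);
                  offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
                                                   &found, 0, usage);
                  VM_BUG_ON(!found);
                  goto done;
          }
  }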

Users should see less CPU time spent searching for free swap slots when the
swapfile is almost full.

[ryncsn@gmail.com: fix array-bounds error with CONFIG_THP_SWAP=n]
  Link: https://lkml.kernel.org/r/CAMgjq7Bz0DY+rY0XgCoH7-Q=uHLdo3omi8kUr4ePDweNyofsbQ@mail.gmail.com
Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-3-cb9c148b9297@kernel.org
Signed-off-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,8 @@
 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                  unsigned char);
 static void free_swap_count_continuations(struct swap_info_struct *);
+static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+                             unsigned int nr_entries);
 
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -301,6 +303,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si,
         return ci - si->cluster_info;
 }
 
+static inline unsigned int cluster_offset(struct swap_info_struct *si,
+                                          struct swap_cluster_info *ci)
+{
+        return cluster_index(si, ci) * SWAPFILE_CLUSTER;
+}
+
 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
                                                      unsigned long offset)
 {
@@ -372,11 +380,15 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 
 static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
+        lockdep_assert_held(&si->lock);
+        lockdep_assert_held(&ci->lock);
+
         if (ci->flags & CLUSTER_FLAG_NONFULL)
                 list_move_tail(&ci->list, &si->free_clusters);
         else
                 list_add_tail(&ci->list, &si->free_clusters);
         ci->flags = CLUSTER_FLAG_FREE;
+        ci->order = 0;
 }
 
 /*
@@ -431,9 +443,11 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi
         struct swap_cluster_info *ci = list_first_entry(&si->free_clusters,
                                                         struct swap_cluster_info, list);
 
+        lockdep_assert_held(&si->lock);
+        lockdep_assert_held(&ci->lock);
         VM_BUG_ON(cluster_index(si, ci) != idx);
+        VM_BUG_ON(ci->count);
         list_del(&ci->list);
-        ci->count = 0;
         ci->flags = 0;
         return ci;
 }
@@ -441,6 +455,8 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi
 static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
         VM_BUG_ON(ci->count != 0);
+        lockdep_assert_held(&si->lock);
+        lockdep_assert_held(&ci->lock);
         /*
          * If the swap is discardable, prepare discard the cluster
          * instead of free it immediately. The cluster will be freed
@@ -497,6 +513,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
                 return;
 
         VM_BUG_ON(ci->count == 0);
+        VM_BUG_ON(cluster_is_free(ci));
+        lockdep_assert_held(&p->lock);
+        lockdep_assert_held(&ci->lock);
         ci->count--;
 
         if (!ci->count) {
@@ -505,121 +524,155 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
         }
 
         if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
+                VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
                 list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
-                ci->flags |= CLUSTER_FLAG_NONFULL;
+                ci->flags = CLUSTER_FLAG_NONFULL;
         }
 }
 
-/*
- * It's possible scan_swap_map_slots() uses a free cluster in the middle of free
- * cluster list. Avoiding such abuse to avoid list corruption.
- */
-static bool
-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
-        unsigned long offset, int order)
+static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start,
+                                      unsigned int nr_pages)
 {
-        struct percpu_cluster *percpu_cluster;
-        bool conflict;
-        struct swap_cluster_info *first = list_first_entry(&si->free_clusters,
-                                                           struct swap_cluster_info, list);
+        unsigned char *p = si->swap_map + start;
+        unsigned char *end = p + nr_pages;
 
-        offset /= SWAPFILE_CLUSTER;
-        conflict = !list_empty(&si->free_clusters) &&
-                offset != cluster_index(si, first) &&
-                cluster_is_free(&si->cluster_info[offset]);
+        while (p < end)
+                if (*p++)
+                        return false;
 
-        if (!conflict)
-                return false;
-
-        percpu_cluster = this_cpu_ptr(si->percpu_cluster);
-        percpu_cluster->next[order] = SWAP_NEXT_INVALID;
         return true;
 }
 
-static inline bool swap_range_empty(char *swap_map, unsigned int start,
-                                    unsigned int nr_pages)
+static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
+                                       unsigned int start, unsigned char usage,
+                                       unsigned int order)
 {
-        unsigned int i;
+        unsigned int nr_pages = 1 << order;
 
-        for (i = 0; i < nr_pages; i++) {
-                if (swap_map[start + i])
-                        return false;
+        if (cluster_is_free(ci)) {
+                if (nr_pages < SWAPFILE_CLUSTER) {
+                        list_move_tail(&ci->list, &si->nonfull_clusters[order]);
+                        ci->flags = CLUSTER_FLAG_NONFULL;
+                }
+                ci->order = order;
+        }
+
+        memset(si->swap_map + start, usage, nr_pages);
+        swap_range_alloc(si, start, nr_pages);
+        ci->count += nr_pages;
+
+        if (ci->count == SWAPFILE_CLUSTER) {
+                VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL)));
+                list_del(&ci->list);
+                ci->flags = 0;
         }
+}
 
-        return true;
+static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
+                                            unsigned int *foundp, unsigned int order,
+                                            unsigned char usage)
+{
+        unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1);
+        unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
+        unsigned int nr_pages = 1 << order;
+        struct swap_cluster_info *ci;
+
+        if (end < nr_pages)
+                return SWAP_NEXT_INVALID;
+        end -= nr_pages;
+
+        ci = lock_cluster(si, offset);
+        if (ci->count + nr_pages > SWAPFILE_CLUSTER) {
+                offset = SWAP_NEXT_INVALID;
+                goto done;
+        }
+
+        while (offset <= end) {
+                if (cluster_scan_range(si, offset, nr_pages)) {
+                        cluster_alloc_range(si, ci, offset, usage, order);
+                        *foundp = offset;
+                        if (ci->count == SWAPFILE_CLUSTER) {
+                                offset = SWAP_NEXT_INVALID;
+                                goto done;
+                        }
+                        offset += nr_pages;
+                        break;
+                }
+                offset += nr_pages;
+        }
+        if (offset > end)
+                offset = SWAP_NEXT_INVALID;
+done:
+        unlock_cluster(ci);
+        return offset;
 }
 
 /*
  * Try to get swap entries with specified order from current cpu's swap entry
  * pool (a cluster). This might involve allocating a new cluster for current CPU
  * too.
  */
-static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
-        unsigned long *offset, unsigned long *scan_base, int order)
+static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
+                                              unsigned char usage)
 {
-        unsigned int nr_pages = 1 << order;
         struct percpu_cluster *cluster;
-        struct swap_cluster_info *ci;
-        unsigned int tmp, max;
+        struct swap_cluster_info *ci, *n;
+        unsigned int offset, found = 0;
 
 new_cluster:
+        lockdep_assert_held(&si->lock);
         cluster = this_cpu_ptr(si->percpu_cluster);
-        tmp = cluster->next[order];
-        if (tmp == SWAP_NEXT_INVALID) {
-                if (!list_empty(&si->free_clusters)) {
-                        ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
-                        list_del(&ci->list);
-                        spin_lock(&ci->lock);
-                        ci->order = order;
-                        ci->flags = 0;
-                        spin_unlock(&ci->lock);
-                        tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
-                } else if (!list_empty(&si->nonfull_clusters[order])) {
-                        ci = list_first_entry(&si->nonfull_clusters[order],
-                                              struct swap_cluster_info, list);
-                        list_del(&ci->list);
-                        spin_lock(&ci->lock);
-                        ci->flags = 0;
-                        spin_unlock(&ci->lock);
-                        tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
-                } else if (!list_empty(&si->discard_clusters)) {
-                        /*
-                         * we don't have free cluster but have some clusters in
-                         * discarding, do discard now and reclaim them, then
-                         * reread cluster_next_cpu since we dropped si->lock
-                         */
-                        swap_do_scheduled_discard(si);
-                        *scan_base = this_cpu_read(*si->cluster_next_cpu);
-                        *offset = *scan_base;
-                        goto new_cluster;
-                } else
-                        return false;
+        offset = cluster->next[order];
+        if (offset) {
+                offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
+                if (found)
+                        goto done;
         }
 
-        /*
-         * Other CPUs can use our cluster if they can't find a free cluster,
-         * check if there is still free entry in the cluster, maintaining
-         * natural alignment.
-         */
-        max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
-        if (tmp < max) {
-                ci = lock_cluster(si, tmp);
-                while (tmp < max) {
-                        if (swap_range_empty(si->swap_map, tmp, nr_pages))
-                                break;
-                        tmp += nr_pages;
-                }
-                unlock_cluster(ci);
+        if (!list_empty(&si->free_clusters)) {
+                ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
+                offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
+                VM_BUG_ON(!found);
+                goto done;
         }
-        if (tmp >= max) {
-                cluster->next[order] = SWAP_NEXT_INVALID;
+
+        if (order < PMD_ORDER) {
+                list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
+                        offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+                                                         &found, order, usage);
+                        if (found)
+                                goto done;
+                }
+        }
+
+        if (!list_empty(&si->discard_clusters)) {
+                /*
+                 * we don't have free cluster but have some clusters in
+                 * discarding, do discard now and reclaim them, then
+                 * reread cluster_next_cpu since we dropped si->lock
+                 */
+                swap_do_scheduled_discard(si);
                 goto new_cluster;
         }
-        *offset = tmp;
-        *scan_base = tmp;
-        tmp += nr_pages;
-        cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
-        return true;
+
+        if (order)
+                goto done;
+
+        for (int o = 1; o < SWAP_NR_ORDERS; o++) {
+                if (!list_empty(&si->nonfull_clusters[o])) {
+                        ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info,
+                                              list);
+                        offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+                                                         &found, 0, usage);
+                        VM_BUG_ON(!found);
+                        goto done;
+                }
+        }
+
+done:
+        cluster->next[order] = offset;
+        return found;
 }
 
 static void __del_from_avail_list(struct swap_info_struct *p)
@@ -746,11 +799,29 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si,
         return false;
 }
 
+static int cluster_alloc_swap(struct swap_info_struct *si,
+                             unsigned char usage, int nr,
+                             swp_entry_t slots[], int order)
+{
+        int n_ret = 0;
+
+        VM_BUG_ON(!si->cluster_info);
+
+        while (n_ret < nr) {
+                unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
+
+                if (!offset)
+                        break;
+                slots[n_ret++] = swp_entry(si->type, offset);
+        }
+
+        return n_ret;
+}
+
 static int scan_swap_map_slots(struct swap_info_struct *si,
                                unsigned char usage, int nr,
                                swp_entry_t slots[], int order)
 {
-        struct swap_cluster_info *ci;
         unsigned long offset;
         unsigned long scan_base;
         unsigned long last_in_cluster = 0;
@@ -789,26 +860,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
                 return 0;
         }
 
+        if (si->cluster_info)
+                return cluster_alloc_swap(si, usage, nr, slots, order);
+
         si->flags += SWP_SCANNING;
-        /*
-         * Use percpu scan base for SSD to reduce lock contention on
-         * cluster and swap cache. For HDD, sequential access is more
-         * important.
-         */
-        if (si->flags & SWP_SOLIDSTATE)
-                scan_base = this_cpu_read(*si->cluster_next_cpu);
-        else
-                scan_base = si->cluster_next;
+
+        /* For HDD, sequential access is more important. */
+        scan_base = si->cluster_next;
         offset = scan_base;
 
-        /* SSD algorithm */
-        if (si->cluster_info) {
-                if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) {
-                        if (order > 0)
-                                goto no_page;
-                        goto scan;
-                }
-        } else if (unlikely(!si->cluster_nr--)) {
+        if (unlikely(!si->cluster_nr--)) {
                 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                         si->cluster_nr = SWAPFILE_CLUSTER - 1;
                         goto checks;
@@ -819,8 +880,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
                 /*
                  * If seek is expensive, start searching for new cluster from
                  * start of partition, to minimize the span of allocated swap.
-                 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
-                 * case, just handled by scan_swap_map_try_ssd_cluster() above.
                  */
                 scan_base = offset = si->lowest_bit;
                 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
@@ -848,19 +907,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
         }
 
 checks:
-        if (si->cluster_info) {
-                while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) {
-                /* take a break if we already got some slots */
-                        if (n_ret)
-                                goto done;
-                        if (!scan_swap_map_try_ssd_cluster(si, &offset,
-                                                        &scan_base, order)) {
-                                if (order > 0)
-                                        goto no_page;
-                                goto scan;
-                        }
-                }
-        }
         if (!(si->flags & SWP_WRITEOK))
                 goto no_page;
         if (!si->highest_bit)
@@ -868,11 +914,9 @@ checks:
         if (offset > si->highest_bit)
                 scan_base = offset = si->lowest_bit;
 
-        ci = lock_cluster(si, offset);
         /* reuse swap entry of cache-only swap if not busy. */
         if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
                 int swap_was_freed;
-                unlock_cluster(ci);
                 spin_unlock(&si->lock);
                 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
                 spin_lock(&si->lock);
@@ -883,15 +927,12 @@ checks:
         }
 
         if (si->swap_map[offset]) {
-                unlock_cluster(ci);
                 if (!n_ret)
                         goto scan;
                 else
                         goto done;
         }
         memset(si->swap_map + offset, usage, nr_pages);
-        add_cluster_info_page(si, si->cluster_info, offset, nr_pages);
-        unlock_cluster(ci);
 
         swap_range_alloc(si, offset, nr_pages);
         slots[n_ret++] = swp_entry(si->type, offset);
@@ -912,13 +953,7 @@ checks:
                         latency_ration = LATENCY_LIMIT;
                 }
 
-        /* try to get more slots in cluster */
-        if (si->cluster_info) {
-                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order))
-                        goto checks;
-                if (order > 0)
-                        goto done;
-        } else if (si->cluster_nr && !si->swap_map[++offset]) {
+        if (si->cluster_nr && !si->swap_map[++offset]) {
                 /* non-ssd case, still more slots in cluster? */
                 --si->cluster_nr;
                 goto checks;
@@ -987,8 +1022,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
         ci = lock_cluster(si, offset);
         memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
         ci->count = 0;
-        ci->order = 0;
-        ci->flags = 0;
         free_cluster(si, ci);
         unlock_cluster(ci);
         swap_range_free(si, offset, SWAPFILE_CLUSTER);
@@ -2997,8 +3030,11 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
                 ci = cluster_info + idx;
                 if (idx >= nr_clusters)
                         continue;
-                if (ci->count)
+                if (ci->count) {
+                        ci->flags = CLUSTER_FLAG_NONFULL;
+                        list_add_tail(&ci->list, &p->nonfull_clusters[0]);
                         continue;
+                }
                 ci->flags = CLUSTER_FLAG_FREE;
                 list_add_tail(&ci->list, &p->free_clusters);
         }