diff --git a/mm/swapfile.c b/mm/swapfile.c index b98df62c6c43..f37029997dbf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,6 +53,8 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -301,6 +303,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } +static inline unsigned int cluster_offset(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + return cluster_index(si, ci) * SWAPFILE_CLUSTER; +} + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { @@ -372,11 +380,15 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); + if (ci->flags & CLUSTER_FLAG_NONFULL) list_move_tail(&ci->list, &si->free_clusters); else list_add_tail(&ci->list, &si->free_clusters); ci->flags = CLUSTER_FLAG_FREE; + ci->order = 0; } /* @@ -431,9 +443,11 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi struct swap_cluster_info *ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); VM_BUG_ON(cluster_index(si, ci) != idx); + VM_BUG_ON(ci->count); list_del(&ci->list); - ci->count = 0; ci->flags = 0; return ci; } @@ -441,6 +455,8 @@ static struct swap_cluster_info *alloc_cluster(struct swap_info_struct *si, unsi static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(ci->count != 0); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed @@ -497,6 +513,9 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste return; VM_BUG_ON(ci->count == 0); + VM_BUG_ON(cluster_is_free(ci)); + lockdep_assert_held(&p->lock); + lockdep_assert_held(&ci->lock); ci->count--; if (!ci->count) { @@ -505,121 +524,155 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste } if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); - ci->flags |= CLUSTER_FLAG_NONFULL; + ci->flags = CLUSTER_FLAG_NONFULL; } } -/* - * It's possible scan_swap_map_slots() uses a free cluster in the middle of free - * cluster list. Avoiding such abuse to avoid list corruption. - */ -static bool -scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, - unsigned long offset, int order) +static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, + unsigned int nr_pages) { - struct percpu_cluster *percpu_cluster; - bool conflict; - struct swap_cluster_info *first = list_first_entry(&si->free_clusters, - struct swap_cluster_info, list); + unsigned char *p = si->swap_map + start; + unsigned char *end = p + nr_pages; - offset /= SWAPFILE_CLUSTER; - conflict = !list_empty(&si->free_clusters) && - offset != cluster_index(si, first) && - cluster_is_free(&si->cluster_info[offset]); - - if (!conflict) - return false; - - percpu_cluster = this_cpu_ptr(si->percpu_cluster); - percpu_cluster->next[order] = SWAP_NEXT_INVALID; - return true; -} - -static inline bool swap_range_empty(char *swap_map, unsigned int start, - unsigned int nr_pages) -{ - unsigned int i; - - for (i = 0; i < nr_pages; i++) { - if (swap_map[start + i]) + while (p < end) + if (*p++) return false; - } return true; } + +static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, + unsigned int start, unsigned char usage, + unsigned int order) +{ + unsigned int nr_pages = 1 << order; + + if (cluster_is_free(ci)) { + if (nr_pages < SWAPFILE_CLUSTER) { + list_move_tail(&ci->list, &si->nonfull_clusters[order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + ci->order = order; + } + + memset(si->swap_map + start, usage, nr_pages); + swap_range_alloc(si, start, nr_pages); + ci->count += nr_pages; + + if (ci->count == SWAPFILE_CLUSTER) { + VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL))); + list_del(&ci->list); + ci->flags = 0; + } +} + +static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, + unsigned int *foundp, unsigned int order, + unsigned char usage) +{ + unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); + unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); + unsigned int nr_pages = 1 << order; + struct swap_cluster_info *ci; + + if (end < nr_pages) + return SWAP_NEXT_INVALID; + end -= nr_pages; + + ci = lock_cluster(si, offset); + if (ci->count + nr_pages > SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; + goto done; + } + + while (offset <= end) { + if (cluster_scan_range(si, offset, nr_pages)) { + cluster_alloc_range(si, ci, offset, usage, order); + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; + goto done; + } + offset += nr_pages; + break; + } + offset += nr_pages; + } + if (offset > end) + offset = SWAP_NEXT_INVALID; +done: + unlock_cluster(ci); + return offset; +} + /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU * too. */ -static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, - unsigned long *offset, unsigned long *scan_base, int order) +static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, + unsigned char usage) { - unsigned int nr_pages = 1 << order; struct percpu_cluster *cluster; - struct swap_cluster_info *ci; - unsigned int tmp, max; + struct swap_cluster_info *ci, *n; + unsigned int offset, found = 0; new_cluster: + lockdep_assert_held(&si->lock); cluster = this_cpu_ptr(si->percpu_cluster); - tmp = cluster->next[order]; - if (tmp == SWAP_NEXT_INVALID) { - if (!list_empty(&si->free_clusters)) { - ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); - list_del(&ci->list); - spin_lock(&ci->lock); - ci->order = order; - ci->flags = 0; - spin_unlock(&ci->lock); - tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; - } else if (!list_empty(&si->nonfull_clusters[order])) { - ci = list_first_entry(&si->nonfull_clusters[order], - struct swap_cluster_info, list); - list_del(&ci->list); - spin_lock(&ci->lock); - ci->flags = 0; - spin_unlock(&ci->lock); - tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER; - } else if (!list_empty(&si->discard_clusters)) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock - */ - swap_do_scheduled_discard(si); - *scan_base = this_cpu_read(*si->cluster_next_cpu); - *offset = *scan_base; - goto new_cluster; - } else - return false; + offset = cluster->next[order]; + if (offset) { + offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); + if (found) + goto done; } - /* - * Other CPUs can use our cluster if they can't find a free cluster, - * check if there is still free entry in the cluster, maintaining - * natural alignment. - */ - max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); - if (tmp < max) { - ci = lock_cluster(si, tmp); - while (tmp < max) { - if (swap_range_empty(si->swap_map, tmp, nr_pages)) - break; - tmp += nr_pages; - } - unlock_cluster(ci); + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); + VM_BUG_ON(!found); + goto done; } - if (tmp >= max) { - cluster->next[order] = SWAP_NEXT_INVALID; + + if (order < PMD_ORDER) { + list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + if (found) + goto done; + } + } + + if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock + */ + swap_do_scheduled_discard(si); goto new_cluster; } - *offset = tmp; - *scan_base = tmp; - tmp += nr_pages; - cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; - return true; + + if (order) + goto done; + + for (int o = 1; o < SWAP_NR_ORDERS; o++) { + if (!list_empty(&si->nonfull_clusters[o])) { + ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, + list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); + VM_BUG_ON(!found); + goto done; + } + } + +done: + cluster->next[order] = offset; + return found; } static void __del_from_avail_list(struct swap_info_struct *p) @@ -746,11 +799,29 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, return false; } +static int cluster_alloc_swap(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) +{ + int n_ret = 0; + + VM_BUG_ON(!si->cluster_info); + + while (n_ret < nr) { + unsigned long offset = cluster_alloc_swap_entry(si, order, usage); + + if (!offset) + break; + slots[n_ret++] = swp_entry(si->type, offset); + } + + return n_ret; +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { - struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; @@ -789,26 +860,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return 0; } + if (si->cluster_info) + return cluster_alloc_swap(si, usage, nr, slots, order); + si->flags += SWP_SCANNING; - /* - * Use percpu scan base for SSD to reduce lock contention on - * cluster and swap cache. For HDD, sequential access is more - * important. - */ - if (si->flags & SWP_SOLIDSTATE) - scan_base = this_cpu_read(*si->cluster_next_cpu); - else - scan_base = si->cluster_next; + + /* For HDD, sequential access is more important. */ + scan_base = si->cluster_next; offset = scan_base; - /* SSD algorithm */ - if (si->cluster_info) { - if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { - if (order > 0) - goto no_page; - goto scan; - } - } else if (unlikely(!si->cluster_nr--)) { + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; @@ -819,8 +880,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. - * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info - * case, just handled by scan_swap_map_try_ssd_cluster() above. */ scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; @@ -848,19 +907,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } checks: - if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { - /* take a break if we already got some slots */ - if (n_ret) - goto done; - if (!scan_swap_map_try_ssd_cluster(si, &offset, - &scan_base, order)) { - if (order > 0) - goto no_page; - goto scan; - } - } - } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -868,11 +914,9 @@ checks: if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; - ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; - unlock_cluster(ci); spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); @@ -883,15 +927,12 @@ checks: } if (si->swap_map[offset]) { - unlock_cluster(ci); if (!n_ret) goto scan; else goto done; } memset(si->swap_map + offset, usage, nr_pages); - add_cluster_info_page(si, si->cluster_info, offset, nr_pages); - unlock_cluster(ci); swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); @@ -912,13 +953,7 @@ checks: latency_ration = LATENCY_LIMIT; } - /* try to get more slots in cluster */ - if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) - goto checks; - if (order > 0) - goto done; - } else if (si->cluster_nr && !si->swap_map[++offset]) { + if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; @@ -987,8 +1022,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) ci = lock_cluster(si, offset); memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); ci->count = 0; - ci->order = 0; - ci->flags = 0; free_cluster(si, ci); unlock_cluster(ci); swap_range_free(si, offset, SWAPFILE_CLUSTER); @@ -2997,8 +3030,11 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, ci = cluster_info + idx; if (idx >= nr_clusters) continue; - if (ci->count) + if (ci->count) { + ci->flags = CLUSTER_FLAG_NONFULL; + list_add_tail(&ci->list, &p->nonfull_clusters[0]); continue; + } ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &p->free_clusters); }