linux/mm/page_io.c
Linus Torvalds e2c5923c34 Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe:
 "This is the main pull request for block storage for 4.15-rc1.

  Nothing out of the ordinary in here, and no API changes or anything
  like that. Just various new features for drivers, core changes, etc.
  In particular, this pull request contains:

   - A patch series from Bart, closing the whole on blk/scsi-mq queue
     quescing.

   - A series from Christoph, building towards hidden gendisks (for
     multipath) and ability to move bio chains around.

   - NVMe
        - Support for native multipath for NVMe (Christoph).
        - Userspace notifications for AENs (Keith).
        - Command side-effects support (Keith).
        - SGL support (Chaitanya Kulkarni)
        - FC fixes and improvements (James Smart)
        - Lots of fixes and tweaks (Various)

   - bcache
        - New maintainer (Michael Lyle)
        - Writeback control improvements (Michael)
        - Various fixes (Coly, Elena, Eric, Liang, et al)

   - lightnvm updates, mostly centered around the pblk interface
     (Javier, Hans, and Rakesh).

   - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph)

   - Writeback series that fix the much discussed hundreds of millions
     of sync-all units. This goes all the way, as discussed previously
     (me).

   - Fix for missing wakeup on writeback timer adjustments (Yafang
     Shao).

   - Fix laptop mode on blk-mq (me).

   - {mq,name} tupple lookup for IO schedulers, allowing us to have
     alias names. This means you can use 'deadline' on both !mq and on
     mq (where it's called mq-deadline). (me).

   - blktrace race fix, oopsing on sg load (me).

   - blk-mq optimizations (me).

   - Obscure waitqueue race fix for kyber (Omar).

   - NBD fixes (Josef).

   - Disable writeback throttling by default on bfq, like we do on cfq
     (Luca Miccio).

   - Series from Ming that enable us to treat flush requests on blk-mq
     like any other request. This is a really nice cleanup.

   - Series from Ming that improves merging on blk-mq with schedulers,
     getting us closer to flipping the switch on scsi-mq again.

   - BFQ updates (Paolo).

   - blk-mq atomic flags memory ordering fixes (Peter Z).

   - Loop cgroup support (Shaohua).

   - Lots of minor fixes from lots of different folks, both for core and
     driver code"

* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
  nvme: fix visibility of "uuid" ns attribute
  blk-mq: fixup some comment typos and lengths
  ide: ide-atapi: fix compile error with defining macro DEBUG
  blk-mq: improve tag waiting setup for non-shared tags
  brd: remove unused brd_mutex
  blk-mq: only run the hardware queue if IO is pending
  block: avoid null pointer dereference on null disk
  fs: guard_bio_eod() needs to consider partitions
  xtensa/simdisk: fix compile error
  nvme: expose subsys attribute to sysfs
  nvme: create 'slaves' and 'holders' entries for hidden controllers
  block: create 'slaves' and 'holders' entries for hidden gendisks
  nvme: also expose the namespace identification sysfs files for mpath nodes
  nvme: implement multipath access to nvme subsystems
  nvme: track shared namespaces
  nvme: introduce a nvme_ns_ids structure
  nvme: track subsystems
  block, nvme: Introduce blk_mq_req_flags_t
  block, scsi: Make SCSI quiesce and resume work reliably
  block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
  ...
2017-11-14 15:32:19 -08:00

434 lines
10 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* linux/mm/page_io.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95,
* Asynchronous swapping added 30.12.95. Stephen Tweedie
* Removed race in async swapping. 14.4.1996. Bruno Haible
* Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
* Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
*/
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
{
int i, nr = hpage_nr_pages(page);
struct bio *bio;
bio = bio_alloc(gfp_flags, nr);
if (bio) {
struct block_device *bdev;
bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_end_io = end_io;
for (i = 0; i < nr; i++)
bio_add_page(bio, page + i, PAGE_SIZE, 0);
VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
}
return bio;
}
void end_swap_bio_write(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
if (bio->bi_status) {
SetPageError(page);
/*
* We failed to write the page out to swap-space.
* Re-dirty the page in order to avoid it being reclaimed.
* Also print a dire warning that things will go BAD (tm)
* very quickly.
*
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
*/
set_page_dirty(page);
pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
bio_put(bio);
}
static void swap_slot_free_notify(struct page *page)
{
struct swap_info_struct *sis;
struct gendisk *disk;
/*
* There is no guarantee that the page is in swap cache - the software
* suspend code (at least) uses end_swap_bio_read() against a non-
* swapcache page. So we must check PG_swapcache before proceeding with
* this optimization.
*/
if (unlikely(!PageSwapCache(page)))
return;
sis = page_swap_info(page);
if (!(sis->flags & SWP_BLKDEV))
return;
/*
* The swap subsystem performs lazy swap slot freeing,
* expecting that the page will be swapped out again.
* So we can avoid an unnecessary write if the page
* isn't redirtied.
* This is good for real swap storage because we can
* reduce unnecessary I/O and enhance wear-leveling
* if an SSD is used as the as swap device.
* But if in-memory swap device (eg zram) is used,
* this causes a duplicated copy between uncompressed
* data in VM-owned memory and compressed data in
* zram-owned memory. So let's free zram-owned memory
* and make the VM-owned decompressed page *dirty*,
* so the page should be swapped out somewhere again if
* we again wish to reclaim it.
*/
disk = sis->bdev->bd_disk;
if (disk->fops->swap_slot_free_notify) {
swp_entry_t entry;
unsigned long offset;
entry.val = page_private(page);
offset = swp_offset(entry);
SetPageDirty(page);
disk->fops->swap_slot_free_notify(sis->bdev,
offset);
}
}
static void end_swap_bio_read(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
struct task_struct *waiter = bio->bi_private;
if (bio->bi_status) {
SetPageError(page);
ClearPageUptodate(page);
pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
goto out;
}
SetPageUptodate(page);
swap_slot_free_notify(page);
out:
unlock_page(page);
WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
wake_up_process(waiter);
put_task_struct(waiter);
}
int generic_swapfile_activate(struct swap_info_struct *sis,
struct file *swap_file,
sector_t *span)
{
struct address_space *mapping = swap_file->f_mapping;
struct inode *inode = mapping->host;
unsigned blocks_per_page;
unsigned long page_no;
unsigned blkbits;
sector_t probe_block;
sector_t last_block;
sector_t lowest_block = -1;
sector_t highest_block = 0;
int nr_extents = 0;
int ret;
blkbits = inode->i_blkbits;
blocks_per_page = PAGE_SIZE >> blkbits;
/*
* Map all the blocks into the extent list. This code doesn't try
* to be very smart.
*/
probe_block = 0;
page_no = 0;
last_block = i_size_read(inode) >> blkbits;
while ((probe_block + blocks_per_page) <= last_block &&
page_no < sis->max) {
unsigned block_in_page;
sector_t first_block;
cond_resched();
first_block = bmap(inode, probe_block);
if (first_block == 0)
goto bad_bmap;
/*
* It must be PAGE_SIZE aligned on-disk
*/
if (first_block & (blocks_per_page - 1)) {
probe_block++;
goto reprobe;
}
for (block_in_page = 1; block_in_page < blocks_per_page;
block_in_page++) {
sector_t block;
block = bmap(inode, probe_block + block_in_page);
if (block == 0)
goto bad_bmap;
if (block != first_block + block_in_page) {
/* Discontiguity */
probe_block++;
goto reprobe;
}
}
first_block >>= (PAGE_SHIFT - blkbits);
if (page_no) { /* exclude the header page */
if (first_block < lowest_block)
lowest_block = first_block;
if (first_block > highest_block)
highest_block = first_block;
}
/*
* We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
*/
ret = add_swap_extent(sis, page_no, 1, first_block);
if (ret < 0)
goto out;
nr_extents += ret;
page_no++;
probe_block += blocks_per_page;
reprobe:
continue;
}
ret = nr_extents;
*span = 1 + highest_block - lowest_block;
if (page_no == 0)
page_no = 1; /* force Empty message */
sis->max = page_no;
sis->pages = page_no - 1;
sis->highest_bit = page_no - 1;
out:
return ret;
bad_bmap:
pr_err("swapon: swapfile has holes\n");
ret = -EINVAL;
goto out;
}
/*
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
int ret = 0;
if (try_to_free_swap(page)) {
unlock_page(page);
goto out;
}
if (frontswap_store(page) == 0) {
set_page_writeback(page);
unlock_page(page);
end_page_writeback(page);
goto out;
}
ret = __swap_writepage(page, wbc, end_swap_bio_write);
out:
return ret;
}
static sector_t swap_page_sector(struct page *page)
{
return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
}
static inline void count_swpout_vm_event(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (unlikely(PageTransHuge(page)))
count_vm_event(THP_SWPOUT);
#endif
count_vm_events(PSWPOUT, hpage_nr_pages(page));
}
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
struct bio *bio;
int ret;
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
if (sis->flags & SWP_FILE) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
struct bio_vec bv = {
.bv_page = page,
.bv_len = PAGE_SIZE,
.bv_offset = 0
};
struct iov_iter from;
iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_pos = page_file_offset(page);
set_page_writeback(page);
unlock_page(page);
ret = mapping->a_ops->direct_IO(&kiocb, &from);
if (ret == PAGE_SIZE) {
count_vm_event(PSWPOUT);
ret = 0;
} else {
/*
* In the case of swap-over-nfs, this can be a
* temporary failure if the system has limited
* memory for allocating transmit buffers.
* Mark the page dirty and avoid
* rotate_reclaimable_page but rate-limit the
* messages but do not flag PageError like
* the normal direct-to-bio case as it could
* be temporary.
*/
set_page_dirty(page);
ClearPageReclaim(page);
pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
page_file_offset(page));
}
end_page_writeback(page);
return ret;
}
ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
if (!ret) {
count_swpout_vm_event(page);
return 0;
}
ret = 0;
bio = get_swap_bio(GFP_NOIO, page, end_write_func);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
ret = -ENOMEM;
goto out;
}
bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
out:
return ret;
}
int swap_readpage(struct page *page, bool do_poll)
{
struct bio *bio;
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
blk_qc_t qc;
struct gendisk *disk;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageUptodate(page), page);
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
unlock_page(page);
goto out;
}
if (sis->flags & SWP_FILE) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
ret = mapping->a_ops->readpage(swap_file, page);
if (!ret)
count_vm_event(PSWPIN);
return ret;
}
ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
if (!ret) {
if (trylock_page(page)) {
swap_slot_free_notify(page);
unlock_page(page);
}
count_vm_event(PSWPIN);
return 0;
}
ret = 0;
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
ret = -ENOMEM;
goto out;
}
disk = bio->bi_disk;
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
*/
get_task_struct(current);
bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
count_vm_event(PSWPIN);
bio_get(bio);
qc = submit_bio(bio);
while (do_poll) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(bio->bi_private))
break;
if (!blk_poll(disk->queue, qc))
break;
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
out:
return ret;
}
int swap_set_page_dirty(struct page *page)
{
struct swap_info_struct *sis = page_swap_info(page);
if (sis->flags & SWP_FILE) {
struct address_space *mapping = sis->swap_file->f_mapping;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return mapping->a_ops->set_page_dirty(page);
} else {
return __set_page_dirty_no_writeback(page);
}
}