Our users reported random latency spikes while their RT process is
running.  We eventually found that the spikes are caused by
FADV_DONTNEED, which may call lru_add_drain_all() to drain the LRU cache
on remote CPUs and then wait for the per-cpu work to complete.  The wait
time is unpredictable and may reach tens of milliseconds.
That behavior is unreasonable, because this process is bound to a specific
CPU and the file is only accessed by that process; IOW, there should be no
pagecache pages on a per-cpu pagevec of a remote CPU.  The unreasonable
behavior is partially caused by an incorrect comparison between the number
of invalidated pages and the size of the target range.  For example,
        if (count < (end_index - start_index + 1))
The count above is how many pages were invalidated on the local CPU, and
(end_index - start_index + 1) is how many pages were supposed to be
invalidated.  Using (end_index - start_index + 1) is incorrect, because
start_index and end_index are only page offsets into the file, which may
not be backed by pages.  Besides that, there may be holes between start
and end.  So we'd better check whether there are still pages on a per-cpu
pagevec after draining the local CPU, and then decide whether or not to
call lru_add_drain_all().
After I applied this as a hotfix to our production environment, most of
the lru_add_drain_all() calls were avoided.
Suggested-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20200923133318.14373-1-laoar.shao@gmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
		
	
			
		
			
				
	
	
		
			220 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			220 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0
 | |
| /*
 | |
|  * mm/fadvise.c
 | |
|  *
 | |
|  * Copyright (C) 2002, Linus Torvalds
 | |
|  *
 | |
|  * 11Jan2003	Andrew Morton
 | |
|  *		Initial version.
 | |
|  */
 | |
| 
 | |
| #include <linux/kernel.h>
 | |
| #include <linux/file.h>
 | |
| #include <linux/fs.h>
 | |
| #include <linux/mm.h>
 | |
| #include <linux/pagemap.h>
 | |
| #include <linux/backing-dev.h>
 | |
| #include <linux/pagevec.h>
 | |
| #include <linux/fadvise.h>
 | |
| #include <linux/writeback.h>
 | |
| #include <linux/syscalls.h>
 | |
| #include <linux/swap.h>
 | |
| 
 | |
| #include <asm/unistd.h>
 | |
| 
 | |
| #include "internal.h"
 | |
| 
 | |
| /*
 | |
|  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
 | |
|  * deactivate the pages and clear PG_Referenced.
 | |
|  */
 | |
| 
 | |
| int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 | |
| {
 | |
| 	struct inode *inode;
 | |
| 	struct address_space *mapping;
 | |
| 	struct backing_dev_info *bdi;
 | |
| 	loff_t endbyte;			/* inclusive */
 | |
| 	pgoff_t start_index;
 | |
| 	pgoff_t end_index;
 | |
| 	unsigned long nrpages;
 | |
| 
 | |
| 	inode = file_inode(file);
 | |
| 	if (S_ISFIFO(inode->i_mode))
 | |
| 		return -ESPIPE;
 | |
| 
 | |
| 	mapping = file->f_mapping;
 | |
| 	if (!mapping || len < 0)
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	bdi = inode_to_bdi(mapping->host);
 | |
| 
 | |
| 	if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) {
 | |
| 		switch (advice) {
 | |
| 		case POSIX_FADV_NORMAL:
 | |
| 		case POSIX_FADV_RANDOM:
 | |
| 		case POSIX_FADV_SEQUENTIAL:
 | |
| 		case POSIX_FADV_WILLNEED:
 | |
| 		case POSIX_FADV_NOREUSE:
 | |
| 		case POSIX_FADV_DONTNEED:
 | |
| 			/* no bad return value, but ignore advice */
 | |
| 			break;
 | |
| 		default:
 | |
| 			return -EINVAL;
 | |
| 		}
 | |
| 		return 0;
 | |
| 	}
 | |
| 
 | |
| 	/*
 | |
| 	 * Careful about overflows. Len == 0 means "as much as possible".  Use
 | |
| 	 * unsigned math because signed overflows are undefined and UBSan
 | |
| 	 * complains.
 | |
| 	 */
 | |
| 	endbyte = (u64)offset + (u64)len;
 | |
| 	if (!len || endbyte < len)
 | |
| 		endbyte = -1;
 | |
| 	else
 | |
| 		endbyte--;		/* inclusive */
 | |
| 
 | |
| 	switch (advice) {
 | |
| 	case POSIX_FADV_NORMAL:
 | |
| 		file->f_ra.ra_pages = bdi->ra_pages;
 | |
| 		spin_lock(&file->f_lock);
 | |
| 		file->f_mode &= ~FMODE_RANDOM;
 | |
| 		spin_unlock(&file->f_lock);
 | |
| 		break;
 | |
| 	case POSIX_FADV_RANDOM:
 | |
| 		spin_lock(&file->f_lock);
 | |
| 		file->f_mode |= FMODE_RANDOM;
 | |
| 		spin_unlock(&file->f_lock);
 | |
| 		break;
 | |
| 	case POSIX_FADV_SEQUENTIAL:
 | |
| 		file->f_ra.ra_pages = bdi->ra_pages * 2;
 | |
| 		spin_lock(&file->f_lock);
 | |
| 		file->f_mode &= ~FMODE_RANDOM;
 | |
| 		spin_unlock(&file->f_lock);
 | |
| 		break;
 | |
| 	case POSIX_FADV_WILLNEED:
 | |
| 		/* First and last PARTIAL page! */
 | |
| 		start_index = offset >> PAGE_SHIFT;
 | |
| 		end_index = endbyte >> PAGE_SHIFT;
 | |
| 
 | |
| 		/* Careful about overflow on the "+1" */
 | |
| 		nrpages = end_index - start_index + 1;
 | |
| 		if (!nrpages)
 | |
| 			nrpages = ~0UL;
 | |
| 
 | |
| 		force_page_cache_readahead(mapping, file, start_index, nrpages);
 | |
| 		break;
 | |
| 	case POSIX_FADV_NOREUSE:
 | |
| 		break;
 | |
| 	case POSIX_FADV_DONTNEED:
 | |
| 		if (!inode_write_congested(mapping->host))
 | |
| 			__filemap_fdatawrite_range(mapping, offset, endbyte,
 | |
| 						   WB_SYNC_NONE);
 | |
| 
 | |
| 		/*
 | |
| 		 * First and last FULL page! Partial pages are deliberately
 | |
| 		 * preserved on the expectation that it is better to preserve
 | |
| 		 * needed memory than to discard unneeded memory.
 | |
| 		 */
 | |
| 		start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
 | |
| 		end_index = (endbyte >> PAGE_SHIFT);
 | |
| 		/*
 | |
| 		 * The page at end_index will be inclusively discarded according
 | |
| 		 * by invalidate_mapping_pages(), so subtracting 1 from
 | |
| 		 * end_index means we will skip the last page.  But if endbyte
 | |
| 		 * is page aligned or is at the end of file, we should not skip
 | |
| 		 * that page - discarding the last page is safe enough.
 | |
| 		 */
 | |
| 		if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK &&
 | |
| 				endbyte != inode->i_size - 1) {
 | |
| 			/* First page is tricky as 0 - 1 = -1, but pgoff_t
 | |
| 			 * is unsigned, so the end_index >= start_index
 | |
| 			 * check below would be true and we'll discard the whole
 | |
| 			 * file cache which is not what was asked.
 | |
| 			 */
 | |
| 			if (end_index == 0)
 | |
| 				break;
 | |
| 
 | |
| 			end_index--;
 | |
| 		}
 | |
| 
 | |
| 		if (end_index >= start_index) {
 | |
| 			unsigned long nr_pagevec = 0;
 | |
| 
 | |
| 			/*
 | |
| 			 * It's common to FADV_DONTNEED right after
 | |
| 			 * the read or write that instantiates the
 | |
| 			 * pages, in which case there will be some
 | |
| 			 * sitting on the local LRU cache. Try to
 | |
| 			 * avoid the expensive remote drain and the
 | |
| 			 * second cache tree walk below by flushing
 | |
| 			 * them out right away.
 | |
| 			 */
 | |
| 			lru_add_drain();
 | |
| 
 | |
| 			invalidate_mapping_pagevec(mapping,
 | |
| 						start_index, end_index,
 | |
| 						&nr_pagevec);
 | |
| 
 | |
| 			/*
 | |
| 			 * If fewer pages were invalidated than expected then
 | |
| 			 * it is possible that some of the pages were on
 | |
| 			 * a per-cpu pagevec for a remote CPU. Drain all
 | |
| 			 * pagevecs and try again.
 | |
| 			 */
 | |
| 			if (nr_pagevec) {
 | |
| 				lru_add_drain_all();
 | |
| 				invalidate_mapping_pages(mapping, start_index,
 | |
| 						end_index);
 | |
| 			}
 | |
| 		}
 | |
| 		break;
 | |
| 	default:
 | |
| 		return -EINVAL;
 | |
| 	}
 | |
| 	return 0;
 | |
| }
 | |
| EXPORT_SYMBOL(generic_fadvise);
 | |
| 
 | |
| int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 | |
| {
 | |
| 	if (file->f_op->fadvise)
 | |
| 		return file->f_op->fadvise(file, offset, len, advice);
 | |
| 
 | |
| 	return generic_fadvise(file, offset, len, advice);
 | |
| }
 | |
| EXPORT_SYMBOL(vfs_fadvise);
 | |
| 
 | |
| #ifdef CONFIG_ADVISE_SYSCALLS
 | |
| 
 | |
| int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 | |
| {
 | |
| 	struct fd f = fdget(fd);
 | |
| 	int ret;
 | |
| 
 | |
| 	if (!f.file)
 | |
| 		return -EBADF;
 | |
| 
 | |
| 	ret = vfs_fadvise(f.file, offset, len, advice);
 | |
| 
 | |
| 	fdput(f);
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 | |
| {
 | |
| 	return ksys_fadvise64_64(fd, offset, len, advice);
 | |
| }
 | |
| 
 | |
| #ifdef __ARCH_WANT_SYS_FADVISE64
 | |
| 
 | |
| SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
 | |
| {
 | |
| 	return ksys_fadvise64_64(fd, offset, len, advice);
 | |
| }
 | |
| 
 | |
| #endif
 | |
| #endif
 |