forked from Minki/linux
67d46b296a
Rob van der Heij reported the following (paraphrased) in private mail. The scenario is that I want to keep backups from filling up the page cache and purging stuff that is more likely to be used again (this is with s390x Linux on z/VM, so I don't give it so much memory that we don't care anymore). So I have something with LD_PRELOAD that intercepts the close() call (from tar, in this case) and issues a posix_fadvise() just before closing the file. This mostly works, except for small files (less than 14 pages) that remain in page cache after the fact. Unfortunately Rob has not had a chance to test this exact patch, but the test program below should reproduce the problem he described. The issue is the per-cpu pagevecs for LRU additions. If the pages are added by one CPU but fadvise() is called on another, then the pages remain resident because invalidate_mapping_pages() only drains the local pagevecs via its call to pagevec_release(). The user-visible effect is that a program that uses fadvise() properly is not obeyed. A possible fix for this is to put the necessary smarts into invalidate_mapping_pages() to globally drain the LRU pagevecs if a pagevec page could not be discarded. The downside of this is that an inode cache shrink would send a global IPI, and memory pressure potentially causing global IPI storms is very undesirable. Instead, this patch adds a check during fadvise(POSIX_FADV_DONTNEED) to see whether invalidate_mapping_pages() discarded all the requested pages. If only a subset of the pages was discarded, it drains the LRU pagevecs and tries again. If the second attempt fails, it assumes this is due to the pages being mapped, locked or dirty and does not care. With this patch, an application using fadvise() correctly will be obeyed, but there is a downside: a malicious application can force the kernel to send global IPIs and increase overhead. If accepted, I would like this to be considered as a -stable candidate.
It's not an urgent issue, but a system call that does not work as advertised is weak.

The following test program demonstrates the problem. It should never report that pages are still resident, but will without this patch. It assumes that CPU 0 and 1 exist.

int main() {
	int fd;
	int pagesize = getpagesize();
	ssize_t written = 0, expected;
	char *buf;
	unsigned char *vec;
	int resident, i;
	cpu_set_t set;

	/* Prepare a buffer for writing */
	expected = FILESIZE_PAGES * pagesize;
	buf = malloc(expected + 1);
	if (buf == NULL) {
		printf("ENOMEM\n");
		exit(EXIT_FAILURE);
	}
	buf[expected] = 0;
	memset(buf, 'a', expected);

	/* Prepare the mincore vec */
	vec = malloc(FILESIZE_PAGES);
	if (vec == NULL) {
		printf("ENOMEM\n");
		exit(EXIT_FAILURE);
	}

	/* Bind ourselves to CPU 0 */
	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		exit(EXIT_FAILURE);
	}

	/* open file, unlink and write buffer */
	fd = open("fadvise-test-file", O_CREAT|O_EXCL|O_RDWR);
	if (fd == -1) {
		perror("open");
		exit(EXIT_FAILURE);
	}
	unlink("fadvise-test-file");
	while (written < expected) {
		ssize_t this_write;
		this_write = write(fd, buf + written, expected - written);
		if (this_write == -1) {
			perror("write");
			exit(EXIT_FAILURE);
		}
		written += this_write;
	}
	free(buf);

	/*
	 * Force ourselves to another CPU. If fadvise only flushes the local
	 * CPU's pagevecs then the fadvise will fail to discard all file pages
	 */
	CPU_ZERO(&set);
	CPU_SET(1, &set);
	if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		exit(EXIT_FAILURE);
	}

	/* sync and fadvise to discard the page cache */
	fsync(fd);
	if (posix_fadvise(fd, 0, expected, POSIX_FADV_DONTNEED) == -1) {
		perror("posix_fadvise");
		exit(EXIT_FAILURE);
	}

	/* map the file and use mincore to see which parts of it are resident */
	buf = mmap(NULL, expected, PROT_READ, MAP_SHARED, fd, 0);
	if (buf == NULL) {
		perror("mmap");
		exit(EXIT_FAILURE);
	}
	if (mincore(buf, expected, vec) == -1) {
		perror("mincore");
		exit(EXIT_FAILURE);
	}

	/* Check residency */
	for (i = 0, resident = 0; i < FILESIZE_PAGES; i++) {
		if (vec[i])
			resident++;
	}
	if (resident != 0) {
		printf("Nr unexpected pages resident: %d\n", resident);
		exit(EXIT_FAILURE);
	}

	munmap(buf, expected);
	close(fd);
	free(vec);
	exit(EXIT_SUCCESS);
}

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reported-by: Rob van der Heij <rvdheij@gmail.com>
Tested-by: Rob van der Heij <rvdheij@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
171 lines
4.0 KiB
C
171 lines
4.0 KiB
C
/*
|
|
* mm/fadvise.c
|
|
*
|
|
* Copyright (C) 2002, Linus Torvalds
|
|
*
|
|
* 11Jan2003 Andrew Morton
|
|
* Initial version.
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/file.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/backing-dev.h>
|
|
#include <linux/pagevec.h>
|
|
#include <linux/fadvise.h>
|
|
#include <linux/writeback.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/swap.h>
|
|
|
|
#include <asm/unistd.h>
|
|
|
|
/*
|
|
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
|
|
* deactivate the pages and clear PG_Referenced.
|
|
*/
|
|
SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
|
|
{
|
|
struct fd f = fdget(fd);
|
|
struct address_space *mapping;
|
|
struct backing_dev_info *bdi;
|
|
loff_t endbyte; /* inclusive */
|
|
pgoff_t start_index;
|
|
pgoff_t end_index;
|
|
unsigned long nrpages;
|
|
int ret = 0;
|
|
|
|
if (!f.file)
|
|
return -EBADF;
|
|
|
|
if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {
|
|
ret = -ESPIPE;
|
|
goto out;
|
|
}
|
|
|
|
mapping = f.file->f_mapping;
|
|
if (!mapping || len < 0) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
if (mapping->a_ops->get_xip_mem) {
|
|
switch (advice) {
|
|
case POSIX_FADV_NORMAL:
|
|
case POSIX_FADV_RANDOM:
|
|
case POSIX_FADV_SEQUENTIAL:
|
|
case POSIX_FADV_WILLNEED:
|
|
case POSIX_FADV_NOREUSE:
|
|
case POSIX_FADV_DONTNEED:
|
|
/* no bad return value, but ignore advice */
|
|
break;
|
|
default:
|
|
ret = -EINVAL;
|
|
}
|
|
goto out;
|
|
}
|
|
|
|
/* Careful about overflows. Len == 0 means "as much as possible" */
|
|
endbyte = offset + len;
|
|
if (!len || endbyte < len)
|
|
endbyte = -1;
|
|
else
|
|
endbyte--; /* inclusive */
|
|
|
|
bdi = mapping->backing_dev_info;
|
|
|
|
switch (advice) {
|
|
case POSIX_FADV_NORMAL:
|
|
f.file->f_ra.ra_pages = bdi->ra_pages;
|
|
spin_lock(&f.file->f_lock);
|
|
f.file->f_mode &= ~FMODE_RANDOM;
|
|
spin_unlock(&f.file->f_lock);
|
|
break;
|
|
case POSIX_FADV_RANDOM:
|
|
spin_lock(&f.file->f_lock);
|
|
f.file->f_mode |= FMODE_RANDOM;
|
|
spin_unlock(&f.file->f_lock);
|
|
break;
|
|
case POSIX_FADV_SEQUENTIAL:
|
|
f.file->f_ra.ra_pages = bdi->ra_pages * 2;
|
|
spin_lock(&f.file->f_lock);
|
|
f.file->f_mode &= ~FMODE_RANDOM;
|
|
spin_unlock(&f.file->f_lock);
|
|
break;
|
|
case POSIX_FADV_WILLNEED:
|
|
/* First and last PARTIAL page! */
|
|
start_index = offset >> PAGE_CACHE_SHIFT;
|
|
end_index = endbyte >> PAGE_CACHE_SHIFT;
|
|
|
|
/* Careful about overflow on the "+1" */
|
|
nrpages = end_index - start_index + 1;
|
|
if (!nrpages)
|
|
nrpages = ~0UL;
|
|
|
|
/*
|
|
* Ignore return value because fadvise() shall return
|
|
* success even if filesystem can't retrieve a hint,
|
|
*/
|
|
force_page_cache_readahead(mapping, f.file, start_index,
|
|
nrpages);
|
|
break;
|
|
case POSIX_FADV_NOREUSE:
|
|
break;
|
|
case POSIX_FADV_DONTNEED:
|
|
if (!bdi_write_congested(mapping->backing_dev_info))
|
|
__filemap_fdatawrite_range(mapping, offset, endbyte,
|
|
WB_SYNC_NONE);
|
|
|
|
/* First and last FULL page! */
|
|
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
|
|
end_index = (endbyte >> PAGE_CACHE_SHIFT);
|
|
|
|
if (end_index >= start_index) {
|
|
unsigned long count = invalidate_mapping_pages(mapping,
|
|
start_index, end_index);
|
|
|
|
/*
|
|
* If fewer pages were invalidated than expected then
|
|
* it is possible that some of the pages were on
|
|
* a per-cpu pagevec for a remote CPU. Drain all
|
|
* pagevecs and try again.
|
|
*/
|
|
if (count < (end_index - start_index + 1)) {
|
|
lru_add_drain_all();
|
|
invalidate_mapping_pages(mapping, start_index,
|
|
end_index);
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
ret = -EINVAL;
|
|
}
|
|
out:
|
|
fdput(f);
|
|
return ret;
|
|
}
|
|
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Wrapper that receives fd and advice as long, narrows both to int and
 * forwards to SYSC_fadvise64_64(). sys_fadvise64_64 is aliased to it.
 */
asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
{
	int fd_arg = (int) fd;
	int advice_arg = (int) advice;

	return SYSC_fadvise64_64(fd_arg, offset, len, advice_arg);
}
SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
#endif
|
|
|
|
#ifdef __ARCH_WANT_SYS_FADVISE64
|
|
|
|
/*
 * fadvise64 - variant of fadvise64_64 whose length is a size_t.
 *
 * Converts @len to loff_t and delegates to sys_fadvise64_64(), returning
 * whatever that call returns.
 */
SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
{
	loff_t len64 = len;

	return sys_fadvise64_64(fd, offset, len64, advice);
}
|
|
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Wrapper that receives fd, len and advice as long, casts them to
 * int, size_t and int respectively and forwards to SYSC_fadvise64().
 * sys_fadvise64 is aliased to it.
 */
asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
{
	int fd_arg = (int) fd;
	size_t len_arg = (size_t)len;
	int advice_arg = (int)advice;

	return SYSC_fadvise64(fd_arg, offset, len_arg, advice_arg);
}
SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
#endif
|
|
|
|
#endif
|