linux/fs
Davide Libenzi 5071f97ec6 epoll: fix epoll's own poll
Fix a bug inside the epoll's f_op->poll() code, that returns POLLIN even
though there are no actual ready monitored fds.  The bug shows up if you
add an epoll fd inside another fd container (poll, select, epoll).

The problem is that callback-based wake ups used by epoll does not carry
(patches will follow, to fix this) any information about the events that
actually happened.  So the callback code, since it can't call the file*
->poll() inside the callback, chains the file* into a ready-list.

So, suppose you added an fd with EPOLLOUT only, and some data shows up on
the fd, the file* mapped by the fd will be added into the ready-list (via
wakeup callback).  During normal epoll_wait() use, this condition is
sorted out at the time we're actually able to call the file*'s
f_op->poll().

Inside the old epoll's f_op->poll() though, only a quick check
!list_empty(ready-list) was performed, and this could have led to
reporting POLLIN even though no ready fds would show up at a following
epoll_wait().  In order to correctly report the ready status for an epoll
fd, the ready-list must be checked to see if any really available fd+event
would be ready in a following epoll_wait().

Operation (calling f_op->poll() from inside f_op->poll()) that, like wake
ups, must be handled with care because of the fact that epoll fds can be
added to other epoll fds.

Test code:

/*
 *  epoll_test by Davide Libenzi (Simple code to test epoll internals)
 *  Copyright (C) 2008  Davide Libenzi
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <limits.h>
#include <poll.h>
#include <sys/epoll.h>
#include <sys/wait.h>

#define EPWAIT_TIMEO	(1 * 1000)
#ifndef POLLRDHUP
#define POLLRDHUP 0x2000
#endif

#define EPOLL_MAX_CHAIN	100L

#define EPOLL_TF_LOOP (1 << 0)

struct epoll_test_cfg {
	long size;
	long flags;
};

static int xepoll_create(int n) {
	int epfd;

	if ((epfd = epoll_create(n)) == -1) {
		perror("epoll_create");
		exit(2);
	}

	return epfd;
}

static void xepoll_ctl(int epfd, int cmd, int fd, struct epoll_event *evt) {
	if (epoll_ctl(epfd, cmd, fd, evt) < 0) {
		perror("epoll_ctl");
		exit(3);
	}
}

static void xpipe(int *fds) {
	if (pipe(fds)) {
		perror("pipe");
		exit(4);
	}
}

static pid_t xfork(void) {
	pid_t pid;

	if ((pid = fork()) == (pid_t) -1) {
		perror("pipe");
		exit(5);
	}

	return pid;
}

static int run_forked_proc(int (*proc)(void *), void *data) {
	int status;
	pid_t pid;

	if ((pid = xfork()) == 0)
		exit((*proc)(data));
	if (waitpid(pid, &status, 0) != pid) {
		perror("waitpid");
		return -1;
	}

	return WIFEXITED(status) ? WEXITSTATUS(status): -2;
}

static int check_events(int fd, int timeo) {
	struct pollfd pfd;

	fprintf(stdout, "Checking events for fd %d\n", fd);
	memset(&pfd, 0, sizeof(pfd));
	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT;
	if (poll(&pfd, 1, timeo) < 0) {
		perror("poll()");
		return 0;
	}
	if (pfd.revents & POLLIN)
		fprintf(stdout, "\tPOLLIN\n");
	if (pfd.revents & POLLOUT)
		fprintf(stdout, "\tPOLLOUT\n");
	if (pfd.revents & POLLERR)
		fprintf(stdout, "\tPOLLERR\n");
	if (pfd.revents & POLLHUP)
		fprintf(stdout, "\tPOLLHUP\n");
	if (pfd.revents & POLLRDHUP)
		fprintf(stdout, "\tPOLLRDHUP\n");

	return pfd.revents;
}

static int epoll_test_tty(void *data) {
	int epfd, ifd = fileno(stdin), res;
	struct epoll_event evt;

	if (check_events(ifd, 0) != POLLOUT) {
		fprintf(stderr, "Something is cooking on STDIN (%d)\n", ifd);
		return 1;
	}
	epfd = xepoll_create(1);
	fprintf(stdout, "Created epoll fd (%d)\n", epfd);
	memset(&evt, 0, sizeof(evt));
	evt.events = EPOLLIN;
	xepoll_ctl(epfd, EPOLL_CTL_ADD, ifd, &evt);
	if (check_events(epfd, 0) & POLLIN) {
		res = epoll_wait(epfd, &evt, 1, 0);
		if (res == 0) {
			fprintf(stderr, "Epoll fd (%d) is ready when it shouldn't!\n",
				epfd);
			return 2;
		}
	}

	return 0;
}

static int epoll_wakeup_chain(void *data) {
	struct epoll_test_cfg *tcfg = data;
	int i, res, epfd, bfd, nfd, pfds[2];
	pid_t pid;
	struct epoll_event evt;

	memset(&evt, 0, sizeof(evt));
	evt.events = EPOLLIN;

	epfd = bfd = xepoll_create(1);

	for (i = 0; i < tcfg->size; i++) {
		nfd = xepoll_create(1);
		xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt);
		bfd = nfd;
	}
	xpipe(pfds);
	if (tcfg->flags & EPOLL_TF_LOOP)
	{
		xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt);
		/*
		 * If we're testing for loop, we want that the wakeup
		 * triggered by the write to the pipe done in the child
		 * process, triggers a fake event. So we add the pipe
		 * read size with EPOLLOUT events. This will trigger
		 * an addition to the ready-list, but no real events
		 * will be there. The the epoll kernel code will proceed
		 * in calling f_op->poll() of the epfd, triggering the
		 * loop we want to test.
		 */
		evt.events = EPOLLOUT;
	}
	xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt);

	/*
	 * The pipe write must come after the poll(2) call inside
	 * check_events(). This tests the nested wakeup code in
	 * fs/eventpoll.c:ep_poll_safewake()
	 * By having the check_events() (hence poll(2)) happens first,
	 * we have poll wait queue filled up, and the write(2) in the
	 * child will trigger the wakeup chain.
	 */
	if ((pid = xfork()) == 0) {
		sleep(1);
		write(pfds[1], "w", 1);
		exit(0);
	}

	res = check_events(epfd, 2000) & POLLIN;

	if (waitpid(pid, NULL, 0) != pid) {
		perror("waitpid");
		return -1;
	}

	return res;
}

static int epoll_poll_chain(void *data) {
	struct epoll_test_cfg *tcfg = data;
	int i, res, epfd, bfd, nfd, pfds[2];
	pid_t pid;
	struct epoll_event evt;

	memset(&evt, 0, sizeof(evt));
	evt.events = EPOLLIN;

	epfd = bfd = xepoll_create(1);

	for (i = 0; i < tcfg->size; i++) {
		nfd = xepoll_create(1);
		xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt);
		bfd = nfd;
	}
	xpipe(pfds);
	if (tcfg->flags & EPOLL_TF_LOOP)
	{
		xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt);
		/*
		 * If we're testing for loop, we want that the wakeup
		 * triggered by the write to the pipe done in the child
		 * process, triggers a fake event. So we add the pipe
		 * read size with EPOLLOUT events. This will trigger
		 * an addition to the ready-list, but no real events
		 * will be there. The the epoll kernel code will proceed
		 * in calling f_op->poll() of the epfd, triggering the
		 * loop we want to test.
		 */
		evt.events = EPOLLOUT;
	}
	xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt);

	/*
	 * The pipe write mush come before the poll(2) call inside
	 * check_events(). This tests the nested f_op->poll calls code in
	 * fs/eventpoll.c:ep_eventpoll_poll()
	 * By having the pipe write(2) happen first, we make the kernel
	 * epoll code to load the ready lists, and the following poll(2)
	 * done inside check_events() will test nested poll code in
	 * ep_eventpoll_poll().
	 */
	if ((pid = xfork()) == 0) {
		write(pfds[1], "w", 1);
		exit(0);
	}
	sleep(1);
	res = check_events(epfd, 1000) & POLLIN;

	if (waitpid(pid, NULL, 0) != pid) {
		perror("waitpid");
		return -1;
	}

	return res;
}

int main(int ac, char **av) {
	int error;
	struct epoll_test_cfg tcfg;

	fprintf(stdout, "\n********** Testing TTY events\n");
	error = run_forked_proc(epoll_test_tty, NULL);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = 3;
	tcfg.flags = 0;
	fprintf(stdout, "\n********** Testing short wakeup chain\n");
	error = run_forked_proc(epoll_wakeup_chain, &tcfg);
	fprintf(stdout, error == POLLIN ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = EPOLL_MAX_CHAIN;
	tcfg.flags = 0;
	fprintf(stdout, "\n********** Testing long wakeup chain (HOLD ON)\n");
	error = run_forked_proc(epoll_wakeup_chain, &tcfg);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = 3;
	tcfg.flags = 0;
	fprintf(stdout, "\n********** Testing short poll chain\n");
	error = run_forked_proc(epoll_poll_chain, &tcfg);
	fprintf(stdout, error == POLLIN ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = EPOLL_MAX_CHAIN;
	tcfg.flags = 0;
	fprintf(stdout, "\n********** Testing long poll chain (HOLD ON)\n");
	error = run_forked_proc(epoll_poll_chain, &tcfg);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = 3;
	tcfg.flags = EPOLL_TF_LOOP;
	fprintf(stdout, "\n********** Testing loopy wakeup chain (HOLD ON)\n");
	error = run_forked_proc(epoll_wakeup_chain, &tcfg);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	tcfg.size = 3;
	tcfg.flags = EPOLL_TF_LOOP;
	fprintf(stdout, "\n********** Testing loopy poll chain (HOLD ON)\n");
	error = run_forked_proc(epoll_poll_chain, &tcfg);
	fprintf(stdout, error == 0 ?
		"********** OK\n": "********** FAIL (%d)\n", error);

	return 0;
}

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Pavel Pisa <pisa@cmp.felk.cvut.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-01 08:59:18 -07:00
..
9p vfs: simple_set_mnt() should return void 2009-03-27 14:44:03 -04:00
adfs constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
affs constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
afs proc 2/2: remove struct proc_dir_entry::owner 2009-03-31 01:14:44 +04:00
autofs constify dentry_operations: autofs, autofs4 2009-03-27 14:44:00 -04:00
autofs4 constify dentry_operations: autofs, autofs4 2009-03-27 14:44:00 -04:00
befs fs/Kconfig: move befs out 2009-01-22 13:15:57 +03:00
bfs fs/Kconfig: move bfs out 2009-01-22 13:15:57 +03:00
btrfs fs: fix page_mkwrite error cases in core code and btrfs 2009-04-01 08:59:14 -07:00
cifs proc 2/2: remove struct proc_dir_entry::owner 2009-03-31 01:14:44 +04:00
coda constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
configfs constify dentry_operations: configfs 2009-03-27 14:44:03 -04:00
cramfs fs/Kconfig: move cramfs out 2009-01-22 13:15:58 +03:00
debugfs debugfs: add helpers for exporting a size_t simple value 2009-01-07 10:00:16 -08:00
devpts Merge code for single and multiple-instance mounts 2009-03-27 14:44:04 -04:00
dlm dlm: fix length calculation in compat code 2009-03-11 12:23:59 -05:00
ecryptfs constify dentry_operations: ecryptfs 2009-03-27 14:44:01 -04:00
efs fs/Kconfig: move efs out 2009-01-22 13:15:57 +03:00
exportfs
ext2 ext2: Zero our b_size in ext2_quota_read() 2009-03-26 02:18:38 +01:00
ext3 Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-quota-2.6 2009-03-27 14:48:34 -07:00
ext4 mm: page_mkwrite change prototype to match fault 2009-04-01 08:59:14 -07:00
fat constify dentry_operations: FAT 2009-03-27 14:44:01 -04:00
freevxfs fs/Kconfig: move vxfs out 2009-01-22 13:15:58 +03:00
fuse mm: page_mkwrite change prototype to match fault 2009-04-01 08:59:14 -07:00
gfs2 mm: page_mkwrite change prototype to match fault 2009-04-01 08:59:14 -07:00
hfs constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
hfsplus constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
hostfs constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
hpfs constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
hppfs
hugetlbfs mm: reintroduce and deprecate rlimit based access for SHM_HUGETLB 2009-04-01 08:59:12 -07:00
isofs constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
jbd jbd: fix return value of journal_start_commit() 2009-02-11 14:25:35 -08:00
jbd2 jbd2: Avoid possible NULL dereference in jbd2_journal_begin_ordered_truncate() 2009-02-10 11:15:34 -05:00
jffs2 [JFFS2] fix mount crash caused by removed nodes 2009-02-21 11:09:29 +01:00
jfs proc 2/2: remove struct proc_dir_entry::owner 2009-03-31 01:14:44 +04:00
lockd NLM: Fix GRANT callback address comparison when IPv6 is enabled 2009-03-10 20:33:20 -04:00
minix Update my email address 2009-03-22 11:28:37 -07:00
ncpfs constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
nfs mm: page_mkwrite change prototype to match fault 2009-04-01 08:59:14 -07:00
nfs_common
nfsd Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-quota-2.6 2009-03-27 14:48:34 -07:00
nls
notify fs: avoid I_NEW inodes 2009-03-27 14:44:05 -04:00
ntfs ntfs: remove private wrapper of endian helpers 2009-04-01 08:59:18 -07:00
ocfs2 mm: page_mkwrite change prototype to match fault 2009-04-01 08:59:14 -07:00
omfs fs/Kconfig: move omfs out 2009-01-22 13:15:58 +03:00
openpromfs
partitions Merge branch 'for-linus' of git://git390.marist.edu/pub/scm/linux-2.6 2009-03-26 16:04:22 -07:00
proc proc tty: remove struct tty_operations::read_proc 2009-04-01 08:59:10 -07:00
qnx4 fs/Kconfig: move qnx4 out 2009-01-22 13:15:59 +03:00
quota Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6 2009-03-27 16:23:12 -07:00
ramfs ramfs-nommu: use generic lru cache 2009-04-01 08:59:15 -07:00
reiserfs Merge branch 'proc-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/adobriyan/proc 2009-03-30 16:06:04 -07:00
romfs fs/Kconfig: move romfs out 2009-01-22 13:15:59 +03:00
smbfs constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
squashfs Squashfs: Valid filesystems are flagged as bad by the corrupted fs patch 2009-03-12 03:23:48 +00:00
sysfs mm: page_mkwrite change prototype to match fault: fix sysfs 2009-04-01 08:59:14 -07:00
sysv constify dentry_operations: misc filesystems 2009-03-27 14:44:00 -04:00
ubifs mm: page_mkwrite change prototype to match fault 2009-04-01 08:59:14 -07:00
udf udf: Use lowercase names of quota functions 2009-03-26 02:18:36 +01:00
ufs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6 2009-03-27 16:23:12 -07:00
xfs mm: page_mkwrite change prototype to match fault 2009-04-01 08:59:14 -07:00
aio.c aio: lookup_ioctx can return the wrong value when looking up a bogus context 2009-03-19 15:57:18 -07:00
anon_inodes.c constify dentry_operations: rest 2009-03-27 14:44:03 -04:00
attr.c vfs: Use lowercase names of quota functions 2009-03-26 02:18:35 +01:00
bad_inode.c
binfmt_aout.c
binfmt_elf_fdpic.c FDPIC: Don't attempt to expand the userspace stack to fill the space allocated 2009-01-08 12:04:47 +00:00
binfmt_elf.c elf core dump: fix get_user use 2009-02-06 17:34:07 -08:00
binfmt_em86.c
binfmt_flat.c FLAT: Don't attempt to expand the userspace stack to fill the space allocated 2009-01-08 12:04:47 +00:00
binfmt_misc.c fs/binfmt_misc.c: add terminating newline to /proc/sys/fs/binfmt_misc/status 2009-01-06 15:59:19 -08:00
binfmt_script.c
binfmt_som.c
bio-integrity.c block: add private bio_set for bio integrity allocations 2009-03-24 12:35:17 +01:00
bio.c block: add private bio_set for bio integrity allocations 2009-03-24 12:35:17 +01:00
block_dev.c fs: move bdev code out of buffer.c 2009-03-27 14:44:03 -04:00
buffer.c filesystem freeze: allow SysRq emergency thaw to thaw frozen filesystems 2009-04-01 08:59:17 -07:00
char_dev.c fs: fix name overwrite in __register_chrdev_region() 2009-01-06 15:59:13 -08:00
compat_binfmt_elf.c
compat_ioctl.c Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6 2009-03-01 21:35:16 -08:00
compat.c fix setuid sometimes doesn't 2009-03-28 17:30:00 -07:00
dcache.c cleanup d_add_ci 2009-03-27 14:43:57 -04:00
dcookies.c [CVE-2009-0029] System call wrapper special cases 2009-01-14 14:15:18 +01:00
direct-io.c
drop_caches.c fs: avoid I_NEW inodes 2009-03-27 14:44:05 -04:00
eventfd.c [CVE-2009-0029] System call wrappers part 32 2009-01-14 14:15:31 +01:00
eventpoll.c epoll: fix epoll's own poll 2009-04-01 08:59:18 -07:00
exec.c fix setuid sometimes doesn't 2009-03-28 17:30:00 -07:00
fcntl.c Fix a lockdep warning in fasync_helper() 2009-03-30 08:00:24 -06:00
fifo.c
file_table.c Merge branch 'bkl-removal' of git://git.lwn.net/linux-2.6 2009-03-26 16:14:02 -07:00
file.c
filesystems.c [CVE-2009-0029] System call wrappers part 27 2009-01-14 14:15:29 +01:00
fs-writeback.c fs: new inode i_state corruption fix 2009-03-12 16:20:24 -07:00
generic_acl.c
inode.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6 2009-03-27 16:23:12 -07:00
internal.h fix setuid sometimes doesn't 2009-03-28 17:30:00 -07:00
ioctl.c Rationalize fasync return values 2009-03-16 08:34:35 -06:00
ioprio.c [CVE-2009-0029] System call wrappers part 28 2009-01-14 14:15:30 +01:00
Kconfig quota: Move quota files into separate directory 2009-03-26 02:18:35 +01:00
Kconfig.binfmt CORE_DUMP_DEFAULT_ELF_HEADERS depends on ELF_CORE 2009-01-09 16:54:41 -08:00
libfs.c vfs: simple_set_mnt() should return void 2009-03-27 14:44:03 -04:00
locks.c [CVE-2009-0029] System call wrappers part 16 2009-01-14 14:15:25 +01:00
Makefile quota: Move quota files into separate directory 2009-03-26 02:18:35 +01:00
mbcache.c
mpage.c
namei.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6 2009-03-27 16:23:12 -07:00
namespace.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6 2009-03-27 16:23:12 -07:00
nfsctl.c [CVE-2009-0029] System call wrappers part 27 2009-01-14 14:15:29 +01:00
no-block.c
open.c vfs: Use lowercase names of quota functions 2009-03-26 02:18:35 +01:00
pipe.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6 2009-03-27 16:23:12 -07:00
pnode.c
pnode.h
posix_acl.c
read_write.c [CVE-2009-0029] System call wrappers part 20 2009-01-14 14:15:26 +01:00
read_write.h
readdir.c [CVE-2009-0029] System call wrappers part 32 2009-01-14 14:15:31 +01:00
select.c [CVE-2009-0029] System call wrappers part 32 2009-01-14 14:15:31 +01:00
seq_file.c cpumask: fix seq_bitmap_*() functions. 2009-03-30 22:05:11 +10:30
signalfd.c [CVE-2009-0029] System call wrappers part 31 2009-01-14 14:15:31 +01:00
splice.c [CVE-2009-0029] System call wrappers part 31 2009-01-14 14:15:31 +01:00
stack.c
stat.c [CVE-2009-0029] System call wrappers part 30 2009-01-14 14:15:30 +01:00
super.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6 2009-03-27 16:23:12 -07:00
sync.c Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-quota-2.6 2009-03-27 14:48:34 -07:00
timerfd.c timerfd: add flags check 2009-02-18 15:37:53 -08:00
utimes.c [CVE-2009-0029] System call wrappers part 30 2009-01-14 14:15:30 +01:00
xattr_acl.c
xattr.c [CVE-2009-0029] System call wrappers part 13 2009-01-14 14:15:23 +01:00