fuse: implement NFS-like readdirplus support

This patch implements readdirplus support in FUSE, similar to NFS.
The payload returned in the readdirplus call contains
'fuse_entry_out' structure thereby providing all the necessary inputs
for 'faking' a lookup() operation on the spot.

If the dentry and inode already existed (for e.g. in a re-run of ls -l)
then just the inode attributes timeout and dentry timeout are refreshed.

With a simple client->network->server implementation of a FUSE based
filesystem, the following performance observations were made:

Test: Performing a filesystem crawl over 20,000 files with

sh# time ls -lR /mnt

Without readdirplus:
Run 1: 18.1s
Run 2: 16.0s
Run 3: 16.2s

With readdirplus:
Run 1: 4.1s
Run 2: 3.8s
Run 3: 3.8s

The performance improvement is significant as it avoided 20,000 upcalls
calls (lookup). Cache consistency is no worse than what already is.

Signed-off-by: Anand V. Avati <avati@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
This commit is contained in:
Anand V. Avati 2012-08-19 08:53:23 -04:00 committed by Miklos Szeredi
parent ff7532ca2c
commit 0b05b18381
5 changed files with 197 additions and 5 deletions

View File

@ -491,6 +491,25 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
fuse_request_send_nowait_locked(fc, req);
}
void fuse_force_forget(struct file *file, u64 nodeid)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
struct fuse_forget_in inarg;
memset(&inarg, 0, sizeof(inarg));
inarg.nlookup = 1;
req = fuse_get_req_nofail(fc, file);
req->in.h.opcode = FUSE_FORGET;
req->in.h.nodeid = nodeid;
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
req->isreply = 0;
fuse_request_send_nowait(fc, req);
}
/*
* Lock the request. Up to the next unlock_request() there mustn't be
* anything that could cause a page-fault. If the request was already

View File

@ -1155,6 +1155,143 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
return 0;
}
static int fuse_direntplus_link(struct file *file,
struct fuse_direntplus *direntplus,
u64 attr_version)
{
int err;
struct fuse_entry_out *o = &direntplus->entry_out;
struct fuse_dirent *dirent = &direntplus->dirent;
struct dentry *parent = file->f_path.dentry;
struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
struct dentry *dentry;
struct dentry *alias;
struct inode *dir = parent->d_inode;
struct fuse_conn *fc;
struct inode *inode;
if (!o->nodeid) {
/*
* Unlike in the case of fuse_lookup, zero nodeid does not mean
* ENOENT. Instead, it only means the userspace filesystem did
* not want to return attributes/handle for this entry.
*
* So do nothing.
*/
return 0;
}
if (name.name[0] == '.') {
/*
* We could potentially refresh the attributes of the directory
* and its parent?
*/
if (name.len == 1)
return 0;
if (name.name[1] == '.' && name.len == 2)
return 0;
}
fc = get_fuse_conn(dir);
name.hash = full_name_hash(name.name, name.len);
dentry = d_lookup(parent, &name);
if (dentry && dentry->d_inode) {
inode = dentry->d_inode;
if (get_node_id(inode) == o->nodeid) {
struct fuse_inode *fi;
fi = get_fuse_inode(inode);
spin_lock(&fc->lock);
fi->nlookup++;
spin_unlock(&fc->lock);
/*
* The other branch to 'found' comes via fuse_iget()
* which bumps nlookup inside
*/
goto found;
}
err = d_invalidate(dentry);
if (err)
goto out;
dput(dentry);
dentry = NULL;
}
dentry = d_alloc(parent, &name);
err = -ENOMEM;
if (!dentry)
goto out;
inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
&o->attr, entry_attr_timeout(o), attr_version);
if (!inode)
goto out;
alias = d_materialise_unique(dentry, inode);
err = PTR_ERR(alias);
if (IS_ERR(alias))
goto out;
if (alias) {
dput(dentry);
dentry = alias;
}
found:
fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
attr_version);
fuse_change_entry_timeout(dentry, o);
err = 0;
out:
if (dentry)
dput(dentry);
return err;
}
static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
void *dstbuf, filldir_t filldir, u64 attr_version)
{
struct fuse_direntplus *direntplus;
struct fuse_dirent *dirent;
size_t reclen;
int over = 0;
int ret;
while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
direntplus = (struct fuse_direntplus *) buf;
dirent = &direntplus->dirent;
reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
return -EIO;
if (reclen > nbytes)
break;
if (!over) {
/* We fill entries into dstbuf only as much as
it can hold. But we still continue iterating
over remaining entries to link them. If not,
we need to send a FORGET for each of those
which we did not link.
*/
over = filldir(dstbuf, dirent->name, dirent->namelen,
file->f_pos, dirent->ino,
dirent->type);
file->f_pos = dirent->off;
}
buf += reclen;
nbytes -= reclen;
ret = fuse_direntplus_link(file, direntplus, attr_version);
if (ret)
fuse_force_forget(file, direntplus->entry_out.nodeid);
}
return 0;
}
static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
{
int err;
@ -1163,6 +1300,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
struct inode *inode = file->f_path.dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
u64 attr_version = 0;
if (is_bad_inode(inode))
return -EIO;
@ -1179,14 +1317,28 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
if (fc->do_readdirplus) {
attr_version = fuse_get_attr_version(fc);
fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
FUSE_READDIRPLUS);
} else {
fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
FUSE_READDIR);
}
fuse_request_send(fc, req);
nbytes = req->out.args[0].size;
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err)
err = parse_dirfile(page_address(page), nbytes, file, dstbuf,
filldir);
if (!err) {
if (fc->do_readdirplus) {
err = parse_dirplusfile(page_address(page), nbytes,
file, dstbuf, filldir,
attr_version);
} else {
err = parse_dirfile(page_address(page), nbytes, file,
dstbuf, filldir);
}
}
__free_page(page);
fuse_invalidate_attr(inode); /* atime changed */

View File

@ -487,6 +487,9 @@ struct fuse_conn {
/** Use enhanced/automatic page cache invalidation. */
unsigned auto_inval_data:1;
/** Does the filesystem support readdir-plus? */
unsigned do_readdirplus:1;
/** The number of requests waiting for completion */
atomic_t num_waiting;
@ -578,6 +581,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
struct fuse_forget_link *fuse_alloc_forget(void);
/* Used by READDIRPLUS */
void fuse_force_forget(struct file *file, u64 nodeid);
/**
* Initialize READ or READDIR request
*/

View File

@ -863,6 +863,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->dont_mask = 1;
if (arg->flags & FUSE_AUTO_INVAL_DATA)
fc->auto_inval_data = 1;
if (arg->flags & FUSE_DO_READDIRPLUS)
fc->do_readdirplus = 1;
} else {
ra_pages = fc->max_read / PAGE_CACHE_SIZE;
fc->no_lock = 1;
@ -889,7 +891,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA;
FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
FUSE_DO_READDIRPLUS;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);

View File

@ -193,6 +193,7 @@ struct fuse_file_lock {
#define FUSE_FLOCK_LOCKS (1 << 10)
#define FUSE_HAS_IOCTL_DIR (1 << 11)
#define FUSE_AUTO_INVAL_DATA (1 << 12)
#define FUSE_DO_READDIRPLUS (1 << 13)
/**
* CUSE INIT request/reply flags
@ -299,6 +300,7 @@ enum fuse_opcode {
FUSE_NOTIFY_REPLY = 41,
FUSE_BATCH_FORGET = 42,
FUSE_FALLOCATE = 43,
FUSE_READDIRPLUS = 44,
/* CUSE specific operations */
CUSE_INIT = 4096,
@ -630,6 +632,16 @@ struct fuse_dirent {
#define FUSE_DIRENT_SIZE(d) \
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
struct fuse_direntplus {
struct fuse_entry_out entry_out;
struct fuse_dirent dirent;
};
#define FUSE_NAME_OFFSET_DIRENTPLUS \
offsetof(struct fuse_direntplus, dirent.name)
#define FUSE_DIRENTPLUS_SIZE(d) \
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen)
struct fuse_notify_inval_inode_out {
__u64 ino;
__s64 off;