ceph: Implement writev/pwritev for sync operation.

For sync writev/pwritev operations, ceph only handles the first iov.

I divided the sync-write operation into two functions: one for
direct writes, the other for non-direct sync writes. This is because
for non-direct sync writes we can merge the iovs into one, but for
direct writes we can't merge the iovs.

Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
Reviewed-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
This commit is contained in:
majianpeng 2013-09-12 13:54:26 +08:00 committed by Sage Weil
parent 9f12bd119e
commit e8344e6689

View File

@ -489,83 +489,79 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
} }
} }
/* /*
* Synchronous write, straight from __user pointer or user pages (if * Synchronous write, straight from __user pointer or user pages.
* O_DIRECT).
* *
* If write spans object boundary, just do multiple writes. (For a * If write spans object boundary, just do multiple writes. (For a
* correct atomic write, we should e.g. take write locks on all * correct atomic write, we should e.g. take write locks on all
* objects, rollback on failure, etc.) * objects, rollback on failure, etc.)
*/ */
static ssize_t ceph_sync_write(struct file *file, const char __user *data, static ssize_t
size_t left, loff_t pos, loff_t *ppos) ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, size_t count)
{ {
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc; struct ceph_snap_context *snapc;
struct ceph_vino vino; struct ceph_vino vino;
struct ceph_osd_request *req; struct ceph_osd_request *req;
int num_ops = 1;
struct page **pages; struct page **pages;
int num_pages; int num_pages;
u64 len;
int written = 0; int written = 0;
int flags; int flags;
int check_caps = 0; int check_caps = 0;
int page_align, io_align; int page_align;
unsigned long buf_align;
int ret; int ret;
struct timespec mtime = CURRENT_TIME; struct timespec mtime = CURRENT_TIME;
bool own_pages = false; loff_t pos = iocb->ki_pos;
struct iov_iter i;
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS; return -EROFS;
dout("sync_write on file %p %lld~%u %s\n", file, pos, dout("sync_direct_write on file %p %lld~%u\n", file, pos,
(unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); (unsigned)count);
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
if (ret < 0) if (ret < 0)
return ret; return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping, ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_CACHE_SHIFT, pos >> PAGE_CACHE_SHIFT,
(pos + left) >> PAGE_CACHE_SHIFT); (pos + count) >> PAGE_CACHE_SHIFT);
if (ret < 0) if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret); dout("invalidate_inode_pages2_range returned %d\n", ret);
flags = CEPH_OSD_FLAG_ORDERSNAP | flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE; CEPH_OSD_FLAG_WRITE;
if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
flags |= CEPH_OSD_FLAG_ACK;
else
num_ops++; /* Also include a 'startsync' command. */
/* iov_iter_init(&i, iov, nr_segs, count, 0);
* we may need to do multiple writes here if we span an object
* boundary. this isn't atomic, unfortunately. :(
*/
more:
io_align = pos & ~PAGE_MASK;
buf_align = (unsigned long)data & ~PAGE_MASK;
len = left;
snapc = ci->i_snap_realm->cached_context; while (iov_iter_count(&i) > 0) {
vino = ceph_vino(inode); void __user *data = i.iov->iov_base + i.iov_offset;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, u64 len = i.iov->iov_len - i.iov_offset;
vino, pos, &len, num_ops,
CEPH_OSD_OP_WRITE, flags, snapc,
ci->i_truncate_seq, ci->i_truncate_size,
false);
if (IS_ERR(req))
return PTR_ERR(req);
/* write from beginning of first page, regardless of io alignment */ page_align = (unsigned long)data & ~PAGE_MASK;
page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
num_pages = calc_pages_for(page_align, len); snapc = ci->i_snap_realm->cached_context;
if (file->f_flags & O_DIRECT) { vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len,
2,/*include a 'startsync' command*/
CEPH_OSD_OP_WRITE, flags, snapc,
ci->i_truncate_seq,
ci->i_truncate_size,
false);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
goto out;
}
num_pages = calc_pages_for(page_align, len);
pages = ceph_get_direct_page_vector(data, num_pages, false); pages = ceph_get_direct_page_vector(data, num_pages, false);
if (IS_ERR(pages)) { if (IS_ERR(pages)) {
ret = PTR_ERR(pages); ret = PTR_ERR(pages);
@ -577,60 +573,175 @@ more:
* may block. * may block.
*/ */
truncate_inode_pages_range(inode->i_mapping, pos, truncate_inode_pages_range(inode->i_mapping, pos,
(pos+len) | (PAGE_CACHE_SIZE-1)); (pos+len) | (PAGE_CACHE_SIZE-1));
} else { osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
false, false);
/* BUG_ON(vino.snap != CEPH_NOSNAP); */
ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret)
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_put_page_vector(pages, num_pages, false);
out:
ceph_osdc_put_request(req);
if (ret == 0) {
pos += len;
written += len;
iov_iter_advance(&i, (size_t)len);
if (pos > i_size_read(inode)) {
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
ceph_check_caps(ceph_inode(inode),
CHECK_CAPS_AUTHONLY,
NULL);
}
} else
break;
}
if (ret != -EOLDSNAPC && written > 0) {
iocb->ki_pos = pos;
ret = written;
}
return ret;
}
/*
* Synchronous write, straight from __user pointer or user pages.
*
* If write spans object boundary, just do multiple writes. (For a
* correct atomic write, we should e.g. take write locks on all
* objects, rollback on failure, etc.)
*/
static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, size_t count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
u64 len;
int num_pages;
int written = 0;
int flags;
int check_caps = 0;
int ret;
struct timespec mtime = CURRENT_TIME;
loff_t pos = iocb->ki_pos;
struct iov_iter i;
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count);
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
if (ret < 0)
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_CACHE_SHIFT,
(pos + count) >> PAGE_CACHE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE |
CEPH_OSD_FLAG_ACK;
iov_iter_init(&i, iov, nr_segs, count, 0);
while ((len = iov_iter_count(&i)) > 0) {
size_t left;
int n;
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 1,
CEPH_OSD_OP_WRITE, flags, snapc,
ci->i_truncate_seq,
ci->i_truncate_size,
false);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
goto out;
}
/*
* write from beginning of first page,
* regardless of io alignment
*/
num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
if (IS_ERR(pages)) { if (IS_ERR(pages)) {
ret = PTR_ERR(pages); ret = PTR_ERR(pages);
goto out; goto out;
} }
ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
left = len;
for (n = 0; n < num_pages; n++) {
size_t plen = min(left, PAGE_SIZE);
ret = iov_iter_copy_from_user(pages[n], &i, 0, plen);
if (ret != plen) {
ret = -EFAULT;
break;
}
left -= ret;
iov_iter_advance(&i, ret);
}
if (ret < 0) { if (ret < 0) {
ceph_release_page_vector(pages, num_pages); ceph_release_page_vector(pages, num_pages);
goto out; goto out;
} }
if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */
/* get a second commit callback */ req->r_unsafe_callback = ceph_sync_write_unsafe;
req->r_unsafe_callback = ceph_sync_write_unsafe; req->r_inode = inode;
req->r_inode = inode;
own_pages = true;
}
}
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
false, own_pages);
/* BUG_ON(vino.snap != CEPH_NOSNAP); */ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); false, true);
ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); /* BUG_ON(vino.snap != CEPH_NOSNAP); */
if (!ret) ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
if (file->f_flags & O_DIRECT) ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
ceph_put_page_vector(pages, num_pages, false); if (!ret)
else if (file->f_flags & O_SYNC) ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_release_page_vector(pages, num_pages);
out: out:
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
if (ret == 0) { if (ret == 0) {
pos += len; pos += len;
written += len; written += len;
left -= len;
data += len;
if (left)
goto more;
if (pos > i_size_read(inode)) {
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
ceph_check_caps(ceph_inode(inode),
CHECK_CAPS_AUTHONLY,
NULL);
}
} else
break;
}
if (ret != -EOLDSNAPC && written > 0) {
ret = written; ret = written;
*ppos = pos; iocb->ki_pos = pos;
if (pos > i_size_read(inode))
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
NULL);
} else if (ret != -EOLDSNAPC && written > 0) {
ret = written;
} }
return ret; return ret;
} }
@ -772,11 +883,13 @@ retry_snap:
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_filp->f_flags & O_DIRECT) || (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
(fi->flags & CEPH_F_SYNC)) {
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
written = ceph_sync_write(file, iov->iov_base, count, if (file->f_flags & O_DIRECT)
pos, &iocb->ki_pos); written = ceph_sync_direct_write(iocb, iov,
nr_segs, count);
else
written = ceph_sync_write(iocb, iov, nr_segs, count);
if (written == -EOLDSNAPC) { if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u" dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n", "got EOLDSNAPC, retrying\n",