pnfsblock: write_pagelist handle zero invalid extents

For invalid extents, find other pages in the same fsblock and write them out.

[pnfsblock: write_begin]
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Benny Halevy <bhalevy@tonian.com>
Signed-off-by: Peng Tao <peng_tao@emc.com>
Signed-off-by: Jim Rees <rees@umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
This commit is contained in:
Peng Tao 2011-07-30 20:52:56 -04:00 committed by Trond Myklebust
parent 31e6306a40
commit 71cdd40fd4

View File

@ -35,6 +35,7 @@
#include <linux/mount.h> #include <linux/mount.h>
#include <linux/namei.h> #include <linux/namei.h>
#include <linux/bio.h> /* struct bio */ #include <linux/bio.h> /* struct bio */
#include <linux/buffer_head.h> /* various write calls */
#include "blocklayout.h" #include "blocklayout.h"
@ -79,12 +80,8 @@ static int is_hole(struct pnfs_block_extent *be, sector_t isect)
*/ */
static int is_writable(struct pnfs_block_extent *be, sector_t isect) static int is_writable(struct pnfs_block_extent *be, sector_t isect)
{ {
if (be->be_state == PNFS_BLOCK_READWRITE_DATA) return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
return 1; be->be_state == PNFS_BLOCK_INVALID_DATA);
else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
return 0;
else
return bl_is_sector_init(be->be_inval, isect);
} }
/* The data we are handed might be spread across several bios. We need /* The data we are handed might be spread across several bios. We need
@ -353,6 +350,31 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
} }
} }
static void bl_end_io_write_zero(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
do {
struct page *page = bvec->bv_page;
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
/* This is the zeroing page we added */
end_page_writeback(page);
page_cache_release(page);
} while (bvec >= bio->bi_io_vec);
if (!uptodate) {
if (!wdata->pnfs_error)
wdata->pnfs_error = -EIO;
bl_set_lo_fail(wdata->lseg);
}
bio_put(bio);
put_parallel(par);
}
/* This is basically copied from mpage_end_io_read */ /* This is basically copied from mpage_end_io_read */
static void bl_end_io_write(struct bio *bio, int err) static void bl_end_io_write(struct bio *bio, int err)
{ {
@ -379,11 +401,8 @@ static void bl_write_cleanup(struct work_struct *work)
dprintk("%s enter\n", __func__); dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work); task = container_of(work, struct rpc_task, u.tk_work);
wdata = container_of(task, struct nfs_write_data, task); wdata = container_of(task, struct nfs_write_data, task);
if (!wdata->task.tk_status) { if (!wdata->pnfs_error) {
/* Marks for LAYOUTCOMMIT */ /* Marks for LAYOUTCOMMIT */
/* BUG - this should be called after each bio, not after
* all finish, unless have some way of storing success/failure
*/
mark_extents_written(BLK_LSEG2EXT(wdata->lseg), mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
wdata->args.offset, wdata->args.count); wdata->args.offset, wdata->args.count);
} }
@ -391,38 +410,110 @@ static void bl_write_cleanup(struct work_struct *work)
} }
/* Called when last of bios associated with a bl_write_pagelist call finishes */ /* Called when last of bios associated with a bl_write_pagelist call finishes */
static void static void bl_end_par_io_write(void *data)
bl_end_par_io_write(void *data)
{ {
struct nfs_write_data *wdata = data; struct nfs_write_data *wdata = data;
/* STUB - ignoring error handling */
wdata->task.tk_status = 0; wdata->task.tk_status = 0;
wdata->verf.committed = NFS_FILE_SYNC; wdata->verf.committed = NFS_FILE_SYNC;
INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
schedule_work(&wdata->task.u.tk_work); schedule_work(&wdata->task.u.tk_work);
} }
/* FIXME STUB - mark intersection of layout and page as bad, so is not
* used again.
*/
static void mark_bad_read(void)
{
return;
}
/*
* map_block: map a requested I/0 block (isect) into an offset in the LVM
* block_device
*/
static void
map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
{
dprintk("%s enter be=%p\n", __func__, be);
set_buffer_mapped(bh);
bh->b_bdev = be->be_mdev;
bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
(be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
__func__, (unsigned long long)isect, (long)bh->b_blocknr,
bh->b_size);
return;
}
/* Given an unmapped page, zero it or read in page for COW, page is locked
* by caller.
*/
static int
init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
{
struct buffer_head *bh = NULL;
int ret = 0;
sector_t isect;
dprintk("%s enter, %p\n", __func__, page);
BUG_ON(PageUptodate(page));
if (!cow_read) {
zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
goto cleanup;
}
bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
if (!bh) {
ret = -ENOMEM;
goto cleanup;
}
isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
map_block(bh, isect, cow_read);
if (!bh_uptodate_or_lock(bh))
ret = bh_submit_read(bh);
if (ret)
goto cleanup;
SetPageUptodate(page);
cleanup:
bl_put_extent(cow_read);
if (bh)
free_buffer_head(bh);
if (ret) {
/* Need to mark layout with bad read...should now
* just use nfs4 for reads and writes.
*/
mark_bad_read();
}
return ret;
}
static enum pnfs_try_status static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata, int sync) bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{ {
int i; int i, ret, npg_zero, pg_index, last = 0;
struct bio *bio = NULL; struct bio *bio = NULL;
struct pnfs_block_extent *be = NULL; struct pnfs_block_extent *be = NULL, *cow_read = NULL;
sector_t isect, extent_length = 0; sector_t isect, last_isect = 0, extent_length = 0;
struct parallel_io *par; struct parallel_io *par;
loff_t offset = wdata->args.offset; loff_t offset = wdata->args.offset;
size_t count = wdata->args.count; size_t count = wdata->args.count;
struct page **pages = wdata->args.pages; struct page **pages = wdata->args.pages;
int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; struct page *page;
pgoff_t index;
u64 temp;
int npg_per_block =
NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
/* At this point, wdata->pages is a (sequential) list of nfs_pages. /* At this point, wdata->pages is a (sequential) list of nfs_pages.
* We want to write each, and if there is an error remove it from * We want to write each, and if there is an error set pnfs_error
* list and call * to have it redone using nfs.
* nfs_retry_request(req) to have it redone using nfs.
* QUEST? Do as block or per req? Think have to do per block
* as part of end_bio
*/ */
par = alloc_parallel(wdata); par = alloc_parallel(wdata);
if (!par) if (!par)
@ -433,7 +524,91 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
/* At this point, have to be more careful with error handling */ /* At this point, have to be more careful with error handling */
isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
for (i = pg_index; i < wdata->npages ; i++) { be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
if (!be || !is_writable(be, isect)) {
dprintk("%s no matching extents!\n", __func__);
wdata->pnfs_error = -EINVAL;
goto out;
}
/* First page inside INVALID extent */
if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
temp = offset >> PAGE_CACHE_SHIFT;
npg_zero = do_div(temp, npg_per_block);
isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
(long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
extent_length = be->be_length - (isect - be->be_f_offset);
fill_invalid_ext:
dprintk("%s need to zero %d pages\n", __func__, npg_zero);
for (;npg_zero > 0; npg_zero--) {
/* page ref released in bl_end_io_write_zero */
index = isect >> PAGE_CACHE_SECTOR_SHIFT;
dprintk("%s zero %dth page: index %lu isect %llu\n",
__func__, npg_zero, index,
(unsigned long long)isect);
page =
find_or_create_page(wdata->inode->i_mapping, index,
GFP_NOFS);
if (!page) {
dprintk("%s oom\n", __func__);
wdata->pnfs_error = -ENOMEM;
goto out;
}
/* PageDirty: Other will write this out
* PageWriteback: Other is writing this out
* PageUptodate: It was read before
* sector_initialized: already written out
*/
if (PageDirty(page) || PageWriteback(page) ||
bl_is_sector_init(be->be_inval, isect)) {
print_page(page);
unlock_page(page);
page_cache_release(page);
goto next_page;
}
if (!PageUptodate(page)) {
/* New page, readin or zero it */
init_page_for_write(page, cow_read);
}
set_page_writeback(page);
unlock_page(page);
ret = bl_mark_sectors_init(be->be_inval, isect,
PAGE_CACHE_SECTORS,
NULL);
if (unlikely(ret)) {
dprintk("%s bl_mark_sectors_init fail %d\n",
__func__, ret);
end_page_writeback(page);
page_cache_release(page);
wdata->pnfs_error = ret;
goto out;
}
bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
isect, page, be,
bl_end_io_write_zero, par);
if (IS_ERR(bio)) {
wdata->pnfs_error = PTR_ERR(bio);
goto out;
}
/* FIXME: This should be done in bi_end_io */
mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
page->index << PAGE_CACHE_SHIFT,
PAGE_CACHE_SIZE);
next_page:
isect += PAGE_CACHE_SECTORS;
extent_length -= PAGE_CACHE_SECTORS;
}
if (last)
goto write_done;
}
bio = bl_submit_bio(WRITE, bio);
/* Middle pages */
pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
for (i = pg_index; i < wdata->npages; i++) {
if (!extent_length) { if (!extent_length) {
/* We've used up the previous extent */ /* We've used up the previous extent */
bl_put_extent(be); bl_put_extent(be);
@ -442,35 +617,51 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
isect, NULL); isect, NULL);
if (!be || !is_writable(be, isect)) { if (!be || !is_writable(be, isect)) {
wdata->pnfs_error = -ENOMEM; wdata->pnfs_error = -EINVAL;
goto out; goto out;
} }
extent_length = be->be_length - extent_length = be->be_length -
(isect - be->be_f_offset); (isect - be->be_f_offset);
} }
for (;;) { if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
if (!bio) { ret = bl_mark_sectors_init(be->be_inval, isect,
bio = bio_alloc(GFP_NOIO, wdata->npages - i); PAGE_CACHE_SECTORS,
if (!bio) { NULL);
wdata->pnfs_error = -ENOMEM; if (unlikely(ret)) {
goto out; dprintk("%s bl_mark_sectors_init fail %d\n",
} __func__, ret);
bio->bi_sector = isect - be->be_f_offset + wdata->pnfs_error = ret;
be->be_v_offset; goto out;
bio->bi_bdev = be->be_mdev;
bio->bi_end_io = bl_end_io_write;
bio->bi_private = par;
} }
if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) }
break; bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
bio = bl_submit_bio(WRITE, bio); isect, pages[i], be,
bl_end_io_write, par);
if (IS_ERR(bio)) {
wdata->pnfs_error = PTR_ERR(bio);
goto out;
} }
isect += PAGE_CACHE_SECTORS; isect += PAGE_CACHE_SECTORS;
last_isect = isect;
extent_length -= PAGE_CACHE_SECTORS; extent_length -= PAGE_CACHE_SECTORS;
} }
wdata->res.count = (isect << SECTOR_SHIFT) - (offset);
if (count < wdata->res.count) /* Last page inside INVALID extent */
if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
bio = bl_submit_bio(WRITE, bio);
temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
npg_zero = npg_per_block - do_div(temp, npg_per_block);
if (npg_zero < npg_per_block) {
last = 1;
goto fill_invalid_ext;
}
}
write_done:
wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
if (count < wdata->res.count) {
wdata->res.count = count; wdata->res.count = count;
}
out: out:
bl_put_extent(be); bl_put_extent(be);
bl_submit_bio(WRITE, bio); bl_submit_bio(WRITE, bio);