mirror of
https://github.com/torvalds/linux.git
synced 2024-12-24 11:51:27 +00:00
797b4cffdf
Make reiserfs to write via generic routines. Original reiserfs write optimized for big writes is deadlock prone Signed-off-by: Vladimir Saveliev <vs@namesys.com> Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
310 lines
10 KiB
C
310 lines
10 KiB
C
/*
|
|
* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
|
|
*/
|
|
|
|
#include <linux/time.h>
|
|
#include <linux/reiserfs_fs.h>
|
|
#include <linux/reiserfs_acl.h>
|
|
#include <linux/reiserfs_xattr.h>
|
|
#include <asm/uaccess.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/writeback.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/buffer_head.h>
|
|
#include <linux/quotaops.h>
|
|
|
|
/*
|
|
** We pack the tails of files on file close, not at the time they are written.
|
|
** This implies an unnecessary copy of the tail and an unnecessary indirect item
|
|
** insertion/balancing, for files that are written in one write.
|
|
** It avoids unnecessary tail packings (balances) for files that are written in
|
|
** multiple writes and are small enough to have tails.
|
|
**
|
|
** file_release is called by the VFS layer when the file is closed. If
|
|
** this is the last open file descriptor, and the file is
|
|
** small enough to have a tail, and the tail is currently in an
|
|
** unformatted node, the tail is converted back into a direct item.
|
|
**
|
|
** We use reiserfs_truncate_file to pack the tail, since it already has
|
|
** all the conditions coded.
|
|
*/
|
|
static int reiserfs_file_release(struct inode *inode, struct file *filp)
|
|
{
|
|
|
|
struct reiserfs_transaction_handle th;
|
|
int err;
|
|
int jbegin_failure = 0;
|
|
|
|
BUG_ON(!S_ISREG(inode->i_mode));
|
|
|
|
/* fast out for when nothing needs to be done */
|
|
if ((atomic_read(&inode->i_count) > 1 ||
|
|
!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
|
|
!tail_has_to_be_packed(inode)) &&
|
|
REISERFS_I(inode)->i_prealloc_count <= 0) {
|
|
return 0;
|
|
}
|
|
|
|
mutex_lock(&inode->i_mutex);
|
|
|
|
mutex_lock(&(REISERFS_I(inode)->i_mmap));
|
|
if (REISERFS_I(inode)->i_flags & i_ever_mapped)
|
|
REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
|
|
|
|
reiserfs_write_lock(inode->i_sb);
|
|
/* freeing preallocation only involves relogging blocks that
|
|
* are already in the current transaction. preallocation gets
|
|
* freed at the end of each transaction, so it is impossible for
|
|
* us to log any additional blocks (including quota blocks)
|
|
*/
|
|
err = journal_begin(&th, inode->i_sb, 1);
|
|
if (err) {
|
|
/* uh oh, we can't allow the inode to go away while there
|
|
* is still preallocation blocks pending. Try to join the
|
|
* aborted transaction
|
|
*/
|
|
jbegin_failure = err;
|
|
err = journal_join_abort(&th, inode->i_sb, 1);
|
|
|
|
if (err) {
|
|
/* hmpf, our choices here aren't good. We can pin the inode
|
|
* which will disallow unmount from every happening, we can
|
|
* do nothing, which will corrupt random memory on unmount,
|
|
* or we can forcibly remove the file from the preallocation
|
|
* list, which will leak blocks on disk. Lets pin the inode
|
|
* and let the admin know what is going on.
|
|
*/
|
|
igrab(inode);
|
|
reiserfs_warning(inode->i_sb,
|
|
"pinning inode %lu because the "
|
|
"preallocation can't be freed",
|
|
inode->i_ino);
|
|
goto out;
|
|
}
|
|
}
|
|
reiserfs_update_inode_transaction(inode);
|
|
|
|
#ifdef REISERFS_PREALLOCATE
|
|
reiserfs_discard_prealloc(&th, inode);
|
|
#endif
|
|
err = journal_end(&th, inode->i_sb, 1);
|
|
|
|
/* copy back the error code from journal_begin */
|
|
if (!err)
|
|
err = jbegin_failure;
|
|
|
|
if (!err && atomic_read(&inode->i_count) <= 1 &&
|
|
(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
|
|
tail_has_to_be_packed(inode)) {
|
|
/* if regular file is released by last holder and it has been
|
|
appended (we append by unformatted node only) or its direct
|
|
item(s) had to be converted, then it may have to be
|
|
indirect2direct converted */
|
|
err = reiserfs_truncate_file(inode, 0);
|
|
}
|
|
out:
|
|
mutex_unlock(&(REISERFS_I(inode)->i_mmap));
|
|
mutex_unlock(&inode->i_mutex);
|
|
reiserfs_write_unlock(inode->i_sb);
|
|
return err;
|
|
}
|
|
|
|
static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma)
|
|
{
|
|
struct inode *inode;
|
|
|
|
inode = file->f_path.dentry->d_inode;
|
|
mutex_lock(&(REISERFS_I(inode)->i_mmap));
|
|
REISERFS_I(inode)->i_flags |= i_ever_mapped;
|
|
mutex_unlock(&(REISERFS_I(inode)->i_mmap));
|
|
|
|
return generic_file_mmap(file, vma);
|
|
}
|
|
|
|
/*
 * Adapter for the inode_operations ->truncate slot, which returns void:
 * forwards to reiserfs_truncate_file() with its second argument set to 1
 * and discards the result.
 */
static void reiserfs_vfs_truncate_file(struct inode *inode)
{
	reiserfs_truncate_file(inode, 1);
}
|
|
|
|
/* Sync a reiserfs file. */
|
|
|
|
/*
|
|
* FIXME: sync_mapping_buffers() never has anything to sync. Can
|
|
* be removed...
|
|
*/
|
|
|
|
static int reiserfs_sync_file(struct file *p_s_filp,
|
|
struct dentry *p_s_dentry, int datasync)
|
|
{
|
|
struct inode *p_s_inode = p_s_dentry->d_inode;
|
|
int n_err;
|
|
int barrier_done;
|
|
|
|
BUG_ON(!S_ISREG(p_s_inode->i_mode));
|
|
n_err = sync_mapping_buffers(p_s_inode->i_mapping);
|
|
reiserfs_write_lock(p_s_inode->i_sb);
|
|
barrier_done = reiserfs_commit_for_inode(p_s_inode);
|
|
reiserfs_write_unlock(p_s_inode->i_sb);
|
|
if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb))
|
|
blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
|
|
if (barrier_done < 0)
|
|
return barrier_done;
|
|
return (n_err < 0) ? -EIO : 0;
|
|
}
|
|
|
|
/* taken from fs/buffer.c:__block_commit_write */
|
|
int reiserfs_commit_page(struct inode *inode, struct page *page,
|
|
unsigned from, unsigned to)
|
|
{
|
|
unsigned block_start, block_end;
|
|
int partial = 0;
|
|
unsigned blocksize;
|
|
struct buffer_head *bh, *head;
|
|
unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
|
|
int new;
|
|
int logit = reiserfs_file_data_log(inode);
|
|
struct super_block *s = inode->i_sb;
|
|
int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
|
|
struct reiserfs_transaction_handle th;
|
|
int ret = 0;
|
|
|
|
th.t_trans_id = 0;
|
|
blocksize = 1 << inode->i_blkbits;
|
|
|
|
if (logit) {
|
|
reiserfs_write_lock(s);
|
|
ret = journal_begin(&th, s, bh_per_page + 1);
|
|
if (ret)
|
|
goto drop_write_lock;
|
|
reiserfs_update_inode_transaction(inode);
|
|
}
|
|
for (bh = head = page_buffers(page), block_start = 0;
|
|
bh != head || !block_start;
|
|
block_start = block_end, bh = bh->b_this_page) {
|
|
|
|
new = buffer_new(bh);
|
|
clear_buffer_new(bh);
|
|
block_end = block_start + blocksize;
|
|
if (block_end <= from || block_start >= to) {
|
|
if (!buffer_uptodate(bh))
|
|
partial = 1;
|
|
} else {
|
|
set_buffer_uptodate(bh);
|
|
if (logit) {
|
|
reiserfs_prepare_for_journal(s, bh, 1);
|
|
journal_mark_dirty(&th, s, bh);
|
|
} else if (!buffer_dirty(bh)) {
|
|
mark_buffer_dirty(bh);
|
|
/* do data=ordered on any page past the end
|
|
* of file and any buffer marked BH_New.
|
|
*/
|
|
if (reiserfs_data_ordered(inode->i_sb) &&
|
|
(new || page->index >= i_size_index)) {
|
|
reiserfs_add_ordered_list(inode, bh);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (logit) {
|
|
ret = journal_end(&th, s, bh_per_page + 1);
|
|
drop_write_lock:
|
|
reiserfs_write_unlock(s);
|
|
}
|
|
/*
|
|
* If this is a partial write which happened to make all buffers
|
|
* uptodate then we can optimize away a bogus readpage() for
|
|
* the next read(). Here we 'discover' whether the page went
|
|
* uptodate as a result of this (potentially partial) write.
|
|
*/
|
|
if (!partial)
|
|
SetPageUptodate(page);
|
|
return ret;
|
|
}
|
|
|
|
/* Write @count bytes at position @ppos in a file indicated by @file
|
|
from the buffer @buf.
|
|
|
|
generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
|
|
something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was
|
|
written for (ext2/3). This is for several reasons:
|
|
|
|
* It has no understanding of any filesystem specific optimizations.
|
|
|
|
* It enters the filesystem repeatedly for each page that is written.
|
|
|
|
* It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
|
|
* operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
|
|
* to reiserfs which allows for fewer tree traversals.
|
|
|
|
* Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
|
|
|
|
* Asking the block allocation code for blocks one at a time is slightly less efficient.
|
|
|
|
All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
|
|
use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. This new code should make
|
|
things right finally.
|
|
|
|
Future Features: providing search_by_key with hints.
|
|
|
|
*/
|
|
static ssize_t reiserfs_file_write(struct file *file, /* the file we are going to write into */
|
|
const char __user * buf, /* pointer to user supplied data
|
|
(in userspace) */
|
|
size_t count, /* amount of bytes to write */
|
|
loff_t * ppos /* pointer to position in file that we start writing at. Should be updated to
|
|
* new current position before returning. */
|
|
)
|
|
{
|
|
struct inode *inode = file->f_path.dentry->d_inode; // Inode of the file that we are writing to.
|
|
/* To simplify coding at this time, we store
|
|
locked pages in array for now */
|
|
struct reiserfs_transaction_handle th;
|
|
th.t_trans_id = 0;
|
|
|
|
/* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
|
|
* lying around (most of the disk, in fact). Despite the filesystem
|
|
* now being a v3.6 format, the old items still can't support large
|
|
* file sizes. Catch this case here, as the rest of the VFS layer is
|
|
* oblivious to the different limitations between old and new items.
|
|
* reiserfs_setattr catches this for truncates. This chunk is lifted
|
|
* from generic_write_checks. */
|
|
if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
|
|
*ppos + count > MAX_NON_LFS) {
|
|
if (*ppos >= MAX_NON_LFS) {
|
|
return -EFBIG;
|
|
}
|
|
if (count > MAX_NON_LFS - (unsigned long)*ppos)
|
|
count = MAX_NON_LFS - (unsigned long)*ppos;
|
|
}
|
|
|
|
return do_sync_write(file, buf, count, ppos);
|
|
}
|
|
|
|
const struct file_operations reiserfs_file_operations = {
|
|
.read = do_sync_read,
|
|
.write = reiserfs_file_write,
|
|
.ioctl = reiserfs_ioctl,
|
|
#ifdef CONFIG_COMPAT
|
|
.compat_ioctl = reiserfs_compat_ioctl,
|
|
#endif
|
|
.mmap = reiserfs_file_mmap,
|
|
.open = generic_file_open,
|
|
.release = reiserfs_file_release,
|
|
.fsync = reiserfs_sync_file,
|
|
.aio_read = generic_file_aio_read,
|
|
.aio_write = generic_file_aio_write,
|
|
.splice_read = generic_file_splice_read,
|
|
.splice_write = generic_file_splice_write,
|
|
};
|
|
|
|
const struct inode_operations reiserfs_file_inode_operations = {
|
|
.truncate = reiserfs_vfs_truncate_file,
|
|
.setattr = reiserfs_setattr,
|
|
.setxattr = reiserfs_setxattr,
|
|
.getxattr = reiserfs_getxattr,
|
|
.listxattr = reiserfs_listxattr,
|
|
.removexattr = reiserfs_removexattr,
|
|
.permission = reiserfs_permission,
|
|
};
|