diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 7b3003cb6f1b..952aeb048349 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -212,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options) static int adfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return parse_options(sb, data); } diff --git a/fs/affs/super.c b/fs/affs/super.c index d098731b82ff..307453086c3f 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -530,6 +530,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); + sync_filesystem(sb); *flags |= MS_NODIRATIME; memcpy(volume, sbi->s_volume, 32); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 5188f1222987..d626756ff721 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -913,6 +913,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) static int befs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (!(*flags & MS_RDONLY)) return -EINVAL; return 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d4878ddba87a..9dbf42395153 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1380,6 +1380,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; + sync_filesystem(sb); btrfs_remount_prepare(fs_info); ret = btrfs_parse_options(root, data); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ab8ad2546c3e..2c70cbe35d39 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -541,6 +541,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return 0; } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 626abc02b694..d9c7751f10ac 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -96,6 +96,7 @@ void coda_destroy_inodecache(void) static int coda_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NOATIME; return 0; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index a1f801c14fbc..ddcfe590b8a8 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -243,6 +243,7 @@ static void cramfs_kill_sb(struct super_block *sb) static int cramfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index ca4a08f38374..8c41b52da358 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data) int err; struct debugfs_fs_info *fsi = sb->s_fs_info; + sync_filesystem(sb); err = debugfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index a726b9f29cb7..c71038079b47 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -313,6 +313,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; + sync_filesystem(sb); err = parse_mount_options(data, PARSE_REMOUNT, opts); /* diff --git a/fs/efs/super.c b/fs/efs/super.c index f8def1acf08c..3befcc9f5d63 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -114,6 +114,7 @@ static void destroy_inodecache(void) static int efs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 20d6697bd638..d260115c0350 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) unsigned long old_sb_flags; int err; + sync_filesystem(sb); spin_lock(&sbi->s_lock); /* Store the old options */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 37fd31ed16e7..95c6c5a6d0c5 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) int i; #endif + sync_filesystem(sb); + /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d3a534fdc5ff..f1c65dc7cc0a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include #endif @@ -567,6 +568,8 @@ enum { #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* Do not put hole in extent cache */ #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 /* * The bit position of these flags must not overlap with any of the @@ -998,6 +1001,8 @@ struct ext4_inode_info { #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group size of blocksize * 8 blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1326,6 +1331,7 @@ struct ext4_sb_info { struct list_head s_es_lru; unsigned long s_es_last_sorted; struct percpu_counter s_extent_cache_cnt; + struct mb_cache *s_mb_cache; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ @@ -2133,8 +2139,6 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); @@ -2757,6 +2761,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -2766,6 +2771,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode, extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent); /* page-io.c */ extern int __init ext4_init_pageio(void); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 3fe29de832c8..c3fb607413ed 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + if (inode == NULL) { + pr_err("EXT4: jbd2_journal_dirty_metadata " + "failed: handle type %u started at " + "line %u, credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + return err; + } ext4_error_inode(inode, where, line, bh->b_blocknr, "journal_dirty_metadata failed: " diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74bc2d549c58..82df3ce9874a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include "ext4_jbd2.h" @@ -1691,7 +1690,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * the extent that was written properly split out and conversion to * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); @@ -1708,6 +1707,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; + if (ext4_ext_is_uninitialized(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN))) + return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; @@ -1731,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0; + int merge_done = 0, uninit; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1741,8 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ + uninit = ext4_ext_is_uninitialized(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); + if (uninit) + ext4_ext_mark_uninitialized(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1896,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - int mb_flags = 0; + int mb_flags = 0, uninit; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1946,9 +1953,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, path + depth); if (err) return err; - + uninit = ext4_ext_is_uninitialized(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (uninit) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1971,10 +1980,13 @@ prepend: if (err) return err; + uninit = ext4_ext_is_uninitialized(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (uninit) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -2585,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + /* + * If we're starting with an extent other than the last one in the + * node, we need to see if it shares a cluster with the extent to + * the right (towards the end of the file). If its leftmost cluster + * is this extent's rightmost cluster and it is not cluster aligned, + * we'll mark it as a partial that is not to be deallocated. + */ + + if (ex != EXT_LAST_EXTENT(eh)) { + ext4_fsblk_t current_pblk, right_pblk; + long long current_cluster, right_cluster; + + current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + current_cluster = (long long)EXT4_B2C(sbi, current_pblk); + right_pblk = ext4_ext_pblock(ex + 1); + right_cluster = (long long)EXT4_B2C(sbi, right_pblk); + if (current_cluster == right_cluster && + EXT4_PBLK_COFF(sbi, right_pblk)) + *partial_cluster = -right_cluster; + } + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && @@ -2710,10 +2743,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, err = ext4_ext_correct_indexes(handle, inode, path); /* - * Free the partial cluster only if the current extent does not - * reference it. Otherwise we might free used cluster. + * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If there's a partial cluster and no extents + * remain in the leaf, it can't be freed here. It can only be + * freed when it's possible to determine if it's not shared with + * any other extent - when the next leaf is processed or when space + * removal is complete. */ - if (*partial_cluster > 0 && + if (*partial_cluster > 0 && eh->eh_entries && (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != *partial_cluster)) { int flags = get_default_free_blocks_flags(inode); @@ -3569,6 +3607,8 @@ out: * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * + * This works the same way in the case of initialized -> unwritten conversion. + * * One of more index blocks maybe needed if the extent tree grow after * the uninitialized extent split. To prevent ENOSPC occur at the IO * complete, we need to split the uninitialized extent before DIO submit @@ -3579,7 +3619,7 @@ out: * * Returns the size of uninitialized extent to be written on success. */ -static int ext4_split_unwritten_extents(handle_t *handle, +static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, @@ -3591,9 +3631,9 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int ee_len; int split_flag = 0, depth; - ext_debug("ext4_split_unwritten_extents: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", + __func__, inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; @@ -3608,14 +3648,73 @@ static int ext4_split_unwritten_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; - if (flags & EXT4_GET_BLOCKS_CONVERT) - split_flag |= EXT4_EXT_DATA_VALID2; + /* Convert to unwritten */ + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + split_flag |= EXT4_EXT_DATA_VALID1; + /* Convert to initialized */ + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + split_flag |= ee_block + ee_len <= eof_block ? + EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2); + } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, path, map, split_flag, flags); } +static int ext4_convert_initialized_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + goto out; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as uninitialized */ + ext4_ext_mark_uninitialized(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + + static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3649,8 +3748,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_unwritten_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT); + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT); if (err < 0) goto out; ext4_ext_drop_refs(path); @@ -3850,6 +3949,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, return allocated_clusters; } +static int +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support with + * uninitialized extent + */ + if (map->m_len > EXT_UNINIT_MAX_LEN) + map->m_len = EXT_UNINIT_MAX_LEN / 2; + + ret = ext4_convert_initialized_extents(handle, inode, map, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + + return err ? err : allocated; +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3877,8 +4008,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, inode, map, - path, flags); + ret = ext4_split_convert_extents(handle, inode, map, + path, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; /* @@ -3993,10 +4124,6 @@ out1: map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } return err ? err : allocated; } @@ -4128,7 +4255,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth; + int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4170,6 +4297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; + /* * Uninitialized extents are treated as holes, except that * we split out initialized portions during a write. @@ -4186,13 +4314,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if (!ext4_ext_is_uninitialized(ex)) + /* + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. + */ + if ((!ext4_ext_is_uninitialized(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + allocated = ext4_ext_convert_initialized_extent( + handle, inode, map, path, flags, + allocated, newblock); + goto out2; + } else if (!ext4_ext_is_uninitialized(ex)) goto out; - allocated = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); - goto out3; + if (ret < 0) + err = ret; + else + allocated = ret; + goto out2; } } @@ -4473,7 +4615,6 @@ out2: kfree(path); } -out3: trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); ext4_es_lru_add(inode); @@ -4514,34 +4655,200 @@ retry: ext4_std_error(inode->i_sb, err); } -static void ext4_falloc_update_inode(struct inode *inode, - int mode, loff_t new_size, int update_ctime) +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, int flags, int mode) { - struct timespec now; + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits; - if (update_ctime) { - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; - } + map.m_lblk = offset; /* - * Update only when preallocation was requested beyond - * the file size. + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (len <= EXT_UNINIT_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, len); + +retry: + while (ret >= 0 && ret < len) { + map.m_lblk = map.m_lblk + ret; + map.m_len = len = len - ret; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + + return ret > 0 ? ret2 : ret; +} + +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + unsigned int max_blocks; + loff_t new_size = 0; + int ret = 0; + int flags; + int partial; + loff_t start, end; + ext4_lblk_t lblk; + struct address_space *mapping = inode->i_mapping; + unsigned int blkbits = inode->i_blkbits; + + trace_ext4_zero_range(inode, offset, len, mode); + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + len - 1); + if (ret) + return ret; + } + + /* + * Round up offset. This is not fallocate, we neet to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the + * range. + */ + start = round_up(offset, 1 << blkbits); + end = round_down((offset + len), 1 << blkbits); + + if (start < offset || end > offset + len) + return -EINVAL; + partial = (offset + len) & ((1 << blkbits) - 1); + + lblk = start >> blkbits; + max_blocks = (end >> blkbits); + if (max_blocks < lblk) + max_blocks = 0; + else + max_blocks -= lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + mutex_lock(&inode->i_mutex); + + /* + * Indirect files do not support unwritten extnets + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out_mutex; + /* + * If we have a partial block after EOF we have to allocate + * the entire block. + */ + if (partial) + max_blocks += 1; + } + + if (max_blocks > 0) { + + /* Now release the pages and zero block aligned part of pages*/ + truncate_pagecache_range(inode, start, end - 1); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Remove entire range from the extent status tree. + */ + ret = ext4_es_remove_extent(inode, lblk, max_blocks); + if (ret) + goto out_dio; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, + mode); + if (ret) + goto out_dio; + } + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + goto out_dio; + } + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { if (new_size > i_size_read(inode)) i_size_write(inode, new_size); if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); } else { /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if (new_size > i_size_read(inode)) + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* @@ -4555,22 +4862,25 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); handle_t *handle; - loff_t new_size; + loff_t new_size = 0; unsigned int max_blocks; int ret = 0; - int ret2 = 0; - int retries = 0; int flags; - struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + ext4_lblk_t lblk; + struct timespec tv; + unsigned int blkbits = inode->i_blkbits; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(inode, offset, len); + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + ret = ext4_convert_inline_data(inode); if (ret) return ret; @@ -4582,83 +4892,66 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_ZERO_RANGE) + return ext4_zero_range(file, offset, len, mode); + trace_ext4_fallocate_enter(inode, offset, len, mode); - map.m_lblk = offset >> blkbits; + lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk; - /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) { - mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); - return ret; - } + - lblk; + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - /* - * Don't normalize the request if it can fit in one extent so - * that it doesn't get unnecessarily split into multiple - * extents. - */ - if (len <= EXT_UNINIT_MAX_LEN << blkbits) - flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -retry: - while (ret >= 0 && ret < max_blocks) { - map.m_lblk = map.m_lblk + ret; - map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; - } - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret <= 0) { -#ifdef EXT4FS_DEBUG - ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - inode->i_ino, map.m_lblk, - map.m_len, ret); -#endif - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, - blkbits) >> blkbits)) - new_size = offset + len; - else - new_size = ((loff_t) map.m_lblk + ret) << blkbits; + mutex_lock(&inode->i_mutex); - ext4_falloc_update_inode(inode, mode, new_size, - (map.m_flags & EXT4_MAP_NEW)); - ext4_mark_inode_dirty(handle, inode); - if ((file->f_flags & O_SYNC) && ret >= max_blocks) - ext4_handle_sync(handle); - ret2 = ext4_journal_stop(handle); - if (ret2) - break; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; - goto retry; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); + if (ret) + goto out; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + goto out; + + tv = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { + if (new_size > i_size_read(inode)) { + i_size_write(inode, new_size); + inode->i_mtime = tv; + } + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out: mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, - ret > 0 ? ret2 : ret); - return ret > 0 ? ret2 : ret; + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; } /* @@ -4869,3 +5162,304 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ext4_es_lru_add(inode); return error; } + +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; + + if (!ext4_handle_valid(handle)) + return 0; + + /* + * Check if need to extend journal credits + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups + */ + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } + + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + ext4_lblk_t *start) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = 0; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EIO; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + if (!ex_last) + return -EIO; + + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = 1; + + *start = ex_last->ee_block + + ext4_ext_get_actual_len(ex_last); + + while (ex_start <= ex_last) { + ex_start->ee_block -= shift; + if (ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) { + if (ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + } + ex_start++; + } + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; + } + + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + path[depth].p_idx->ei_block -= shift; + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; + + depth--; + } + +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop_block, current_block; + ext4_lblk_t ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + ext4_ext_drop_refs(path); + kfree(path); + return ret; + } + + stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + /* Nothing to shift, if hole is at the end of file */ + if (start >= stop_block) + return ret; + + /* + * Don't start shifting extents until we make sure the hole is big + * enough to accomodate the shift. + */ + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + depth = path->p_depth; + extent = path[depth].p_ext; + ex_start = extent->ee_block; + ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) + return -EINVAL; + + /* Its safe to start updating extents */ + while (start < stop_block) { + path = ext4_ext_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + current_block = extent->ee_block; + if (start > current_block) { + /* Hole, move to the next extent */ + ret = mext_next_extent(inode, path, &extent); + if (ret != 0) { + ext4_ext_drop_refs(path); + kfree(path); + if (ret == 1) + ret = 0; + break; + } + } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, &start); + ext4_ext_drop_refs(path); + kfree(path); + if (ret) + break; + } + + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t punch_start, punch_stop; + handle_t *handle; + unsigned int credits; + loff_t new_size; + int ret; + + BUG_ON(offset + len > i_size_read(inode)); + + /* Collapse range works only on fs block size aligned offsets. */ + if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || + len & (EXT4_BLOCK_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + trace_ext4_collapse_range(inode, offset, len); + + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + ret = -EPERM; + goto out_mutex; + } + + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_mutex; + } + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + truncate_pagecache_range(inode, offset, -1); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + new_size = i_size_read(inode) - len; + truncate_setsize(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 3981ff783950..0a014a7194b2 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode) while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u) %llu %llx", + printk(KERN_DEBUG " [%u/%u) %llu %x", es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); @@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, pr_warn("ES insert assertion failed for " "inode: %lu we can find an extent " "at block [%d/%d/%llu/%c], but we " - "want to add an delayed/hole extent " - "[%d/%d/%llu/%llx]\n", + "want to add a delayed/hole extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, @@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { pr_warn("ES insert assertion failed for inode: %lu " "can't find an extent at block %d but we want " - "to add an written/unwritten extent " - "[%d/%d/%llu/%llx]\n", inode->i_ino, + "to add a written/unwritten extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); } @@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, */ pr_warn("ES insert assertion failed for inode: %lu " "We can find blocks but we want to add a " - "delayed/hole extent [%d/%d/%llu/%llx]\n", + "delayed/hole extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; @@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, if (ext4_es_is_written(es)) { pr_warn("ES insert assertion failed for inode: %lu " "We can't find the block but we want to add " - "an written extent [%d/%d/%llu/%llx]\n", + "a written extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; @@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_insert_extent(inode, &newes); ext4_es_insert_extent_check(inode, &newes); @@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_cache_extent(inode, &newes); if (!len) @@ -812,13 +810,13 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; + block = 0x7FDEADBEEF; if (ext4_es_is_written(&orig_es) || - ext4_es_is_unwritten(&orig_es)) { + ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + orig_es.es_len - len2; - ext4_es_store_pblock(&newes, block); - } - ext4_es_store_status(&newes, ext4_es_status(&orig_es)); + ext4_es_store_pblock_status(&newes, block, + ext4_es_status(&orig_es)); err = __es_insert_extent(inode, &newes); if (err) { es->es_lblk = orig_es.es_lblk; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 167f4ab8ecc3..f1b62a419920 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es, (es->es_pblk & ~ES_MASK)); } +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (pb & ~ES_MASK)); +} + extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 175c3f933816..5b0d2c7d5408 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -504,6 +504,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; + int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -515,6 +516,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); @@ -553,7 +560,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXT4_GET_BLOCKS_KEEP_SIZE); } if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -580,7 +586,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -597,7 +603,13 @@ found: * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - return retval; + /* + * If we need to convert extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; /* * Here we clear m_flags because after allocating an new extent, @@ -653,7 +665,6 @@ found: ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -688,7 +699,7 @@ found: has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -3312,26 +3323,6 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } -/* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must @@ -3339,7 +3330,7 @@ int ext4_block_truncate_page(handle_t *handle, * the end of the block it will be shortened to end of the block * that cooresponds to 'from' */ -int ext4_block_zero_page_range(handle_t *handle, +static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -3429,6 +3420,26 @@ unlock: return err; } +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { @@ -3502,7 +3513,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - trace_ext4_punch_hole(inode, offset, length); + trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions @@ -3609,6 +3620,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -3682,7 +3699,7 @@ void ext4_truncate(struct inode *inode) /* * There is a possibility that we're either freeing the inode - * or it completely new indode. In those cases we might not + * or it's a completely new inode. In those cases we might not * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) @@ -3934,8 +3951,8 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - set_mask_bits(&inode->i_flags, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4154,11 +4171,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } } ret = 0; @@ -4328,8 +4347,7 @@ static int ext4_do_update_inode(handle_t *handle, goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); @@ -4374,12 +4392,15 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_block[block] = ei->i_data[block]; } - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } } ext4_inode_csum_set(inode, raw_inode, ei); @@ -4446,7 +4467,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_force_commit(inode->i_sb); @@ -4456,7 +4482,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; - if (wbc->sync_mode == WB_SYNC_ALL) + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a2a837f00407..0f2252ec274d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb, struct ext4_inode_info *ei_bl; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { - err = -EINVAL; - goto swap_boot_out; - } + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + return -EINVAL; - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto swap_boot_out; - } + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + return -EPERM; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); - if (IS_ERR(inode_bl)) { - err = PTR_ERR(inode_bl); - goto swap_boot_out; - } + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); filemap_flush(inode->i_mapping); @@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_mark_inode_dirty(handle, inode); } } - ext4_journal_stop(handle); - ext4_double_up_write_data_sem(inode, inode_bl); journal_err_out: ext4_inode_resume_unlocked_dio(inode); ext4_inode_resume_unlocked_dio(inode_bl); - unlock_two_nondirectories(inode, inode_bl); - iput(inode_bl); - -swap_boot_out: return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 04a5c7504be9..a888cac76e9c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1808,6 +1808,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; @@ -1936,7 +1937,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, */ break; } - + ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; @@ -1977,6 +1978,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; @@ -4006,8 +4008,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", - ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 08481ee84cd5..d634e183b4d4 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug; } \ } while (0) #else -#define mb_debug(n, fmt, a...) +#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ @@ -175,8 +175,6 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_tail; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 773b503bd18c..58ee7dc87669 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) * ext4_ext_path structure refers to the last extent, or a negative error * value on failure. */ -static int +int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { @@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) } if (!buffer_mapped(bh)) { zero_user(page, block_start, blocksize); - if (!err) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); continue; } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 710fed2377d4..f3c667091618 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -59,6 +59,7 @@ static struct kset *ext4_kset; static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; static struct ext4_features *ext4_feat; +static int ext4_mballoc_ready; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -845,6 +846,10 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; @@ -940,7 +945,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), @@ -3575,6 +3580,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "feature flags set on rev 0 fs, " "running e2fsck is recommended"); + if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { + set_opt2(sb, HURD_COMPAT); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "The Hurd can't support 64-bit file systems"); + goto failed_mount; + } + } + if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " @@ -4010,6 +4025,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); no_journal: + if (ext4_mballoc_ready) { + sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; + } + } + /* * Get the # of file system overhead blocks from the * superblock if present. @@ -4835,6 +4858,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (*flags & MS_RDONLY) { + err = sync_filesystem(sb); + if (err < 0) + goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; @@ -5515,12 +5541,10 @@ static int __init ext4_init_fs(void) goto out4; err = ext4_init_mballoc(); - if (err) - goto out3; - - err = ext4_init_xattr(); if (err) goto out2; + else + ext4_mballoc_ready = 1; err = init_inodecache(); if (err) goto out1; @@ -5536,10 +5560,9 @@ out: unregister_as_ext3(); destroy_inodecache(); out1: - ext4_exit_xattr(); -out2: + ext4_mballoc_ready = 0; ext4_exit_mballoc(); -out3: +out2: ext4_exit_feat_adverts(); out4: if (ext4_proc_root) @@ -5562,7 +5585,6 @@ static void __exit ext4_exit_fs(void) unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); - ext4_exit_xattr(); ext4_exit_mballoc(); ext4_exit_feat_adverts(); remove_proc_entry("fs/ext4", NULL); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e175e94116ac..1f5cf5880718 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -81,7 +81,7 @@ # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); @@ -90,8 +90,6 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *, static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); -static struct mb_cache *ext4_xattr_cache; - static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; +#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_mb_cache) + static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) @@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -286,7 +288,7 @@ bad_block: error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); if (error == -EIO) @@ -409,6 +411,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -430,7 +433,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: @@ -526,8 +529,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, { struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); - ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); error = ext4_journal_get_write_access(handle, bh); if (error) goto out; @@ -567,12 +571,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - *total += EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_block && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } + if (total) + *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } @@ -745,13 +750,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_search *s = &bs->s; struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, bs->bh->b_blocknr); error = ext4_journal_get_write_access(handle, bs->bh); if (error) @@ -769,7 +775,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), s->here); - ext4_xattr_cache_insert(bs->bh); + ext4_xattr_cache_insert(ext4_mb_cache, + bs->bh); } unlock_buffer(bs->bh); if (error == -EIO) @@ -905,7 +912,7 @@ getblk_failed: memcpy(new_bh->b_data, s->base, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(new_bh); + ext4_xattr_cache_insert(ext4_mb_cache, new_bh); error = ext4_handle_dirty_xattr_block(handle, inode, new_bh); if (error) @@ -1228,7 +1235,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t min_offs, free; - int total_ino, total_blk; + int total_ino; void *base, *start, *end; int extra_isize = 0, error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); @@ -1286,8 +1293,7 @@ retry: first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, - &total_blk); + free = ext4_xattr_free_space(first, &min_offs, base, NULL); if (free < new_extra_isize) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; @@ -1495,13 +1501,13 @@ ext4_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); if (!ce) { ea_bdebug(bh, "out of memory"); return; @@ -1573,12 +1579,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, + ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, hash); while (ce) { struct buffer_head *bh; @@ -1676,19 +1683,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -ext4_init_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb_cache * +ext4_xattr_create_cache(char *name) { - ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); - if (!ext4_xattr_cache) - return -ENOMEM; - return 0; + return mb_cache_create(name, HASH_BUCKET_BITS); } -void -ext4_exit_xattr(void) +void ext4_xattr_destroy_cache(struct mb_cache *cache) { - if (ext4_xattr_cache) - mb_cache_destroy(ext4_xattr_cache); - ext4_xattr_cache = NULL; + if (cache) + mb_cache_destroy(cache); } + diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 819d6398833f..29bedf5589f6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -110,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -extern int __init ext4_init_xattr(void); -extern void ext4_exit_xattr(void); - extern const struct xattr_handler *ext4_xattr_handlers[]; extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, @@ -124,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); +extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern void ext4_xattr_destroy_cache(struct mb_cache *); + #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1a85f83abd53..856bdf994c0a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -568,6 +568,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_mount_info org_mount_opt; int err, active_logs; + sync_filesystem(sb); + /* * Save the old mount options in case we * need to restore them. diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c68d9f27135e..b3361fe2bcb5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -635,6 +635,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + sync_filesystem(sb); + /* make sure we update state on remount. */ new_rdonly = *flags & MS_RDONLY; if (new_rdonly != (sb->s_flags & MS_RDONLY)) { diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index e37eb274e492..7ca8c75d50d3 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) static int vxfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b4bff1b15028..8d611696fcad 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode) static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (*flags & MS_MANDLOCK) return -EINVAL; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 033ee975a895..de8afad89e51 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1167,6 +1167,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) struct gfs2_tune *gt = &sdp->sd_tune; int error; + sync_filesystem(sb); + spin_lock(>->gt_spin); args.ar_commit = gt->gt_logd_secs; args.ar_quota_quantum = gt->gt_quota_quantum; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 2d2039e754cd..eee7206c38d1 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index a6abf87d79d0..a513d2d36be9 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -323,6 +323,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfsplus_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (!(*flags & MS_RDONLY)) { diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 4534ff688b76..fe3463a43236 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -421,6 +421,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) struct hpfs_sb_info *sbi = hpfs_sb(s); char *new_opts = kstrdup(data, GFP_KERNEL); + sync_filesystem(s); + *flags |= MS_NOATIME; hpfs_lock(s); diff --git a/fs/inode.c b/fs/inode.c index fb59ba7967f1..f96d2a6f88cc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1898,3 +1898,34 @@ void inode_dio_done(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } EXPORT_SYMBOL(inode_dio_done); + +/* + * inode_set_flags - atomically set some inode flags + * + * Note: the caller should be holding i_mutex, or else be sure that + * they have exclusive access to the inode structure (i.e., while the + * inode is being instantiated). The reason for the cmpxchg() loop + * --- which wouldn't be necessary if all code paths which modify + * i_flags actually followed this rule, is that there is at least one + * code path which doesn't today --- for example, + * __generic_file_aio_write() calls file_remove_suid() without holding + * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * + * In the long run, i_mutex is overkill, and we should probably look + * at using the i_lock spinlock to protect i_flags, and then make sure + * it is so documented in include/linux/fs.h and that all code follows + * the locking convention!! + */ +void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask) +{ + unsigned int old_flags, new_flags; + + WARN_ON_ONCE(flags & ~mask); + do { + old_flags = ACCESS_ONCE(inode->i_flags); + new_flags = (old_flags & ~mask) | flags; + } while (unlikely(cmpxchg(&inode->i_flags, old_flags, + new_flags) != old_flags)); +} +EXPORT_SYMBOL(inode_set_flags); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 4a9e10ea13f2..6af66ee56390 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -117,6 +117,7 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (!(*flags & MS_RDONLY)) return -EROFS; return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index cf2fc0594063..5f26139a165a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); jbd2_journal_write_revoke_records(journal, commit_transaction, &log_bufs, WRITE_SYNC); - blk_finish_plug(&plug); jbd_debug(3, "JBD2: commit phase 2b\n"); @@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) err = 0; bufs = 0; descriptor = NULL; - blk_start_plug(&plug); while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ @@ -1067,6 +1065,25 @@ restart_loop: goto restart_loop; } + /* Add the transaction to the checkpoint list + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + spin_unlock(&journal->j_list_lock); + /* Done with this transaction! */ jbd_debug(3, "JBD2: commit phase 7\n"); @@ -1085,24 +1102,7 @@ restart_loop: atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); - - /* - * Calculate overall stats - */ - spin_lock(&journal->j_history_lock); - journal->j_stats.ts_tid++; - if (commit_transaction->t_requested) - journal->j_stats.ts_requested++; - journal->j_stats.run.rs_wait += stats.run.rs_wait; - journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; - journal->j_stats.run.rs_running += stats.run.rs_running; - journal->j_stats.run.rs_locked += stats.run.rs_locked; - journal->j_stats.run.rs_flushing += stats.run.rs_flushing; - journal->j_stats.run.rs_logging += stats.run.rs_logging; - journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; - journal->j_stats.run.rs_blocks += stats.run.rs_blocks; - journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; - spin_unlock(&journal->j_history_lock); + stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0; commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); @@ -1122,24 +1122,6 @@ restart_loop: write_unlock(&journal->j_state_lock); - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = - commit_transaction; - } - spin_unlock(&journal->j_list_lock); - /* Drop all spin_locks because commit_callback may be block. - * __journal_remove_checkpoint() can not destroy transaction - * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); @@ -1150,7 +1132,7 @@ restart_loop: write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; - /* Recheck checkpoint lists after j_list_lock was dropped */ + /* Check if the transaction can be dropped now that we are finished */ if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); @@ -1159,4 +1141,21 @@ restart_loop: spin_unlock(&journal->j_list_lock); write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); + + /* + * Calculate overall stats + */ + spin_lock(&journal->j_history_lock); + journal->j_stats.ts_tid++; + journal->j_stats.ts_requested += stats.ts_requested; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5fa344afb49a..67b8e303946c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug); #endif /* Checksumming functions */ -int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) +static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) return cpu_to_be32(csum); } -int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) +static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) return sb->s_checksum == jbd2_superblock_csum(j, sb); } -void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) +static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; @@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal) journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { - wake_up(&journal->j_wait_commit); write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); write_lock(&journal->j_state_lock); } @@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); - wake_up(&journal->j_wait_commit); read_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); read_lock(&journal->j_state_lock); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 60bb365f54a5..38cfcf5f6fce 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * reused here. */ jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && @@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) jh->b_modified = 0; JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; JBUFFER_TRACE(jh, "set next transaction"); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; } spin_unlock(&journal->j_list_lock); @@ -1312,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) journal->j_running_transaction)) { printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " - "journal->j_running_transaction (%p, %u)", + "journal->j_running_transaction (%p, %u)\n", journal->j_devname, (unsigned long long) bh->b_blocknr, jh->b_transaction, @@ -1335,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - if (unlikely(jh->b_transaction != - journal->j_committing_transaction)) { - printk(KERN_ERR "JBD2: %s: " - "jh->b_transaction (%llu, %p, %u) != " - "journal->j_committing_transaction (%p, %u)", + if (unlikely(((jh->b_transaction != + journal->j_committing_transaction)) || + (jh->b_next_transaction != transaction))) { + printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " + "bad jh for block %llu: " + "transaction (%p, %u), " + "jh->b_transaction (%p, %u), " + "jh->b_next_transaction (%p, %u), jlist %u\n", journal->j_devname, (unsigned long long) bh->b_blocknr, + transaction, transaction->t_tid, jh->b_transaction, - jh->b_transaction ? jh->b_transaction->t_tid : 0, - journal->j_committing_transaction, - journal->j_committing_transaction ? - journal->j_committing_transaction->t_tid : 0); - ret = -EINVAL; - } - if (unlikely(jh->b_next_transaction != transaction)) { - printk(KERN_ERR "JBD2: %s: " - "jh->b_next_transaction (%llu, %p, %u) != " - "transaction (%p, %u)", - journal->j_devname, - (unsigned long long) bh->b_blocknr, + jh->b_transaction ? + jh->b_transaction->t_tid : 0, jh->b_next_transaction, jh->b_next_transaction ? jh->b_next_transaction->t_tid : 0, - transaction, transaction->t_tid); + jh->b_jlist); + WARN_ON(1); ret = -EINVAL; } /* And this case is illegal: we can't reuse another @@ -1415,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); if (!buffer_jbd(bh)) goto not_jbd; @@ -1468,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) * we know to remove the checkpoint after we commit. */ + spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction) { __jbd2_journal_temp_unlink_buffer(jh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); @@ -1480,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) goto drop; } } + spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); @@ -1491,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = NULL; + spin_unlock(&journal->j_list_lock); /* * only drop a reference if this transaction modified @@ -1503,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) } not_jbd: - spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); __brelse(bh); drop: @@ -1821,11 +1819,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (buffer_locked(bh) || buffer_dirty(bh)) goto out; - if (jh->b_next_transaction != NULL) + if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL) { /* written-back checkpointed metadata buffer */ JBUFFER_TRACE(jh, "remove from checkpoint list"); __jbd2_journal_remove_checkpoint(jh); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 0defb1cc2a35..0918f0e2e266 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); int err; + sync_filesystem(sb); err = jffs2_parse_options(c, data); if (err) return -EINVAL; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index e2b7483444fd..97f7fda51890 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -418,6 +418,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) int flag = JFS_SBI(sb)->flag; int ret; + sync_filesystem(sb); if (!parse_options(data, sb, &newLVSize, &flag)) { return -EINVAL; } diff --git a/fs/mbcache.c b/fs/mbcache.c index e519e45bf673..bf166e388f0d 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -26,6 +26,41 @@ * back on the lru list. */ +/* + * Lock descriptions and usage: + * + * Each hash chain of both the block and index hash tables now contains + * a built-in lock used to serialize accesses to the hash chain. + * + * Accesses to global data structures mb_cache_list and mb_cache_lru_list + * are serialized via the global spinlock mb_cache_spinlock. + * + * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize + * accesses to its local data, such as e_used and e_queued. + * + * Lock ordering: + * + * Each block hash chain's lock has the highest lock order, followed by an + * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's + * lock), and mb_cach_spinlock, with the lowest order. While holding + * either a block or index hash chain lock, a thread can acquire an + * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. + * + * Synchronization: + * + * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and + * index hash chian, it needs to lock the corresponding hash chain. For each + * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to + * prevent either any simultaneous release or free on the entry and also + * to serialize accesses to either the e_used or e_queued member of the entry. + * + * To avoid having a dangling reference to an already freed + * mb_cache_entry, an mb_cache_entry is only freed when it is not on a + * block hash chain and also no longer being referenced, both e_used, + * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is + * first removed from a block hash chain. + */ + #include #include @@ -34,9 +69,10 @@ #include #include #include -#include +#include #include - +#include +#include #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -57,8 +93,14 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) +#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ + (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) + static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); - +static struct blockgroup_lock *mb_cache_bg_lock; +static struct kmem_cache *mb_cache_kmem_cache; + MODULE_AUTHOR("Andreas Gruenbacher "); MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); MODULE_LICENSE("GPL"); @@ -86,58 +128,110 @@ static LIST_HEAD(mb_cache_list); static LIST_HEAD(mb_cache_lru_list); static DEFINE_SPINLOCK(mb_cache_spinlock); +static inline void +__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_lock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + +static inline void +__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + static inline int -__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) +__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) { - return !list_empty(&ce->e_block_list); + return !hlist_bl_unhashed(&ce->e_block_list); } -static void -__mb_cache_entry_unhash(struct mb_cache_entry *ce) +static inline void +__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) { - if (__mb_cache_entry_is_hashed(ce)) { - list_del_init(&ce->e_block_list); - list_del(&ce->e_index.o_list); - } + if (__mb_cache_entry_is_block_hashed(ce)) + hlist_bl_del_init(&ce->e_block_list); } +static inline int +__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) +{ + return !hlist_bl_unhashed(&ce->e_index.o_list); +} + +static inline void +__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) +{ + if (__mb_cache_entry_is_index_hashed(ce)) + hlist_bl_del_init(&ce->e_index.o_list); +} + +/* + * __mb_cache_entry_unhash_unlock() + * + * This function is called to unhash both the block and index hash + * chain. + * It assumes both the block and index hash chain is locked upon entry. + * It also unlock both hash chains both exit + */ +static inline void +__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) +{ + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); +} static void __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) { struct mb_cache *cache = ce->e_cache; - mb_assert(!(ce->e_used || ce->e_queued)); + mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); kmem_cache_free(cache->c_entry_cache, ce); atomic_dec(&cache->c_entry_count); } - static void -__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) - __releases(mb_cache_spinlock) +__mb_cache_entry_release(struct mb_cache_entry *ce) { + /* First lock the entry to serialize access to its local data. */ + __spin_lock_mb_cache_entry(ce); /* Wake up all processes queuing for this cache entry. */ if (ce->e_queued) wake_up_all(&mb_cache_queue); if (ce->e_used >= MB_CACHE_WRITER) ce->e_used -= MB_CACHE_WRITER; + /* + * Make sure that all cache entries on lru_list have + * both e_used and e_qued of 0s. + */ ce->e_used--; - if (!(ce->e_used || ce->e_queued)) { - if (!__mb_cache_entry_is_hashed(ce)) + if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { + if (!__mb_cache_entry_is_block_hashed(ce)) { + __spin_unlock_mb_cache_entry(ce); goto forget; - mb_assert(list_empty(&ce->e_lru_list)); - list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + } + /* + * Need access to lru list, first drop entry lock, + * then reacquire the lock in the proper order. + */ + spin_lock(&mb_cache_spinlock); + if (list_empty(&ce->e_lru_list)) + list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + spin_unlock(&mb_cache_spinlock); } - spin_unlock(&mb_cache_spinlock); + __spin_unlock_mb_cache_entry(ce); return; forget: - spin_unlock(&mb_cache_spinlock); + mb_assert(list_empty(&ce->e_lru_list)); __mb_cache_entry_forget(ce, GFP_KERNEL); } - /* * mb_cache_shrink_scan() memory pressure callback * @@ -160,17 +254,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) mb_debug("trying to free %d entries", nr_to_scan); spin_lock(&mb_cache_spinlock); - while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { + while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { struct mb_cache_entry *ce = list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); - freed++; + struct mb_cache_entry, e_lru_list); + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* Prevent any find or get operation on the entry */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + list_add_tail(&ce->e_lru_list, &free_list); + spin_lock(&mb_cache_spinlock); } spin_unlock(&mb_cache_spinlock); + list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { __mb_cache_entry_forget(entry, gfp_mask); + freed++; } return freed; } @@ -215,29 +326,40 @@ mb_cache_create(const char *name, int bucket_bits) int n, bucket_count = 1 << bucket_bits; struct mb_cache *cache = NULL; + if (!mb_cache_bg_lock) { + mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), + GFP_KERNEL); + if (!mb_cache_bg_lock) + return NULL; + bgl_lock_init(mb_cache_bg_lock); + } + cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) return NULL; cache->c_name = name; atomic_set(&cache->c_entry_count, 0); cache->c_bucket_bits = bucket_bits; - cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + cache->c_block_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_block_hash) goto fail; for (n=0; nc_block_hash[n]); - cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]); + cache->c_index_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_index_hash) goto fail; for (n=0; nc_index_hash[n]); - cache->c_entry_cache = kmem_cache_create(name, - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!cache->c_entry_cache) - goto fail2; + INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]); + if (!mb_cache_kmem_cache) { + mb_cache_kmem_cache = kmem_cache_create(name, + sizeof(struct mb_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!mb_cache_kmem_cache) + goto fail2; + } + cache->c_entry_cache = mb_cache_kmem_cache; /* * Set an upper limit on the number of cache entries so that the hash @@ -273,21 +395,47 @@ void mb_cache_shrink(struct block_device *bdev) { LIST_HEAD(free_list); - struct list_head *l, *ltmp; + struct list_head *l; + struct mb_cache_entry *ce, *tmp; + l = &mb_cache_lru_list; spin_lock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_lru_list); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); if (ce->e_bdev == bdev) { - list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + list_add_tail(&ce->e_lru_list, &free_list); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); } } spin_unlock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(ce, GFP_KERNEL); } } @@ -303,23 +451,27 @@ void mb_cache_destroy(struct mb_cache *cache) { LIST_HEAD(free_list); - struct list_head *l, *ltmp; + struct mb_cache_entry *ce, *tmp; spin_lock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_cache == cache) { + list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { + if (ce->e_cache == cache) list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); - } } list_del(&cache->c_cache_list); spin_unlock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + list_del_init(&ce->e_lru_list); + /* + * Prevent any find or get operation on the entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + __mb_cache_entry_forget(ce, GFP_KERNEL); } if (atomic_read(&cache->c_entry_count) > 0) { @@ -328,8 +480,10 @@ mb_cache_destroy(struct mb_cache *cache) atomic_read(&cache->c_entry_count)); } - kmem_cache_destroy(cache->c_entry_cache); - + if (list_empty(&mb_cache_list)) { + kmem_cache_destroy(mb_cache_kmem_cache); + mb_cache_kmem_cache = NULL; + } kfree(cache->c_index_hash); kfree(cache->c_block_hash); kfree(cache); @@ -346,28 +500,61 @@ mb_cache_destroy(struct mb_cache *cache) struct mb_cache_entry * mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) { - struct mb_cache_entry *ce = NULL; + struct mb_cache_entry *ce; if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { + struct list_head *l; + + l = &mb_cache_lru_list; spin_lock(&mb_cache_spinlock); - if (!list_empty(&mb_cache_lru_list)) { - ce = list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_del_init(&ce->e_lru_list); - __mb_cache_entry_unhash(ce); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_cache == cache) { + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the + * entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + mb_assert(list_empty(&ce->e_lru_list)); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + goto found; + } } spin_unlock(&mb_cache_spinlock); } - if (!ce) { - ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); - if (!ce) - return NULL; - atomic_inc(&cache->c_entry_count); - INIT_LIST_HEAD(&ce->e_lru_list); - INIT_LIST_HEAD(&ce->e_block_list); - ce->e_cache = cache; - ce->e_queued = 0; - } + + ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); + if (!ce) + return NULL; + atomic_inc(&cache->c_entry_count); + INIT_LIST_HEAD(&ce->e_lru_list); + INIT_HLIST_BL_NODE(&ce->e_block_list); + INIT_HLIST_BL_NODE(&ce->e_index.o_list); + ce->e_cache = cache; + ce->e_queued = 0; + atomic_set(&ce->e_refcnt, 0); +found: + ce->e_block_hash_p = &cache->c_block_hash[0]; + ce->e_index_hash_p = &cache->c_index_hash[0]; ce->e_used = 1 + MB_CACHE_WRITER; return ce; } @@ -393,29 +580,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, { struct mb_cache *cache = ce->e_cache; unsigned int bucket; - struct list_head *l; - int error = -EBUSY; + struct hlist_bl_node *l; + struct hlist_bl_head *block_hash_p; + struct hlist_bl_head *index_hash_p; + struct mb_cache_entry *lce; + mb_assert(ce); bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); - spin_lock(&mb_cache_spinlock); - list_for_each_prev(l, &cache->c_block_hash[bucket]) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_block_list); - if (ce->e_bdev == bdev && ce->e_block == block) - goto out; + block_hash_p = &cache->c_block_hash[bucket]; + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { + if (lce->e_bdev == bdev && lce->e_block == block) { + hlist_bl_unlock(block_hash_p); + return -EBUSY; + } } - __mb_cache_entry_unhash(ce); + mb_assert(!__mb_cache_entry_is_block_hashed(ce)); + __mb_cache_entry_unhash_block(ce); + __mb_cache_entry_unhash_index(ce); ce->e_bdev = bdev; ce->e_block = block; - list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); + ce->e_block_hash_p = block_hash_p; ce->e_index.o_key = key; + hlist_bl_add_head(&ce->e_block_list, block_hash_p); + hlist_bl_unlock(block_hash_p); bucket = hash_long(key, cache->c_bucket_bits); - list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); - error = 0; -out: - spin_unlock(&mb_cache_spinlock); - return error; + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + ce->e_index_hash_p = index_hash_p; + hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); + hlist_bl_unlock(index_hash_p); + return 0; } @@ -429,24 +625,26 @@ out: void mb_cache_entry_release(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); - __mb_cache_entry_release_unlock(ce); + __mb_cache_entry_release(ce); } /* * mb_cache_entry_free() * - * This is equivalent to the sequence mb_cache_entry_takeout() -- - * mb_cache_entry_release(). */ void mb_cache_entry_free(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); + mb_assert(ce); mb_assert(list_empty(&ce->e_lru_list)); - __mb_cache_entry_unhash(ce); - __mb_cache_entry_release_unlock(ce); + hlist_bl_lock(ce->e_index_hash_p); + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_lock(ce->e_block_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); + __mb_cache_entry_release(ce); } @@ -463,84 +661,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, sector_t block) { unsigned int bucket; - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *block_hash_p; bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); - spin_lock(&mb_cache_spinlock); - list_for_each(l, &cache->c_block_hash[bucket]) { - ce = list_entry(l, struct mb_cache_entry, e_block_list); + block_hash_p = &cache->c_block_hash[bucket]; + /* First serialize access to the block corresponding hash chain. */ + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { + mb_assert(ce->e_block_hash_p == block_hash_p); if (ce->e_bdev == bdev && ce->e_block == block) { - DEFINE_WAIT(wait); - - if (!list_empty(&ce->e_lru_list)) - list_del_init(&ce->e_lru_list); - - while (ce->e_used > 0) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&mb_cache_spinlock); - schedule(); - spin_lock(&mb_cache_spinlock); - ce->e_queued--; + /* + * Prevent a free from removing the entry. + */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(block_hash_p); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + if (ce->e_used > 0) { + DEFINE_WAIT(wait); + while (ce->e_used > 0) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); } - finish_wait(&mb_cache_queue, &wait); ce->e_used += 1 + MB_CACHE_WRITER; + __spin_unlock_mb_cache_entry(ce); - if (!__mb_cache_entry_is_hashed(ce)) { - __mb_cache_entry_release_unlock(ce); + if (!list_empty(&ce->e_lru_list)) { + spin_lock(&mb_cache_spinlock); + list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); + } + if (!__mb_cache_entry_is_block_hashed(ce)) { + __mb_cache_entry_release(ce); return NULL; } - goto cleanup; + return ce; } } - ce = NULL; - -cleanup: - spin_unlock(&mb_cache_spinlock); - return ce; + hlist_bl_unlock(block_hash_p); + return NULL; } #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) static struct mb_cache_entry * -__mb_cache_entry_find(struct list_head *l, struct list_head *head, +__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, struct block_device *bdev, unsigned int key) { - while (l != head) { + + /* The index hash chain is alredy acquire by caller. */ + while (l != NULL) { struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_index.o_list); + hlist_bl_entry(l, struct mb_cache_entry, + e_index.o_list); + mb_assert(ce->e_index_hash_p == head); if (ce->e_bdev == bdev && ce->e_index.o_key == key) { - DEFINE_WAIT(wait); - - if (!list_empty(&ce->e_lru_list)) - list_del_init(&ce->e_lru_list); - + /* + * Prevent a free from removing the entry. + */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(head); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + ce->e_used++; /* Incrementing before holding the lock gives readers priority over writers. */ - ce->e_used++; - while (ce->e_used >= MB_CACHE_WRITER) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&mb_cache_spinlock); - schedule(); - spin_lock(&mb_cache_spinlock); - ce->e_queued--; - } - finish_wait(&mb_cache_queue, &wait); + if (ce->e_used >= MB_CACHE_WRITER) { + DEFINE_WAIT(wait); - if (!__mb_cache_entry_is_hashed(ce)) { - __mb_cache_entry_release_unlock(ce); + while (ce->e_used >= MB_CACHE_WRITER) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + } + __spin_unlock_mb_cache_entry(ce); + if (!list_empty(&ce->e_lru_list)) { spin_lock(&mb_cache_spinlock); + list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); + } + if (!__mb_cache_entry_is_block_hashed(ce)) { + __mb_cache_entry_release(ce); return ERR_PTR(-EAGAIN); } return ce; } l = l->next; } + hlist_bl_unlock(head); return NULL; } @@ -562,13 +786,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, unsigned int key) { unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; - struct mb_cache_entry *ce; + struct hlist_bl_node *l; + struct mb_cache_entry *ce = NULL; + struct hlist_bl_head *index_hash_p; - spin_lock(&mb_cache_spinlock); - l = cache->c_index_hash[bucket].next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); - spin_unlock(&mb_cache_spinlock); + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + if (!hlist_bl_empty(index_hash_p)) { + l = hlist_bl_first(index_hash_p); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + } else + hlist_bl_unlock(index_hash_p); return ce; } @@ -597,13 +825,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, { struct mb_cache *cache = prev->e_cache; unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *index_hash_p; - spin_lock(&mb_cache_spinlock); + index_hash_p = &cache->c_index_hash[bucket]; + mb_assert(prev->e_index_hash_p == index_hash_p); + hlist_bl_lock(index_hash_p); + mb_assert(!hlist_bl_empty(index_hash_p)); l = prev->e_index.o_list.next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); - __mb_cache_entry_release_unlock(prev); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + __mb_cache_entry_release(prev); return ce; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 0ad2ec9601de..f007a3355570 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) struct minix_sb_info * sbi = minix_sb(sb); struct minix_super_block * ms; + sync_filesystem(sb); ms = sbi->s_ms; if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index ee59d35ff069..647d86d2db39 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -99,6 +99,7 @@ static void destroy_inodecache(void) static int ncp_remount(struct super_block *sb, int *flags, char* data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return 0; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 910ed906eb82..2cb56943e232 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2215,6 +2215,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; u32 nfsvers = nfss->nfs_client->rpc_ops->version; + sync_filesystem(sb); + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 7ac2a122ca1d..8c532b2ca3ab 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_mount_opt; int err; + sync_filesystem(sb); old_sb_flags = sb->s_flags; old_mount_opt = nilfs->ns_mount_opt; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 82650d52d916..bd5610d48242 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -468,6 +468,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_debug("Entering with remount options string: %s", opt); + sync_filesystem(sb); + #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ *flags |= MS_RDONLY; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1aecd626e645..a7cdd56f4c79 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -634,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) struct ocfs2_super *osb = OCFS2_SB(sb); u32 tmp; + sync_filesystem(sb); + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || !ocfs2_check_set_options(sb, &parsed_options)) { ret = -EINVAL; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 8c0ceb8dd1f7..15e4500cda3e 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino) static int openprom_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NOATIME; return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 7bbeb5257af1..5dbadecb234d 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) int proc_remount(struct super_block *sb, int *flags, char *data) { struct pid_namespace *pid = sb->s_fs_info; + + sync_filesystem(sb); return !proc_parse_options(data, pid); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 12823845d324..192297b0090d 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -249,6 +249,7 @@ static void parse_options(char *options) static int pstore_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); parse_options(data); return 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 89558810381c..c4bcb778886e 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -44,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data) { struct qnx4_sb_info *qs; + sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; *flags |= MS_RDONLY; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 8d941edfefa1..65cdaab3ed49 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root) static int qnx6_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ed54a04c33bd..9fb20426005e 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1318,6 +1318,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) int i; #endif + sync_filesystem(s); reiserfs_write_lock(s); #ifdef CONFIG_QUOTA diff --git a/fs/romfs/super.c b/fs/romfs/super.c index d8418782862b..ef90e8bca95a 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ static int romfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 202df6312d4e..031c8d67fd51 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int squashfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/super.c b/fs/super.c index 80d5cf2ca765..e9dc3c3fe159 100644 --- a/fs/super.c +++ b/fs/super.c @@ -719,8 +719,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } } - sync_filesystem(sb); - if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); if (retval) { diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 5625ca920f5e..88956309cc86 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data) { struct sysv_sb_info *sbi = SYSV_SB(sb); + sync_filesystem(sb); if (sbi->s_forced_ro) *flags |= MS_RDONLY; return 0; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 48f943f7f5d5..a1266089eca1 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1827,6 +1827,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) int err; struct ubifs_info *c = sb->s_fs_info; + sync_filesystem(sb); dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); err = ubifs_parse_options(c, data, 1); diff --git a/fs/udf/super.c b/fs/udf/super.c index 3306b9f69bed..64f2b7334d08 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -646,6 +646,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) int error = 0; struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); + sync_filesystem(sb); if (lvidiu) { int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 329f2f53b7ed..b8c6791f046f 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1280,6 +1280,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned new_mount_opt, ufstype; unsigned flags; + sync_filesystem(sb); lock_ufs(sb); mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0ef599218991..205376776377 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1197,6 +1197,7 @@ xfs_fs_remount( char *p; int error; + sync_filesystem(sb); while ((p = strsep(&options, ",")) != NULL) { int token; diff --git a/include/linux/fs.h b/include/linux/fs.h index a877ed3f389f..ea80f1cdff06 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2572,6 +2572,9 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, void inode_dio_wait(struct inode *inode); void inode_dio_done(struct inode *inode); +extern void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask); + extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 5525d370701d..6a392e7a723a 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -3,19 +3,21 @@ (C) 2001 by Andreas Gruenbacher, */ - struct mb_cache_entry { struct list_head e_lru_list; struct mb_cache *e_cache; unsigned short e_used; unsigned short e_queued; + atomic_t e_refcnt; struct block_device *e_bdev; sector_t e_block; - struct list_head e_block_list; + struct hlist_bl_node e_block_list; struct { - struct list_head o_list; + struct hlist_bl_node o_list; unsigned int o_key; } e_index; + struct hlist_bl_head *e_block_hash_p; + struct hlist_bl_head *e_index_hash_p; }; struct mb_cache { @@ -25,8 +27,8 @@ struct mb_cache { int c_max_entries; int c_bucket_bits; struct kmem_cache *c_entry_cache; - struct list_head *c_block_hash; - struct list_head *c_index_hash; + struct hlist_bl_head *c_block_hash; + struct hlist_bl_head *c_index_hash; }; /* Functions on caches */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 197d3125df2a..010ea89eeb0e 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -16,6 +16,15 @@ struct mpage_da_data; struct ext4_map_blocks; struct extent_status; +/* shim until we merge in the xfs_collapse_range branch */ +#ifndef FALLOC_FL_COLLAPSE_RANGE +#define FALLOC_FL_COLLAPSE_RANGE 0x08 +#endif + +#ifndef FALLOC_FL_ZERO_RANGE +#define FALLOC_FL_ZERO_RANGE 0x10 +#endif + #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) #define show_mballoc_flags(flags) __print_flags(flags, "|", \ @@ -68,6 +77,13 @@ struct extent_status; { EXTENT_STATUS_DELAYED, "D" }, \ { EXTENT_STATUS_HOLE, "H" }) +#define show_falloc_mode(mode) __print_flags(mode, "|", \ + { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ + { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ + { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \ + { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ + { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) + TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), @@ -1328,7 +1344,7 @@ TRACE_EVENT(ext4_direct_IO_exit, __entry->rw, __entry->ret) ); -TRACE_EVENT(ext4_fallocate_enter, +DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode), @@ -1336,23 +1352,45 @@ TRACE_EVENT(ext4_fallocate_enter, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( loff_t, pos ) - __field( loff_t, len ) + __field( loff_t, offset ) + __field( loff_t, len ) __field( int, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pos = offset; + __entry->offset = offset; __entry->len = len; __entry->mode = mode; ), - TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d", + TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->pos, - __entry->len, __entry->mode) + (unsigned long) __entry->ino, + __entry->offset, __entry->len, + show_falloc_mode(__entry->mode)) +); + +DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter, + + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), + + TP_ARGS(inode, offset, len, mode) +); + +DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole, + + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), + + TP_ARGS(inode, offset, len, mode) +); + +DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range, + + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), + + TP_ARGS(inode, offset, len, mode) ); TRACE_EVENT(ext4_fallocate_exit, @@ -1384,31 +1422,6 @@ TRACE_EVENT(ext4_fallocate_exit, __entry->ret) ); -TRACE_EVENT(ext4_punch_hole, - TP_PROTO(struct inode *inode, loff_t offset, loff_t len), - - TP_ARGS(inode, offset, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, offset ) - __field( loff_t, len ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->offset = offset; - __entry->len = len; - ), - - TP_printk("dev %d,%d ino %lu offset %lld len %lld", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->offset, __entry->len) -); - TRACE_EVENT(ext4_unlink_enter, TP_PROTO(struct inode *parent, struct dentry *dentry), @@ -2410,6 +2423,31 @@ TRACE_EVENT(ext4_es_shrink_exit, __entry->shrunk_nr, __entry->cache_cnt) ); +TRACE_EVENT(ext4_collapse_range, + TP_PROTO(struct inode *inode, loff_t offset, loff_t len), + + TP_ARGS(inode, offset, len), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, offset) + __field(loff_t, len) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu offset %lld len %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->offset, __entry->len) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */