ext4: Use readahead when reading an inode from the inode table
With modern hard drives, reading 64k takes roughly the same time as reading a 4k block. So request readahead for adjacent inode table blocks to reduce the time it takes when iterating over directories (especially when doing this in htree sort order) in a cold cache case. With this patch, the time it takes to run "git status" on a kernel tree after flushing the caches via "echo 3 > /proc/sys/vm/drop_caches" is reduced by 21%. Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
This commit is contained in:
parent
37515facd0
commit
240799cdf2
@ -177,6 +177,11 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
|
||||
your disks are battery-backed in one way or another,
|
||||
disabling barriers may safely improve performance.
|
||||
|
||||
inode_readahead_blks=n This tuning parameter controls the maximum
|
||||
number of inode table blocks that ext4's inode
|
||||
table readahead algorithm will pre-read into
|
||||
the buffer cache. The default value is 32 blocks.
|
||||
|
||||
orlov (*) This enables the new Orlov block allocator. It is
|
||||
enabled by default.
|
||||
|
||||
@ -252,6 +257,7 @@ stripe=n Number of filesystem blocks that mballoc will try
|
||||
delalloc (*) Deferring block allocation until write-out time.
|
||||
nodelalloc Disable delayed allocation. Blocks are allocated
|
||||
when data is copied from user to page cache.
|
||||
|
||||
Data Mode
|
||||
=========
|
||||
There are 3 different data modes:
|
||||
|
@ -956,6 +956,9 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
|
||||
files are packed closely together. Each large file
|
||||
will have its blocks allocated out of its own unique
|
||||
preallocation pool.
|
||||
inode_readahead_blks Tuning parameter which controls the maximum number of
|
||||
inode table blocks that ext4's inode table readahead
|
||||
algorithm will pre-read into the buffer cache.
|
||||
..............................................................................
|
||||
|
||||
|
||||
|
@ -790,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
|
||||
#define EXT4_DEF_RESUID 0
|
||||
#define EXT4_DEF_RESGID 0
|
||||
|
||||
#define EXT4_DEF_INODE_READAHEAD_BLKS 32
|
||||
|
||||
/*
|
||||
* Default mount options
|
||||
*/
|
||||
|
@ -52,6 +52,7 @@ struct ext4_sb_info {
|
||||
int s_desc_per_block_bits;
|
||||
int s_inode_size;
|
||||
int s_first_ino;
|
||||
unsigned int s_inode_readahead_blks;
|
||||
spinlock_t s_next_gen_lock;
|
||||
u32 s_next_generation;
|
||||
u32 s_hash_seed[4];
|
||||
|
128
fs/ext4/inode.c
128
fs/ext4/inode.c
@ -3833,41 +3833,6 @@ out_stop:
|
||||
ext4_journal_stop(handle);
|
||||
}
|
||||
|
||||
static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
|
||||
unsigned long ino, struct ext4_iloc *iloc)
|
||||
{
|
||||
ext4_group_t block_group;
|
||||
unsigned long offset;
|
||||
ext4_fsblk_t block;
|
||||
struct ext4_group_desc *gdp;
|
||||
|
||||
if (!ext4_valid_inum(sb, ino)) {
|
||||
/*
|
||||
* This error is already checked for in namei.c unless we are
|
||||
* looking at an NFS filehandle, in which case no error
|
||||
* report is needed
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
|
||||
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
|
||||
gdp = ext4_get_group_desc(sb, block_group, NULL);
|
||||
if (!gdp)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Figure out the offset within the block group inode table
|
||||
*/
|
||||
offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
|
||||
EXT4_INODE_SIZE(sb);
|
||||
block = ext4_inode_table(sb, gdp) +
|
||||
(offset >> EXT4_BLOCK_SIZE_BITS(sb));
|
||||
|
||||
iloc->block_group = block_group;
|
||||
iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
|
||||
return block;
|
||||
}
|
||||
|
||||
/*
|
||||
* ext4_get_inode_loc returns with an extra refcount against the inode's
|
||||
* underlying buffer_head on success. If 'in_mem' is true, we have all
|
||||
@ -3877,18 +3842,34 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
|
||||
static int __ext4_get_inode_loc(struct inode *inode,
|
||||
struct ext4_iloc *iloc, int in_mem)
|
||||
{
|
||||
ext4_fsblk_t block;
|
||||
struct ext4_group_desc *gdp;
|
||||
struct buffer_head *bh;
|
||||
struct super_block *sb = inode->i_sb;
|
||||
ext4_fsblk_t block;
|
||||
int inodes_per_block, inode_offset;
|
||||
|
||||
block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
|
||||
if (!block)
|
||||
iloc->bh = 0;
|
||||
if (!ext4_valid_inum(sb, inode->i_ino))
|
||||
return -EIO;
|
||||
|
||||
bh = sb_getblk(inode->i_sb, block);
|
||||
iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
|
||||
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
|
||||
if (!gdp)
|
||||
return -EIO;
|
||||
|
||||
/*
|
||||
* Figure out the offset within the block group inode table
|
||||
*/
|
||||
inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
|
||||
inode_offset = ((inode->i_ino - 1) %
|
||||
EXT4_INODES_PER_GROUP(sb));
|
||||
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
|
||||
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
|
||||
|
||||
bh = sb_getblk(sb, block);
|
||||
if (!bh) {
|
||||
ext4_error (inode->i_sb, "ext4_get_inode_loc",
|
||||
"unable to read inode block - "
|
||||
"inode=%lu, block=%llu",
|
||||
ext4_error(sb, "ext4_get_inode_loc", "unable to read "
|
||||
"inode block - inode=%lu, block=%llu",
|
||||
inode->i_ino, block);
|
||||
return -EIO;
|
||||
}
|
||||
@ -3917,28 +3898,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
|
||||
*/
|
||||
if (in_mem) {
|
||||
struct buffer_head *bitmap_bh;
|
||||
struct ext4_group_desc *desc;
|
||||
int inodes_per_buffer;
|
||||
int inode_offset, i;
|
||||
ext4_group_t block_group;
|
||||
int start;
|
||||
int i, start;
|
||||
|
||||
block_group = (inode->i_ino - 1) /
|
||||
EXT4_INODES_PER_GROUP(inode->i_sb);
|
||||
inodes_per_buffer = bh->b_size /
|
||||
EXT4_INODE_SIZE(inode->i_sb);
|
||||
inode_offset = ((inode->i_ino - 1) %
|
||||
EXT4_INODES_PER_GROUP(inode->i_sb));
|
||||
start = inode_offset & ~(inodes_per_buffer - 1);
|
||||
start = inode_offset & ~(inodes_per_block - 1);
|
||||
|
||||
/* Is the inode bitmap in cache? */
|
||||
desc = ext4_get_group_desc(inode->i_sb,
|
||||
block_group, NULL);
|
||||
if (!desc)
|
||||
goto make_io;
|
||||
|
||||
bitmap_bh = sb_getblk(inode->i_sb,
|
||||
ext4_inode_bitmap(inode->i_sb, desc));
|
||||
bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
|
||||
if (!bitmap_bh)
|
||||
goto make_io;
|
||||
|
||||
@ -3951,14 +3916,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
|
||||
brelse(bitmap_bh);
|
||||
goto make_io;
|
||||
}
|
||||
for (i = start; i < start + inodes_per_buffer; i++) {
|
||||
for (i = start; i < start + inodes_per_block; i++) {
|
||||
if (i == inode_offset)
|
||||
continue;
|
||||
if (ext4_test_bit(i, bitmap_bh->b_data))
|
||||
break;
|
||||
}
|
||||
brelse(bitmap_bh);
|
||||
if (i == start + inodes_per_buffer) {
|
||||
if (i == start + inodes_per_block) {
|
||||
/* all other inodes are free, so skip I/O */
|
||||
memset(bh->b_data, 0, bh->b_size);
|
||||
set_buffer_uptodate(bh);
|
||||
@ -3968,6 +3933,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
|
||||
}
|
||||
|
||||
make_io:
|
||||
/*
|
||||
* If we need to do any I/O, try to pre-readahead extra
|
||||
* blocks from the inode table.
|
||||
*/
|
||||
if (EXT4_SB(sb)->s_inode_readahead_blks) {
|
||||
ext4_fsblk_t b, end, table;
|
||||
unsigned num;
|
||||
|
||||
table = ext4_inode_table(sb, gdp);
|
||||
/* Make sure s_inode_readahead_blks is a power of 2 */
|
||||
while (EXT4_SB(sb)->s_inode_readahead_blks &
|
||||
(EXT4_SB(sb)->s_inode_readahead_blks-1))
|
||||
EXT4_SB(sb)->s_inode_readahead_blks =
|
||||
(EXT4_SB(sb)->s_inode_readahead_blks &
|
||||
(EXT4_SB(sb)->s_inode_readahead_blks-1));
|
||||
b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
|
||||
if (table > b)
|
||||
b = table;
|
||||
end = b + EXT4_SB(sb)->s_inode_readahead_blks;
|
||||
num = EXT4_INODES_PER_GROUP(sb);
|
||||
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
|
||||
EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
|
||||
num -= le16_to_cpu(gdp->bg_itable_unused);
|
||||
table += num / inodes_per_block;
|
||||
if (end > table)
|
||||
end = table;
|
||||
while (b <= end)
|
||||
sb_breadahead(sb, b++);
|
||||
}
|
||||
|
||||
/*
|
||||
* There are other valid inodes in the buffer, this inode
|
||||
* has in-inode xattrs, or we don't have this inode in memory.
|
||||
@ -3978,10 +3973,9 @@ make_io:
|
||||
submit_bh(READ_META, bh);
|
||||
wait_on_buffer(bh);
|
||||
if (!buffer_uptodate(bh)) {
|
||||
ext4_error(inode->i_sb, "ext4_get_inode_loc",
|
||||
"unable to read inode block - "
|
||||
"inode=%lu, block=%llu",
|
||||
inode->i_ino, block);
|
||||
ext4_error(sb, __func__,
|
||||
"unable to read inode block - inode=%lu, "
|
||||
"block=%llu", inode->i_ino, block);
|
||||
brelse(bh);
|
||||
return -EIO;
|
||||
}
|
||||
|
@ -515,8 +515,10 @@ static void ext4_put_super(struct super_block *sb)
|
||||
mark_buffer_dirty(sbi->s_sbh);
|
||||
ext4_commit_super(sb, es, 1);
|
||||
}
|
||||
if (sbi->s_proc)
|
||||
if (sbi->s_proc) {
|
||||
remove_proc_entry("inode_readahead_blks", sbi->s_proc);
|
||||
remove_proc_entry(sb->s_id, ext4_proc_root);
|
||||
}
|
||||
|
||||
for (i = 0; i < sbi->s_gdb_count; i++)
|
||||
brelse(sbi->s_group_desc[i]);
|
||||
@ -779,6 +781,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
|
||||
else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
|
||||
seq_puts(seq, ",data=writeback");
|
||||
|
||||
if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
|
||||
seq_printf(seq, ",inode_readahead_blks=%u",
|
||||
sbi->s_inode_readahead_blks);
|
||||
|
||||
ext4_show_quota_options(seq, sb);
|
||||
return 0;
|
||||
}
|
||||
@ -913,6 +919,7 @@ enum {
|
||||
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
|
||||
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
|
||||
Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
|
||||
Opt_inode_readahead_blks
|
||||
};
|
||||
|
||||
static match_table_t tokens = {
|
||||
@ -973,6 +980,7 @@ static match_table_t tokens = {
|
||||
{Opt_resize, "resize"},
|
||||
{Opt_delalloc, "delalloc"},
|
||||
{Opt_nodelalloc, "nodelalloc"},
|
||||
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
|
||||
{Opt_err, NULL},
|
||||
};
|
||||
|
||||
@ -1381,6 +1389,13 @@ set_qf_format:
|
||||
case Opt_delalloc:
|
||||
set_opt(sbi->s_mount_opt, DELALLOC);
|
||||
break;
|
||||
case Opt_inode_readahead_blks:
|
||||
if (match_int(&args[0], &option))
|
||||
return 0;
|
||||
if (option < 0 || option > (1 << 30))
|
||||
return 0;
|
||||
sbi->s_inode_readahead_blks = option;
|
||||
break;
|
||||
default:
|
||||
printk(KERN_ERR
|
||||
"EXT4-fs: Unrecognized mount option \"%s\" "
|
||||
@ -1938,6 +1953,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
|
||||
sbi->s_mount_opt = 0;
|
||||
sbi->s_resuid = EXT4_DEF_RESUID;
|
||||
sbi->s_resgid = EXT4_DEF_RESGID;
|
||||
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
|
||||
sbi->s_sb_block = sb_block;
|
||||
|
||||
unlock_kernel();
|
||||
@ -2234,6 +2250,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
|
||||
if (ext4_proc_root)
|
||||
sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
|
||||
|
||||
if (sbi->s_proc)
|
||||
proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
|
||||
&ext4_ui_proc_fops,
|
||||
&sbi->s_inode_readahead_blks);
|
||||
|
||||
bgl_lock_init(&sbi->s_blockgroup_lock);
|
||||
|
||||
for (i = 0; i < db_count; i++) {
|
||||
@ -2513,8 +2534,10 @@ failed_mount2:
|
||||
brelse(sbi->s_group_desc[i]);
|
||||
kfree(sbi->s_group_desc);
|
||||
failed_mount:
|
||||
if (sbi->s_proc)
|
||||
if (sbi->s_proc) {
|
||||
remove_proc_entry("inode_readahead_blks", sbi->s_proc);
|
||||
remove_proc_entry(sb->s_id, ext4_proc_root);
|
||||
}
|
||||
#ifdef CONFIG_QUOTA
|
||||
for (i = 0; i < MAXQUOTAS; i++)
|
||||
kfree(sbi->s_qf_names[i]);
|
||||
|
Loading…
Reference in New Issue
Block a user