diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8b5dd6369f82..6a5edea2d70b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -177,7 +177,7 @@ struct mpage_da_data { struct ext4_io_page { struct page *p_page; - int p_count; + atomic_t p_count; }; #define MAX_IO_PAGES 128 @@ -858,6 +858,7 @@ struct ext4_inode_info { spinlock_t i_completed_io_lock; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* * Transactions that contain inode's metadata needed to complete @@ -2060,6 +2061,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern int ext4_end_io_nolock(ext4_io_end_t *io); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4d78342f3bf0..bdbe69902207 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -53,6 +53,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { + trace_ext4_begin_ordered_truncate(inode, new_size); return jbd2_journal_begin_ordered_truncate( EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode, @@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode) handle_t *handle; int err; + trace_ext4_evict_inode(inode); if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; @@ -5647,6 +5649,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) int err, ret; might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c58eba34724a..5b4d4e3a4d58 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4640,8 +4640,6 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 46a7d6a9d976..7f5451cd1d38 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -32,8 +32,14 @@ static struct kmem_cache *io_page_cachep, *io_end_cachep; +#define WQ_HASH_SZ 37 +#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ]) +static wait_queue_head_t ioend_wq[WQ_HASH_SZ]; + int __init ext4_init_pageio(void) { + int i; + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); if (io_page_cachep == NULL) return -ENOMEM; @@ -42,6 +48,8 @@ int __init ext4_init_pageio(void) kmem_cache_destroy(io_page_cachep); return -ENOMEM; } + for (i = 0; i < WQ_HASH_SZ; i++) + init_waitqueue_head(&ioend_wq[i]); return 0; } @@ -52,24 +60,37 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } +void ext4_ioend_wait(struct inode *inode) +{ + wait_queue_head_t *wq = to_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +} + +static void put_io_page(struct ext4_io_page *io_page) +{ + if (atomic_dec_and_test(&io_page->p_count)) { + end_page_writeback(io_page->p_page); + put_page(io_page->p_page); + kmem_cache_free(io_page_cachep, io_page); + } +} + void ext4_free_io_end(ext4_io_end_t *io) { int i; + wait_queue_head_t *wq; BUG_ON(!io); if (io->page) put_page(io->page); - for (i = 0; i < io->num_io_pages; i++) { - if (--io->pages[i]->p_count == 0) { - struct page *page = io->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io->pages[i]); - } - } + for (i = 0; i < io->num_io_pages; i++) + put_io_page(io->pages[i]); io->num_io_pages = 0; - iput(io->inode); + wq = to_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && + waitqueue_active(wq)) + wake_up_all(wq); kmem_cache_free(io_end_cachep, io); } @@ -142,8 +163,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) io = kmem_cache_alloc(io_end_cachep, flags); if (io) { memset(io, 0, sizeof(*io)); - io->inode = igrab(inode); - BUG_ON(!io->inode); + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -171,35 +192,15 @@ static void ext4_end_bio(struct bio *bio, int error) struct workqueue_struct *wq; struct inode *inode; unsigned long flags; - ext4_fsblk_t err_block; int i; BUG_ON(!io_end); - inode = io_end->inode; bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - err_block = bio->bi_sector >> (inode->i_blkbits - 9); bio_put(bio); - if (!(inode->i_sb->s_flags & MS_ACTIVE)) { - pr_err("sb umounted, discard end_io request for inode %lu\n", - io_end->inode->i_ino); - ext4_free_io_end(io_end); - return; - } - - if (error) { - io_end->flag |= EXT4_IO_END_ERROR; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " - "(offset %llu size %ld starting block %llu)", - inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, - (unsigned long long) err_block); - } - for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; @@ -236,13 +237,7 @@ static void ext4_end_bio(struct bio *bio, int error) } while (bh != head); } - if (--io_end->pages[i]->p_count == 0) { - struct page *page = io_end->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io_end->pages[i]); - } + put_io_page(io_end->pages[i]); /* * If this is a partial write which happened to make @@ -254,8 +249,19 @@ static void ext4_end_bio(struct bio *bio, int error) if (!partial_write) SetPageUptodate(page); } - io_end->num_io_pages = 0; + inode = io_end->inode; + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bio->bi_sector >> (inode->i_blkbits - 9)); + } /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); @@ -305,7 +311,6 @@ static int io_submit_init(struct ext4_io_submit *io, bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - io_end->inode = inode; io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); io->io_bio = bio; @@ -360,7 +365,7 @@ submit_and_retry: if ((io_end->num_io_pages == 0) || (io_end->pages[io_end->num_io_pages-1] != io_page)) { io_end->pages[io_end->num_io_pages++] = io_page; - io_page->p_count++; + atomic_inc(&io_page->p_count); } return 0; } @@ -389,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, return -ENOMEM; } io_page->p_page = page; - io_page->p_count = 0; + atomic_set(&io_page->p_count, 1); get_page(page); for (bh = head = page_buffers(page), block_start = 0; @@ -421,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * PageWriteback bit from the page to prevent the system from * wedging later on. */ - if (io_page->p_count == 0) { - put_page(page); - end_page_writeback(page); - kmem_cache_free(io_page_cachep, io_page); - } + put_io_page(io_page); return ret; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 40131b777af6..61182fe6254e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -828,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); return &ei->vfs_inode; } +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + static void ext4_destroy_inode(struct inode *inode) { + ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", @@ -1173,6 +1183,7 @@ static const struct super_operations ext4_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, @@ -1194,6 +1205,7 @@ static const struct super_operations ext4_nojournal_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .write_super = ext4_write_super, .put_super = ext4_put_super, @@ -2699,7 +2711,6 @@ static int ext4_lazyinit_thread(void *arg) struct ext4_li_request *elr; unsigned long next_wakeup; DEFINE_WAIT(wait); - int ret; BUG_ON(NULL == eli); @@ -2723,13 +2734,12 @@ cont_thread: elr = list_entry(pos, struct ext4_li_request, lr_request); - if (time_after_eq(jiffies, elr->lr_next_sched)) - ret = ext4_run_li_request(elr); - - if (ret) { - ret = 0; - ext4_remove_li_request(elr); - continue; + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } } if (time_before(elr->lr_next_sched, next_wakeup)) @@ -2740,7 +2750,8 @@ cont_thread: if (freezing(current)) refrigerator(); - if (time_after_eq(jiffies, next_wakeup)) { + if ((time_after_eq(jiffies, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } @@ -3348,6 +3359,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; @@ -3446,22 +3475,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); -no_journal: - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount_wq; - } + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. + */ + percpu_counter_set(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); +no_journal: EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ -3611,10 +3637,6 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount3: if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) @@ -3622,6 +3644,10 @@ failed_mount3: else kfree(sbi->s_flex_groups); } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3949,13 +3975,11 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter)) - ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) - es->s_free_inodes_count = - cpu_to_le32(percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeinodes_counter)); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); @@ -4556,12 +4580,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, static int ext4_quota_off(struct super_block *sb, int type) { - /* Force all delayed allocation blocks to be allocated */ - if (test_opt(sb, DELALLOC)) { - down_read(&sb->s_umount); + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - up_read(&sb->s_umount); - } return dquot_quota_off(sb, type); } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 289010d3270b..e5e345fb2a5c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -98,6 +98,103 @@ TRACE_EVENT(ext4_allocate_inode, (unsigned long) __entry->dir, __entry->mode) ); +TRACE_EVENT(ext4_evict_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field( int, nlink ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->nlink = inode->i_nlink; + ), + + TP_printk("dev %d,%d ino %lu nlink %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->nlink) +); + +TRACE_EVENT(ext4_drop_inode, + TP_PROTO(struct inode *inode, int drop), + + TP_ARGS(inode, drop), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field( int, drop ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->drop = drop; + ), + + TP_printk("dev %d,%d ino %lu drop %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->drop) +); + +TRACE_EVENT(ext4_mark_inode_dirty, + TP_PROTO(struct inode *inode, unsigned long IP), + + TP_ARGS(inode, IP), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->ip = IP; + ), + + TP_printk("dev %d,%d ino %lu caller %pF", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, (void *)__entry->ip) +); + +TRACE_EVENT(ext4_begin_ordered_truncate, + TP_PROTO(struct inode *inode, loff_t new_size), + + TP_ARGS(inode, new_size), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field( loff_t, new_size ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->new_size = new_size; + ), + + TP_printk("dev %d,%d ino %lu new_size %lld", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, + (long long) __entry->new_size) +); + DECLARE_EVENT_CLASS(ext4__write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,