From 744602cf45ce35758b8637f76bc263c871abc6ea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Jan 2014 09:42:16 +0900 Subject: [PATCH 01/62] f2fs: update_inode_page should be done all the time In order to make fs consistency, update_inode_page should not be failed all the time. Otherwise, it is possible to lose some metadata in the inode like a link count. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +-- fs/f2fs/f2fs.h | 8 +++++++- fs/f2fs/inode.c | 23 ++++++++++++++--------- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2261ccdd0b5f..20c3c648e56d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -55,8 +55,7 @@ static void f2fs_write_end_io(struct bio *bio, int err) if (unlikely(err)) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; + f2fs_stop_checkpoint(sbi); } end_page_writeback(page); dec_page_count(sbi, F2FS_WRITEBACK); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc3c558cb4f3..0f0ad3aa148a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1023,6 +1023,12 @@ static inline int f2fs_readonly(struct super_block *sb) return sb->s_flags & MS_RDONLY; } +static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1048,7 +1054,7 @@ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); void update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); +void update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 4d67ed736dca..08d69c94ab8b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -212,24 +212,29 @@ void update_inode(struct inode *inode, struct page *node_page) clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); } -int update_inode_page(struct inode *inode) +void update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; - +retry: node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); - + if (IS_ERR(node_page)) { + int err = PTR_ERR(node_page); + if (err == -ENOMEM) { + cond_resched(); + goto retry; + } else if (err != -ENOENT) { + f2fs_stop_checkpoint(sbi); + } + return; + } update_inode(inode, node_page); f2fs_put_page(node_page, 1); - return 0; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ret; if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) @@ -243,13 +248,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * during the urgent cleaning time when runing out of free sections. */ f2fs_lock_op(sbi); - ret = update_inode_page(inode); + update_inode_page(inode); f2fs_unlock_op(sbi); if (wbc) f2fs_balance_fs(sbi); - return ret; + return 0; } /* From 5e443818fa0b2a2845561ee25bec181424fb2889 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jan 2014 12:22:14 +0900 Subject: [PATCH 02/62] f2fs: handle dirty segments inside refresh_sit_entry This patch cleans up the refresh_sit_entry to handle locate_dirty_segments. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0f0ad3aa148a..3f223aa64a57 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1136,6 +1136,7 @@ void destroy_node_manager_caches(void); void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); +void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7caac5f2ca9e..fba510b2f217 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -434,12 +434,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -static void refresh_sit_entry(struct f2fs_sb_info *sbi, - block_t old_blkaddr, block_t new_blkaddr) +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) { - update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) - update_sit_entry(sbi, old_blkaddr, -1); + update_sit_entry(sbi, new, 1); + if (GET_SEGNO(sbi, old) != NULL_SEGNO) + update_sit_entry(sbi, old, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); } void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) @@ -881,17 +883,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) @@ -992,9 +992,7 @@ void recover_data_page(struct f2fs_sb_info *sbi, __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); @@ -1045,9 +1043,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); f2fs_submit_merged_bio(sbi, NODE, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); From abb2366c82c3d2dac3d7e9a74332137da8fc9399 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jan 2014 12:25:06 +0900 Subject: [PATCH 03/62] f2fs: fix to recover xattr node block If a new xattr node page was allocated and its inode is fsynced, we should recover the xattr node page during the roll-forward process after power-cut. But, previously, f2fs didn't handle that case, resulting in kernel panic as follows reported by Tom Li. BUG: unable to handle kernel paging request at ffffc9001c861a98 IP: [] check_index_in_prev_nodes+0x86/0x2d0 [f2fs] Call Trace: [] ? printk+0x48/0x4a [] recover_fsync_data+0xdca/0xf50 [f2fs] [] f2fs_fill_super+0x92e/0x970 [f2fs] [] mount_bdev+0x1b8/0x200 [] ? f2fs_remount+0x130/0x130 [f2fs] [] f2fs_mount+0x10/0x20 [f2fs] [] mount_fs+0x3e/0x1b0 [] ? __alloc_percpu+0xb/0x10 [] vfs_kern_mount+0x6f/0x120 [] do_mount+0x259/0xa90 [] ? memdup_user+0x3d/0x80 [] ? strndup_user+0x53/0x70 [] SyS_mount+0x89/0xd0 [] system_call_fastpath+0x16/0x1b This patch adds a recovery function of xattr node pages. Reported-by: Tom Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 40 ++++++++++++++++++++++++++++++++++++++++ fs/f2fs/recovery.c | 3 +++ 3 files changed, 44 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3f223aa64a57..55288d2d82e6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1121,6 +1121,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); +bool recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, struct f2fs_summary_block *); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b0649b76eb4f..82f4753ef418 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1535,6 +1535,46 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, clear_node_page_dirty(page); } +bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid = nid_of_node(page); + struct node_info ni; + + if (ofs_of_node(page) != XATTR_NODE_OFFSET) + return false; + + /* 1: invalidate the previous xattr nid */ + if (!prev_xnid) + goto recover_xnid; + + /* Deallocate node address */ + get_node_info(sbi, prev_xnid, &ni); + f2fs_bug_on(ni.blk_addr == NULL_ADDR); + invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, inode); + set_node_addr(sbi, &ni, NULL_ADDR); + +recover_xnid: + /* 2: allocate new xattr nid */ + if (unlikely(!inc_valid_node_count(sbi, inode))) + f2fs_bug_on(1); + + remove_free_nid(NM_I(sbi), new_xnid); + get_node_info(sbi, new_xnid, &ni); + ni.ino = inode->i_ino; + set_node_addr(sbi, &ni, NEW_ADDR); + F2FS_I(inode)->i_xattr_nid = new_xnid; + + /* 3: update xattr blkaddr */ + refresh_sit_entry(sbi, NEW_ADDR, blkaddr); + set_node_addr(sbi, &ni, blkaddr); + + update_inode_page(inode); + return true; +} + int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *src, *dst; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 976a7a934db5..f1b0b8917436 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -301,6 +301,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (recover_inline_data(inode, page)) goto out; + if (recover_xattr_data(inode, page, blkaddr)) + goto out; + start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) end = start + ADDRS_PER_INODE(fi); From 1b1f559fc362f96869b7e04ef9825b1039b9a67d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 3 Feb 2014 10:50:22 +0900 Subject: [PATCH 04/62] f2fs: remove the ugly pointer conversion This patch modifies the use of bi_private to remove pointer chasing for sbi. Previously, we had a bi_private structure, but it needs memory allocation. So this patch uses bi_private by the sbi pointer and adds a completion pointer into the sbi. This can achieve no memory allocation and nice use of the bi_private. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++---- fs/f2fs/f2fs.h | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 20c3c648e56d..d175ae3b612a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err) static void f2fs_write_end_io(struct bio *bio, int err) { - struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; int i; @@ -61,8 +61,10 @@ static void f2fs_write_end_io(struct bio *bio, int err) dec_page_count(sbi, F2FS_WRITEBACK); } - if (bio->bi_private) - complete(bio->bi_private); + if (sbi->wait_io) { + complete(sbi->wait_io); + sbi->wait_io = NULL; + } if (!get_pages(sbi, F2FS_WRITEBACK) && !list_empty(&sbi->cp_wait.task_list)) @@ -85,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio->bi_bdev = sbi->sb->s_bdev; bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + bio->bi_private = sbi; return bio; } @@ -112,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) */ if (fio->type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); - io->bio->bi_private = &wait; + io->sbi->wait_io = &wait; submit_bio(rw, io->bio); wait_for_completion(&wait); } else { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 55288d2d82e6..aeff1327f9ed 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -398,6 +398,7 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct completion *wait_io; /* for completion bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ From 924a2ddbd0c2829ebca9ac899522cbb16a9b6d8c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 3 Feb 2014 17:24:51 +0900 Subject: [PATCH 05/62] f2fs: fix the potential mismatch between dir's i_size and i_blocks This is the erroneous scenario. i_size on-disk i_size i_blocks __f2fs_add_link() 4096 4096 2 get_new_data_page 8192 4096 3 -ENOSPC = init_inode_metadata checkpoint - 4096 3 POR and reboot __f2fs_add_link() 4096 4096 3 page = get_new_data_page (page->index = 1 by NEW_ADDR) add a dentry to the page successfully f2fs_rmdir() f2fs_empty_dir() 4096 4096 3 f2fs_unlink() goes, since there is no valid dentry due to i_size = 4096. But, still there is one dentry in page->index = 1. So this patch moves the code to write dir->i_size into on-disk i_size in order to sync dir's i_size, on-disk i_size, and its i_blocks. Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2b7c255bcbdf..bfcb4ae241f8 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -395,9 +395,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) - update_inode_page(dir); - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); } @@ -511,7 +508,10 @@ add_dentry: update_parent_metadata(dir, inode, current_depth); fail: - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { + update_inode_page(dir); + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; From 491c0854b41380f48e422c00ae7e25ae4d02cecc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 4 Feb 2014 13:01:10 +0900 Subject: [PATCH 06/62] f2fs: clean up with a macro This patch adds GET_BLKOFF_FROM_SEG0 to clean up some codes. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 3 +-- fs/f2fs/segment.c | 11 ++++------- fs/f2fs/segment.h | 3 +++ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index f1b0b8917436..bda04a012909 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -218,8 +218,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); - unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & - (sbi->blocks_per_seg - 1); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); struct f2fs_summary sum; nid_t ino, nid; void *kaddr; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fba510b2f217..e87946a08a21 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -405,7 +405,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; - offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || (new_vblocks > sbi->blocks_per_seg))); @@ -987,8 +987,7 @@ void recover_data_page(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); @@ -1026,8 +1025,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); /* change the current log to the next block addr in advance */ @@ -1035,8 +1033,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = next_segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); /* rewrite node page */ set_page_writeback(page); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5731682d7516..4024546b6361 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -57,6 +57,9 @@ ((blk_addr) - SM_I(sbi)->seg0_blkaddr) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + #define GET_SEGNO(sbi, blk_addr) \ (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ From f6517cfc84246b2606fd631730846c648ee0d455 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jan 2014 14:54:07 +0900 Subject: [PATCH 07/62] f2fs: fix a build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch modifies flow a little bit to avoid the following build warnings. src/fs/f2fs/recovery.c: In function ‘check_index_in_prev_nodes’: src/fs/f2fs/recovery.c:288:51: warning: ‘sum...ofs_in_node’ may be used uninitialized in this function [-Wmaybe-uninitialized] src/fs/f2fs/recovery.c:260:23: warning: ‘sum.nid’ may be used uninitialized in this function [-Wmaybe-uninitialized] Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index bda04a012909..72adbbfdb3e5 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -219,11 +219,11 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; struct f2fs_summary sum; + struct page *sum_page, *node_page; nid_t ino, nid; - void *kaddr; struct inode *inode; - struct page *node_page; unsigned int offset; block_t bidx; int i; @@ -237,18 +237,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; - break; + goto got_it; } } - if (i > CURSEG_COLD_DATA) { - struct page *sum_page = get_sum_page(sbi, segno); - struct f2fs_summary_block *sum_node; - kaddr = page_address(sum_page); - sum_node = (struct f2fs_summary_block *)kaddr; - sum = sum_node->entries[blkoff]; - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); +got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); if (dn->inode->i_ino == nid) { From bd859c6598dd2b73c517b3a36ecc5dd387eb1eb2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 5 Feb 2014 11:16:39 +0900 Subject: [PATCH 08/62] f2fs: fix to truncate dentry pages in the error case When a new directory is allocated, if an error is occurred, we should truncate preallocated dentry pages too. This bug was reported by Andrey Tsyvarev after a while as follows. mkdir()-> f2fs_add_link()-> init_inode_metadata()-> f2fs_init_acl()-> f2fs_get_acl()-> f2fs_getxattr()-> read_all_xattrs() fails. Also there was a BUG_ON triggered after the fault in mkdir()-> f2fs_add_link()-> init_inode_metadata()-> remove_inode_page() -> f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); But, previous patch wasn't perfect to resolve that bug, so the following bug report was also submitted. kernel BUG at fs/f2fs/inode.c:274! Call Trace: [] evict+0xa3/0x1a0 [] iput+0xf5/0x180 [] f2fs_mkdir+0xf3/0x150 [f2fs] [] vfs_mkdir+0xb7/0x160 [] SyS_mkdir+0x5f/0xc0 [] system_call_fastpath+0x16/0x1b Finally, this patch resolves all the issues like below. If an error is occurred after make_empty_dir(), 1. truncate_inode_pages() The make_bad_inode() prior to iput() will change i_mode to S_IFREG, which means that f2fs will not decrement fi->dirty_dents during f2fs_evict_inode. But, by calling it here, we can do that. 2. truncate_blocks() Preallocated dentry pages are trucated here to sync i_blocks. 3. remove_dirty_dir_inode() Remove this directory inode from the list. Reported-and-Tested-by: Andrey Tsyvarev Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bfcb4ae241f8..5bbf94c31180 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -372,6 +372,10 @@ static struct page *init_inode_metadata(struct inode *inode, put_error: f2fs_put_page(page, 1); + /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ + truncate_inode_pages(&inode->i_data, 0); + truncate_blocks(inode, 0); + remove_dirty_dir_inode(inode); error: remove_inode_page(inode); return ERR_PTR(err); From 203681f65b07055259bd475a6281136615b4e9a4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 5 Feb 2014 13:03:57 +0900 Subject: [PATCH 09/62] f2fs: fix f2fs_write_meta_page at no checkpoint status If f2fs entered errorneous checkpoint status, it should skip writing meta pages instead of redirtying the pages out. Otherwise, it cannot unmount the partition even though f2fs is under read-only status. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 29 +++++++++++++++++++++-------- fs/f2fs/gc.c | 2 ++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 293d0486a40f..8f5dff1989a8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -81,17 +81,18 @@ static int f2fs_write_meta_page(struct page *page, struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - /* Should not write any meta pages, if any IO error was occurred */ - if (unlikely(sbi->por_doing || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + if (unlikely(sbi->por_doing)) goto redirty_out; - if (wbc->for_reclaim) goto redirty_out; - wait_on_page_writeback(page); + /* Should not write any meta pages, if any IO error was occurred */ + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto no_write; + wait_on_page_writeback(page); write_meta_page(sbi, page); +no_write: dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; @@ -148,10 +149,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + lock_page(page); - f2fs_bug_on(page->mapping != mapping); - f2fs_bug_on(!PageDirty(page)); - clear_page_dirty_for_io(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + if (f2fs_write_meta_page(page, &wbc)) { unlock_page(page); break; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ea0371e854b4..b0f57628fe55 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -701,6 +701,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi) gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; From 1fe54f9dd3acfaa3ed4e1d1e3278fd0f1d1e98cd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Feb 2014 10:00:06 +0900 Subject: [PATCH 10/62] f2fs: clean up redundant function call This patch integrates inode_[inc|dec]_dirty_dents with inc_page_count to remove redundant calls. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 - fs/f2fs/data.c | 11 ++--------- fs/f2fs/dir.c | 4 ++-- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/gc.c | 7 +------ 5 files changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8f5dff1989a8..427dd55cfd5a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -508,7 +508,6 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) if (__add_dirty_inode(inode, new)) kmem_cache_free(inode_entry_slab, new); - inc_page_count(sbi, F2FS_DIRTY_DENTS); inode_inc_dirty_dents(inode); SetPagePrivate(page); spin_unlock(&sbi->dir_inode_lock); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d175ae3b612a..b401be71ecbd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -799,10 +799,7 @@ static int f2fs_write_data_page(struct page *page, */ offset = i_size & (PAGE_CACHE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) { - if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - } + inode_dec_dirty_dents(inode); goto out; } @@ -815,7 +812,6 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); err = do_write_data_page(page, &fio); } else { @@ -1033,11 +1029,8 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, unsigned int length) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (S_ISDIR(inode->i_mode) && PageDirty(page)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (PageDirty(page)) inode_dec_dirty_dents(inode); - } ClearPagePrivate(page); } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5bbf94c31180..d5a2c9ed9aa7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -532,7 +532,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, unsigned int bit_pos; struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); void *kaddr = page_address(page); int i; @@ -555,6 +554,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode) { + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + if (S_ISDIR(inode->i_mode)) { drop_nlink(dir); update_inode_page(dir); @@ -577,7 +578,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index aeff1327f9ed..4841d1225ea0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -662,6 +662,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_dents(struct inode *inode) { + inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_inc(&F2FS_I(inode)->dirty_dents); } @@ -672,6 +673,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_dec_dirty_dents(struct inode *inode) { + if (!S_ISDIR(inode->i_mode)) + return; + + dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_dec(&F2FS_I(inode)->dirty_dents); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b0f57628fe55..b161db4a96a4 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -531,15 +531,10 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) set_page_dirty(page); set_cold_data(page); } else { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page) && - S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (clear_page_dirty_for_io(page)) inode_dec_dirty_dents(inode); - } set_cold_data(page); do_write_data_page(page, &fio); clear_cold_data(page); From 3375f696bd9cfdfd385e2460a9cf021d8ef01eab Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 28 Jan 2014 10:29:26 +0800 Subject: [PATCH 11/62] f2fs: use inode mutex to keep atomicity of f2fs_falloc Previously without protection of inode mutex, f2fs_falloc and other data correlated operations will interfere with each other. So let's use inode mutex to keep atomicity of f2fs_falloc. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0dfcef53a6ed..00f937ec4794 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -560,6 +560,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) ret = punch_hole(inode, offset, len); else @@ -569,6 +571,9 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } + + mutex_unlock(&inode->i_mutex); + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } From 662befda25fb16d7164633c39e9e20aeac5107d9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Feb 2014 16:11:53 +0800 Subject: [PATCH 12/62] f2fs: introduce ra_meta_pages to readahead CP/NAT/SIT pages This patch help us to cleanup the readahead code by merging ra_{sit,nat}_pages function into ra_meta_pages. Additionally the new function is used to readahead cp block in recover_orphan_inodes. Change log from v1: o fix a deadloop bug pointed by Jaegeuk Kim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 78 ++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 10 ++++++ fs/f2fs/node.c | 38 +-------------------- fs/f2fs/segment.c | 43 +----------------------- 4 files changed, 90 insertions(+), 79 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 427dd55cfd5a..deb60356f7cf 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -75,6 +75,82 @@ out: return page; } +inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +{ + switch (type) { + case META_NAT: + return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; + case META_SIT: + return SIT_BLK_CNT(sbi); + case META_CP: + return 0; + default: + BUG(); + } +} + +/* + * Readahead CP/NAT/SIT pages + */ +int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) +{ + block_t prev_blk_addr = 0; + struct page *page; + int blkno = start; + int max_blks = get_max_meta_blks(sbi, type); + + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; nrpages-- > 0; blkno++) { + block_t blk_addr; + + switch (type) { + case META_NAT: + /* get nat block addr */ + if (unlikely(blkno >= max_blks)) + blkno = 0; + blk_addr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + /* get sit block addr */ + if (unlikely(blkno >= max_blks)) + goto out; + blk_addr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + if (blkno != start && prev_blk_addr + 1 != blk_addr) + goto out; + prev_blk_addr = blk_addr; + break; + case META_CP: + /* get cp block addr */ + blk_addr = blkno; + break; + default: + BUG(); + } + + page = grab_cache_page(META_MAPPING(sbi), blk_addr); + if (!page) + continue; + if (PageUptodate(page)) { + mark_page_accessed(page); + f2fs_put_page(page, 1); + continue; + } + + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); + mark_page_accessed(page); + f2fs_put_page(page, 0); + } +out: + f2fs_submit_merged_bio(sbi, META, READ); + return blkno - start; +} + static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { @@ -298,6 +374,8 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1; orphan_blkaddr = __start_sum_addr(sbi) - 1; + ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); + for (i = 0; i < orphan_blkaddr; i++) { struct page *page = get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4841d1225ea0..817eccced9fe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -88,6 +88,15 @@ enum { SIT_BITMAP }; +/* + * For CP/NAT/SIT readahead + */ +enum { + META_CP, + META_NAT, + META_SIT +}; + /* for the list of orphan inodes */ struct orphan_inode_entry { struct list_head list; /* list head */ @@ -1176,6 +1185,7 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 82f4753ef418..7689f9105dc1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -82,42 +82,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -/* - * Readahead NAT pages - */ -static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) -{ - struct address_space *mapping = META_MAPPING(sbi); - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct page *page; - pgoff_t index; - int i; - struct f2fs_io_info fio = { - .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO - }; - - - for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (unlikely(nid >= nm_i->max_nid)) - nid = 0; - index = current_nat_addr(sbi, nid); - - page = grab_cache_page(mapping, index); - if (!page) - continue; - if (PageUptodate(page)) { - mark_page_accessed(page); - f2fs_put_page(page, 1); - continue; - } - f2fs_submit_page_mbio(sbi, page, index, &fio); - mark_page_accessed(page); - f2fs_put_page(page, 0); - } - f2fs_submit_merged_bio(sbi, META, READ); -} - static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -1413,7 +1377,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) return; /* readahead nat pages to be scanned */ - ra_nat_pages(sbi, nid); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); while (1) { struct page *page = get_current_nat_page(sbi, nid); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e87946a08a21..fbb41ba818fd 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1576,47 +1576,6 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } -static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) -{ - struct address_space *mapping = META_MAPPING(sbi); - struct page *page; - block_t blk_addr, prev_blk_addr = 0; - int sit_blk_cnt = SIT_BLK_CNT(sbi); - int blkno = start; - struct f2fs_io_info fio = { - .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO - }; - - for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { - - blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - - if (blkno != start && prev_blk_addr + 1 != blk_addr) - break; - prev_blk_addr = blk_addr; -repeat: - page = grab_cache_page(mapping, blk_addr); - if (!page) { - cond_resched(); - goto repeat; - } - if (PageUptodate(page)) { - mark_page_accessed(page); - f2fs_put_page(page, 1); - continue; - } - - f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); - - mark_page_accessed(page); - f2fs_put_page(page, 0); - } - - f2fs_submit_merged_bio(sbi, META, READ); - return blkno - start; -} - static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); @@ -1628,7 +1587,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); do { - readed = ra_sit_pages(sbi, start_blk, nrpages); + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; From 942e0be6219cc80384eb961feb963cab275bcbbf Mon Sep 17 00:00:00 2001 From: Changman Lee Date: Thu, 13 Feb 2014 15:12:29 +0900 Subject: [PATCH 13/62] f2fs: show counts of checkpoint in status This patch shows the counts of checkpoint in f2fs' status. Signed-off-by: Changman Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index deb60356f7cf..757b77b7118e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -914,6 +914,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) unblock_operations(sbi); mutex_unlock(&sbi->cp_mutex); + stat_inc_cp_count(sbi->stat_info); trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 3de9d20d0c14..46a12e46179a 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -236,6 +236,7 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "CP calls: %d\n", si->cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d\n", si->data_segs); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 817eccced9fe..91f4c5e7b6a2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1255,7 +1255,7 @@ struct f2fs_stat_info { int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count; + int prefree_count, call_count, cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int tot_blks, data_blks, node_blks; int curseg[NR_CURSEG_TYPE]; @@ -1272,6 +1272,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) return (struct f2fs_stat_info *)sbi->stat_info; } +#define stat_inc_cp_count(si) ((si)->cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) @@ -1326,6 +1327,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else +#define stat_inc_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) #define stat_inc_dirty_dir(sbi) From b63da15e8b475245026bdf2096853683f189706b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Feb 2014 12:44:20 +0900 Subject: [PATCH 14/62] f2fs: fix the calculation of max_nids Total nids that f2fs can use should not include 0, nid for node inode, and nid for meta inode. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7689f9105dc1..d452185c5eaa 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1811,7 +1811,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3; nm_i->fcnt = 0; nm_i->nat_cnt = 0; From 8618b881e9fd0e1817ff5cb7befadd3240d54830 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Feb 2014 19:29:27 +0900 Subject: [PATCH 15/62] f2fs: fix not to write data pages on the page reclaiming path Even if f2fs_write_data_page is called by the page reclaiming path, we should not write the page to provide enough free segments for the worst case scenario. Otherwise, f2fs can face with no free segment while gc is conducted, resulting in: ------------[ cut here ]------------ kernel BUG at /home/zeus/f2fs_test/src/fs/f2fs/segment.c:565! RIP: 0010:[] [] new_curseg+0x331/0x340 [f2fs] Call Trace: allocate_segment_by_default+0x204/0x280 [f2fs] allocate_data_block+0x108/0x210 [f2fs] write_data_page+0x8a/0xc0 [f2fs] do_write_data_page+0xe1/0x2a0 [f2fs] move_data_page+0x8a/0xf0 [f2fs] f2fs_gc+0x446/0x970 [f2fs] f2fs_balance_fs+0xb6/0xd0 [f2fs] f2fs_write_begin+0x50/0x350 [f2fs] ? unlock_page+0x27/0x30 ? unlock_page+0x27/0x30 generic_file_buffered_write+0x10a/0x280 ? file_update_time+0xa3/0xf0 __generic_file_aio_write+0x1c8/0x3d0 ? generic_file_aio_write+0x52/0xb0 ? generic_file_aio_write+0x52/0xb0 generic_file_aio_write+0x65/0xb0 do_sync_write+0x5a/0x90 vfs_write+0xc5/0x1f0 SyS_write+0x55/0xa0 system_call_fastpath+0x16/0x1b Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b401be71ecbd..93d80ea4674b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -805,38 +805,30 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (unlikely(sbi->por_doing)) { - err = AOP_WRITEPAGE_ACTIVATE; + if (unlikely(sbi->por_doing)) goto redirty_out; - } /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { inode_dec_dirty_dents(inode); err = do_write_data_page(page, &fio); - } else { - f2fs_lock_op(sbi); - - if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) { - err = f2fs_write_inline_data(inode, page, offset); - f2fs_unlock_op(sbi); - goto out; - } else { - err = do_write_data_page(page, &fio); - } - - f2fs_unlock_op(sbi); - need_balance_fs = true; + goto done; } - if (err == -ENOENT) - goto out; - else if (err) + + if (!wbc->for_reclaim) + need_balance_fs = true; + else if (has_not_enough_free_secs(sbi, 0)) goto redirty_out; - if (wbc->for_reclaim) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - need_balance_fs = false; - } + f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) + err = f2fs_write_inline_data(inode, page, offset); + else + err = do_write_data_page(page, &fio); + f2fs_unlock_op(sbi); +done: + if (err && err != -ENOENT) + goto redirty_out; clear_cold_data(page); out: @@ -848,7 +840,7 @@ out: redirty_out: wbc->pages_skipped++; set_page_dirty(page); - return err; + return AOP_WRITEPAGE_ACTIVATE; } #define MAX_DESIRED_PAGES_WP 4096 From 6437d1b0adb46f29aafcbf10950a89211028ca09 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Feb 2014 18:23:32 +0900 Subject: [PATCH 16/62] f2fs: fix to do build_stat prior to the recovery procedure At the end of the recovery procedure, write_checkpoint is called and updates the cp count which is managed by f2fs stat. But, previously build_stat() is called after the recovery procedure, which results in: BUG: unable to handle kernel NULL pointer dereference at 000000000000012c IP: [] write_checkpoint+0x720/0xbc0 [f2fs] Call Trace: [] ? mark_held_locks+0x74/0x140 [] ? __init_waitqueue_head+0x60/0x60 [] recover_fsync_data+0x656/0xf20 [f2fs] [] ? security_d_instantiate+0x1b/0x30 [] f2fs_fill_super+0x94d/0xa00 [f2fs] [] mount_bdev+0x1a5/0x1f0 [] ? __get_free_pages+0xe/0x40 [] ? f2fs_remount+0x130/0x130 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x43/0x1b0 [] vfs_kern_mount+0x74/0x160 [] ? __get_fs_type+0x51/0x60 [] do_mount+0x237/0xb50 [] ? copy_mount_options+0x3a/0x170 So, this patche changes the order of recovery_fsync_data() and f2fs_build_stats(). Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1a85f83abd53..475560e5ee71 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -989,28 +989,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { - err = recover_fsync_data(sbi); - if (err) - f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); - } - - /* - * If filesystem is not mounted as read-only then - * do start the gc_thread. - */ - if (!(sb->s_flags & MS_RDONLY)) { - /* After POR, we can run background GC thread.*/ - err = start_gc_thread(sbi); - if (err) - goto free_gc; - } - err = f2fs_build_stats(sbi); if (err) - goto free_gc; + goto free_root_inode; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -1032,17 +1013,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, "%s", sb->s_id); if (err) - goto fail; + goto free_proc; + /* recover fsynced data */ + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + err = recover_fsync_data(sbi); + if (err) + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + } + + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (!(sb->s_flags & MS_RDONLY)) { + /* After POR, we can run background GC thread.*/ + err = start_gc_thread(sbi); + if (err) + goto free_kobj; + } return 0; -fail: + +free_kobj: + kobject_del(&sbi->s_kobj); +free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); -free_gc: - stop_gc_thread(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; From fffc2a00fc01b781c1e3b9541e3e0f270c50ce90 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Feb 2014 13:17:22 +0900 Subject: [PATCH 17/62] f2fs: fix to mark the checkpointed nat entry correctly The nat cache entry maintains a status whether it is checkpointed or not. So, if a new cache entry is loaded from the last checkpoint, nat_entry->checkpointed should be true. If the cache entry is modified as being dirty, nat_entry->checkpoint should be false. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 7 +------ fs/f2fs/node.h | 10 ++++++++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d452185c5eaa..a070b1457d70 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -128,6 +128,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) } memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); + new->checkpointed = true; list_add_tail(&new->list, &nm_i->nat_entries); nm_i->nat_cnt++; return new; @@ -149,7 +150,6 @@ retry: nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); nat_set_ino(e, le32_to_cpu(ne->ino)); nat_set_version(e, ne->version); - e->checkpointed = true; } write_unlock(&nm_i->nat_tree_lock); } @@ -169,7 +169,6 @@ retry: goto retry; } e->ni = *ni; - e->checkpointed = true; f2fs_bug_on(ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* @@ -181,9 +180,6 @@ retry: f2fs_bug_on(ni->blk_addr != NULL_ADDR); } - if (new_blkaddr == NEW_ADDR) - e->checkpointed = false; - /* sanity check */ f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && @@ -1787,7 +1783,6 @@ flush_now: } else { write_lock(&nm_i->nat_tree_lock); __clear_nat_cache_dirty(nm_i, ne); - ne->checkpointed = true; write_unlock(&nm_i->nat_tree_lock); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c4c79885c993..4dea719766ef 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -58,9 +58,15 @@ struct nat_entry { #define nat_set_version(nat, v) (nat->ni.version = v) #define __set_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); + do { \ + ne->checkpointed = false; \ + list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ + } while (0); #define __clear_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->nat_entries); + do { \ + ne->checkpointed = true; \ + list_move_tail(&ne->list, &nm_i->nat_entries); \ + } while (0); #define inc_node_version(version) (++version) static inline void node_info_from_raw_nat(struct node_info *ni, From f978f5a0616d18f303d9c8f51c293a03bc09dbaf Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 21 Feb 2014 18:08:29 +0800 Subject: [PATCH 18/62] f2fs: introduce help macro on_build_free_nids() Introduce help macro on_build_free_nids() which just uses build_lock to judge whether the building free nid is going, so that we can remove the on_build_free_nids field from f2fs_sb_info. Signed-off-by: Gu Zheng [Jaegeuk Kim: remove an unnecessary white line removal] Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/node.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 91f4c5e7b6a2..c56e67b468da 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -417,7 +417,6 @@ struct f2fs_sb_info { struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ bool por_doing; /* recovery is doing or not */ - bool on_build_free_nids; /* build_free_nids is doing */ wait_queue_head_t cp_wait; /* for orphan inode management */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a070b1457d70..431bcb42cdd0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -21,6 +21,8 @@ #include "segment.h" #include +#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) + static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -1422,7 +1424,7 @@ retry: spin_lock(&nm_i->free_nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !sbi->on_build_free_nids) { + if (nm_i->fcnt && !on_build_free_nids(nm_i)) { f2fs_bug_on(list_empty(&nm_i->free_nid_list)); list_for_each(this, &nm_i->free_nid_list) { i = list_entry(this, struct free_nid, list); @@ -1441,9 +1443,7 @@ retry: /* Let's scan nat pages and its caches to get free nids */ mutex_lock(&nm_i->build_lock); - sbi->on_build_free_nids = true; build_free_nids(sbi); - sbi->on_build_free_nids = false; mutex_unlock(&nm_i->build_lock); goto retry; } From 8a7ed66aaf8ee56b0a6beee4d02e10af5a9e38b2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Feb 2014 14:29:35 +0900 Subject: [PATCH 19/62] f2fs: introduce a radix_tree for the free_nid list This patch introduces a radix tree for the list of free_nids, which enhances the performance on free nid management. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 36 +++++++++++++++++------------------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c56e67b468da..11fd8bec670b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -247,6 +247,7 @@ struct f2fs_nm_info { struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ /* free node ids management */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ struct list_head free_nid_list; /* a list for free nids */ spinlock_t free_nid_list_lock; /* protect free nid list */ unsigned int fcnt; /* the number of free node id */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 431bcb42cdd0..1f9cf2148816 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1269,21 +1269,17 @@ const struct address_space_operations f2fs_node_aops = { .releasepage = f2fs_release_node_page, }; -static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) { - struct list_head *this; - struct free_nid *i; - list_for_each(this, head) { - i = list_entry(this, struct free_nid, list); - if (i->nid == n) - return i; - } - return NULL; + return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct free_nid *i) +static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, + struct free_nid *i) { list_del(&i->list); + radix_tree_delete(&nm_i->free_nid_root, i->nid); kmem_cache_free(free_nid_slab, i); } @@ -1304,7 +1300,8 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) /* do not add allocated nids */ read_lock(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) + if (ne && + (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; read_unlock(&nm_i->nat_tree_lock); if (allocated) @@ -1316,7 +1313,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) i->state = NID_NEW; spin_lock(&nm_i->free_nid_list_lock); - if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { + if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { spin_unlock(&nm_i->free_nid_list_lock); kmem_cache_free(free_nid_slab, i); return 0; @@ -1331,9 +1328,9 @@ static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) { struct free_nid *i; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1457,9 +1454,9 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct free_nid *i; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); } @@ -1475,10 +1472,10 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) return; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); } else { i->state = NID_NEW; nm_i->fcnt++; @@ -1812,6 +1809,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->fcnt = 0; nm_i->nat_cnt = 0; + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -1865,7 +1863,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { f2fs_bug_on(i->state == NID_ALLOC); - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; } f2fs_bug_on(nm_i->fcnt); From 8b8343fa9d503894ece57acbe46cb36883646685 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Feb 2014 13:00:13 +0900 Subject: [PATCH 20/62] f2fs: implement a lock-free stat_show The stat_show is just to show the current status of f2fs. So, we can remove all the there-in locks. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 3 --- fs/f2fs/f2fs.h | 18 +++--------------- fs/f2fs/segment.h | 27 +++------------------------ 3 files changed, 6 insertions(+), 42 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 46a12e46179a..b7111c44a918 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -86,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; - struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, vblocks; int ndirty = 0; @@ -94,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) total_vblocks = 0; blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); hblks_per_sec = blks_per_sec / 2; - mutex_lock(&sit_i->sentry_lock); for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); dist = abs(vblocks - hblks_per_sec); @@ -105,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - mutex_unlock(&sit_i->sentry_lock); dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 11fd8bec670b..4beedccc28a0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -704,11 +704,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { - block_t ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_block_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_block_count; } static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) @@ -804,11 +800,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_node_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_node_count; } static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) @@ -829,11 +821,7 @@ static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_inode_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_inode_count; } static inline void f2fs_put_page(struct page *page, int unlock) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 4024546b6361..c3d5e3689ffc 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -380,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, static inline block_t written_block_count(struct f2fs_sb_info *sbi) { - struct sit_info *sit_i = SIT_I(sbi); - block_t vblocks; - - mutex_lock(&sit_i->sentry_lock); - vblocks = sit_i->written_valid_blocks; - mutex_unlock(&sit_i->sentry_lock); - - return vblocks; + return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_segs; - - read_lock(&free_i->segmap_lock); - free_segs = free_i->free_segments; - read_unlock(&free_i->segmap_lock); - - return free_segs; + return FREE_I(sbi)->free_segments; } static inline int reserved_segments(struct f2fs_sb_info *sbi) @@ -409,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi) static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_secs; - - read_lock(&free_i->segmap_lock); - free_secs = free_i->free_sections; - read_unlock(&free_i->segmap_lock); - - return free_secs; + return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) From 5d0c667121bfc8be76d1580f485bddbe73465d1a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Feb 2014 13:57:53 +0900 Subject: [PATCH 21/62] f2fs: remove costly bit operations for f2fs_find_entry It turns out that a bit operation like find_next_bit is not always fast enough for f2fs_find_entry. Instead, it is pretty much simple and fast to traverse each dentries. Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d5a2c9ed9aa7..c3ea8f8cc80a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -93,16 +93,20 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; - unsigned long bit_pos, end_pos, next_pos; + unsigned long bit_pos = 0; struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); - int slots; + int max_len = 0; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, 0); while (bit_pos < NR_DENTRY_IN_BLOCK) { de = &dentry_blk->dentry[bit_pos]; - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - + if (!test_bit_le(bit_pos, &dentry_blk->dentry_bitmap)) { + if (bit_pos == 0) + max_len = 1; + else if (!test_bit_le(bit_pos - 1, &dentry_blk->dentry_bitmap)) + max_len++; + bit_pos++; + continue; + } if (early_match_name(name, namelen, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], name, namelen)) { @@ -110,20 +114,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, goto found; } } - next_pos = bit_pos + slots; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, next_pos); - if (bit_pos >= NR_DENTRY_IN_BLOCK) - end_pos = NR_DENTRY_IN_BLOCK; - else - end_pos = bit_pos; - if (*max_slots < end_pos - next_pos) - *max_slots = end_pos - next_pos; + if (max_len > *max_slots) { + *max_slots = max_len; + max_len = 0; + } + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; kunmap(dentry_page); found: + if (max_len > *max_slots) + *max_slots = max_len; return de; } From 3843154598a00408f4214a68bd536fdf27b1df10 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Feb 2014 18:20:00 +0900 Subject: [PATCH 22/62] f2fs: introduce large directory support This patch introduces an i_dir_level field to support large directory. Previously, f2fs maintains multi-level hash tables to find a dentry quickly from a bunch of chiild dentries in a directory, and the hash tables consist of the following tree structure as below. In Documentation/filesystems/f2fs.txt, ---------------------- A : bucket B : block N : MAX_DIR_HASH_DEPTH ---------------------- level #0 | A(2B) | level #1 | A(2B) - A(2B) | level #2 | A(2B) - A(2B) - A(2B) - A(2B) . | . . . . level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B) . | . . . . level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B) But, if we can guess that a directory will handle a number of child files, we don't need to traverse the tree from level #0 to #N all the time. Since the lower level tables contain relatively small number of dentries, the miss ratio of the target dentry is likely to be high. In order to avoid that, we can configure the hash tables sparsely from level #0 like this. level #0 | A(2B) - A(2B) - A(2B) - A(2B) level #1 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B) . | . . . . level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B) . | . . . . level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B) With this structure, we can skip the ineffective tree searches in lower level hash tables. This patch adds just a facility for this by introducing i_dir_level in f2fs_inode. Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 6 ++++-- fs/f2fs/dir.c | 21 ++++++++++++--------- fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 2 ++ include/linux/f2fs_fs.h | 2 +- 5 files changed, 20 insertions(+), 12 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index b8d284975f0f..8eb06b0a7d2b 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -444,9 +444,11 @@ The number of blocks and buckets are determined by, # of blocks in level #n = | `- 4, Otherwise - ,- 2^n, if n < MAX_DIR_HASH_DEPTH / 2, + ,- 2^ (n + dir_level), + | if n < MAX_DIR_HASH_DEPTH / 2, # of buckets in level #n = | - `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1), Otherwise + `- 2^((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1), + Otherwise When F2FS finds a file name in a directory, at first a hash value of the file name is calculated. Then, F2FS scans the hash table in level #0 to find the diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c3ea8f8cc80a..582fa00f3597 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode) >> PAGE_CACHE_SHIFT; } -static unsigned int dir_buckets(unsigned int level) +static unsigned int dir_buckets(unsigned int level, int dir_level) { if (level < MAX_DIR_HASH_DEPTH / 2) - return 1 << level; + return 1 << (level + dir_level); else - return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1); + return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1); } static unsigned int bucket_blocks(unsigned int level) @@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -static unsigned long dir_block_index(unsigned int level, unsigned int idx) +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) - bidx += dir_buckets(i) * bucket_blocks(i); + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); bidx += idx * bucket_blocks(level); return bidx; } @@ -143,10 +144,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + le32_to_cpu(namehash) % nbucket); end_block = bidx + nblock; for (; bidx < end_block; bidx++) { @@ -467,10 +469,11 @@ start: if (level == current_depth) ++current_depth; - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4beedccc28a0..a82691674918 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -200,6 +200,7 @@ struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ + unsigned char i_dir_level; /* use for dentry level for large dir */ unsigned int i_current_depth; /* use only in directory structure */ unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 08d69c94ab8b..d518e37df3a7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -107,6 +107,7 @@ static int do_read_inode(struct inode *inode) fi->flags = 0; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); + fi->i_dir_level = ri->i_dir_level; get_extent_info(&fi->ext, ri->i_ext); get_inline_info(fi, ri); @@ -204,6 +205,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = F2FS_I(inode)->i_dir_level; __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index da74d878dc4f..df53e1753a76 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -183,7 +183,7 @@ struct f2fs_inode { __le32 i_pino; /* parent inode number */ __le32 i_namelen; /* file name length */ __u8 i_name[F2FS_NAME_LEN]; /* file name for SPOR */ - __u8 i_reserved2; /* for backward compatibility */ + __u8 i_dir_level; /* dentry_level for large dir */ struct f2fs_extent i_ext; /* caching a largest extent */ From ab9fa662e4867455f44f4de96d29a7f09cf292c6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Feb 2014 20:09:05 +0900 Subject: [PATCH 23/62] f2fs: add an sysfs entry to control the directory level This patch adds an sysfs entry to control dir_level used by the large directory. The description of this entry is: dir_level This parameter controls the directory level to support large directory. If a directory has a number of files, it can reduce the file lookup latency by increasing this dir_level value. Otherwise, it needs to decrease this value to reduce the space overhead. The default value is 0. Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 7 +++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 8eb06b0a7d2b..803784e1e8ef 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -195,6 +195,13 @@ Files in /sys/fs/f2fs/ cleaning operations. The default value is 4096 which covers 8GB block address range. + dir_level This parameter controls the directory level to + support large directory. If a directory has a + number of files, it can reduce the file lookup + latency by increasing this dir_level value. + Otherwise, it needs to decrease this value to + reduce the space overhead. The default value is 0. + ================================================================================ USAGE ================================================================================ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a82691674918..6f88191f2a34 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -196,6 +196,8 @@ struct extent_info { #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 +#define DEF_DIR_LEVEL 0 + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ @@ -447,6 +449,7 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ int active_logs; /* # of active logs */ + int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 475560e5ee71..1bd915362154 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -184,6 +184,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -196,6 +197,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), NULL, }; @@ -359,6 +361,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) if (test_opt(F2FS_SB(sb), INLINE_XATTR)) set_inode_flag(fi, FI_INLINE_XATTR); + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } @@ -785,6 +790,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + + sbi->dir_level = DEF_DIR_LEVEL; } /* From 81c1a0f13e6306a76fc3743b8504085d96659a5f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Feb 2014 19:12:24 +0800 Subject: [PATCH 24/62] f2fs: readahead contiguous SSA blocks for f2fs_gc If there are multi segments in one section, we will read those SSA blocks which have contiguous address one by one in f2fs_gc. It may lost performance, let's read ahead SSA blocks by merge multi read request. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 ++++-- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 5 +++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 757b77b7118e..c8516ee24126 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -82,6 +82,7 @@ inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; case META_SIT: return SIT_BLK_CNT(sbi); + case META_SSA: case META_CP: return 0; default: @@ -90,7 +91,7 @@ inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) } /* - * Readahead CP/NAT/SIT pages + * Readahead CP/NAT/SIT/SSA pages */ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) { @@ -125,8 +126,9 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) goto out; prev_blk_addr = blk_addr; break; + case META_SSA: case META_CP: - /* get cp block addr */ + /* get ssa/cp block addr */ blk_addr = blkno; break; default: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6f88191f2a34..bd6666e1bf2f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -89,12 +89,13 @@ enum { }; /* - * For CP/NAT/SIT readahead + * For CP/NAT/SIT/SSA readahead */ enum { META_CP, META_NAT, - META_SIT + META_SIT, + META_SSA }; /* for the list of orphan inodes */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b161db4a96a4..d94acbc3d928 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -708,6 +708,11 @@ gc_more: goto stop; ret = 0; + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, + META_SSA); + for (i = 0; i < sbi->segs_per_sec; i++) do_garbage_collect(sbi, segno + i, &ilist, gc_type); From 695fd1ed3bcaae9fc032cbe47f0fe9a934bf1717 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Feb 2014 19:52:21 +0800 Subject: [PATCH 25/62] f2fs: use existing macro to clean up some codes This patch use existing macro F2FS_INODE/NEXT_FREE_BLKADDR to clean up some codes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++---- fs/f2fs/recovery.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bd6666e1bf2f..8b2cb82f852b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1000,8 +1000,7 @@ static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) static inline void *inline_xattr_addr(struct page *page) { - struct f2fs_inode *ri; - ri = (struct f2fs_inode *)page_address(page); + struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1021,8 +1020,7 @@ static inline int f2fs_has_inline_data(struct inode *inode) static inline void *inline_data_addr(struct page *page) { - struct f2fs_inode *ri; - ri = (struct f2fs_inode *)page_address(page); + struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[1]); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 72adbbfdb3e5..aef77681e10b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -136,7 +136,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ page = alloc_page(GFP_F2FS_ZERO); From 9cf3c3898a274ca637b88ad01b0830550ee2d318 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Feb 2014 10:12:05 +0800 Subject: [PATCH 26/62] f2fs: fix dirty page accounting when redirty We should de-account dirty counters for page when redirty in ->writepage(). Wu Fengguang described in 'commit 971767caf632190f77a40b4011c19948232eed75': "writeback: fix dirtied pages accounting on redirty De-account the accumulative dirty counters on page redirty. Page redirties (very common in ext4) will introduce mismatch between counters (a) and (b) a) NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied b) NR_WRITTEN, BDI_WRITTEN This will introduce systematic errors in balanced_rate and result in dirty page position errors (ie. the dirty pages are no longer balanced around the global/bdi setpoints)." Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 1 + fs/f2fs/node.c | 1 + 3 files changed, 3 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c8516ee24126..f069249011b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -178,6 +178,7 @@ no_write: redirty_out: dec_page_count(sbi, F2FS_DIRTY_META); wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 93d80ea4674b..101b4cd4170d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -839,6 +839,7 @@ out: redirty_out: wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1f9cf2148816..8c1411060e7e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1193,6 +1193,7 @@ static int f2fs_write_node_page(struct page *page, redirty_out: dec_page_count(sbi, F2FS_DIRTY_NODES); wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } From c81bf1c84f6924678d088d68e131ff1f4d2d9002 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 3 Mar 2014 11:28:40 +0900 Subject: [PATCH 27/62] f2fs: fix to write node pages with WRITE_SYNC This patch fixes performance regression of dbench reported by Alex . This issue was revealed by Phoronix tests results: http://www.phoronix.com/scan.php?page=article&item=linux_314_ssdfs&num=2 It turns out that we need to assign WRITE_SYNC to the node writes, if fsync is triggered. The performance numbers are like below, which is measured by Alex. 1. 355MB/s ext4 2. 225MB/s f2fs : WRITE for node writes 3. 525MB/s f2fs : WRITE_SYNC for node writes Reported-And-Tested-by: Alex . Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 00f937ec4794..a4cc1d6bdd84 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -115,7 +115,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0; bool need_cp = false; struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; From 20f70751c6b4ac5055be9a0d8a3d3189a81afc5a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 5 Mar 2014 10:48:25 +0900 Subject: [PATCH 28/62] f2fs: fix wrong kernel coding style This patch includes a simple fix to adjust coding style. Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 582fa00f3597..f3a80ce9ddf5 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -96,18 +96,19 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); + const void *dentry_bits = &dentry_blk->dentry_bitmap; int max_len = 0; while (bit_pos < NR_DENTRY_IN_BLOCK) { - de = &dentry_blk->dentry[bit_pos]; - if (!test_bit_le(bit_pos, &dentry_blk->dentry_bitmap)) { + if (!test_bit_le(bit_pos, dentry_bits)) { if (bit_pos == 0) max_len = 1; - else if (!test_bit_le(bit_pos - 1, &dentry_blk->dentry_bitmap)) + else if (!test_bit_le(bit_pos - 1, dentry_bits)) max_len++; bit_pos++; continue; } + de = &dentry_blk->dentry[bit_pos]; if (early_match_name(name, namelen, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], name, namelen)) { From b6ce391e615175029cb8496f03afc9905e0957cc Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 7 Mar 2014 18:43:24 +0800 Subject: [PATCH 29/62] f2fs: update start nid only once each circle Integrated a couple of minor changes for better readability suggested by Chao Yu. Signed-off-by: Gu Zheng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8c1411060e7e..77b61893fc8d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1875,11 +1875,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; - for (idx = 0; idx < found; idx++) { - struct nat_entry *e = natvec[idx]; - nid = nat_get_nid(e) + 1; - __del_from_nat_cache(nm_i, e); - } + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) + __del_from_nat_cache(nm_i, natvec[idx]); } f2fs_bug_on(nm_i->nat_cnt); write_unlock(&nm_i->nat_tree_lock); From e8512d2e0c4eb38cd78b1499bb08d7d8eea6c723 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 7 Mar 2014 18:43:28 +0800 Subject: [PATCH 30/62] f2fs: remove the unused ctor argument of f2fs_kmem_cache_create() Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/gc.c | 2 +- fs/f2fs/node.c | 4 ++-- fs/f2fs/recovery.c | 2 +- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f069249011b2..911b6f9e9f7b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -939,11 +939,11 @@ void init_orphan_info(struct f2fs_sb_info *sbi) int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry), NULL); + sizeof(struct orphan_inode_entry)); if (!orphan_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", - sizeof(struct dir_inode_entry), NULL); + sizeof(struct dir_inode_entry)); if (!inode_entry_slab) { kmem_cache_destroy(orphan_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8b2cb82f852b..f845e9282f5e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -852,9 +852,9 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn) } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, - size_t size, void (*ctor)(void *)) + size_t size) { - return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d94acbc3d928..b90dbe55403a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -742,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", - sizeof(struct inode_entry), NULL); + sizeof(struct inode_entry)); if (!winode_slab) return -ENOMEM; return 0; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 77b61893fc8d..12c9ded767d9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1890,12 +1890,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", - sizeof(struct nat_entry), NULL); + sizeof(struct nat_entry)); if (!nat_entry_slab) return -ENOMEM; free_nid_slab = f2fs_kmem_cache_create("free_nid", - sizeof(struct free_nid), NULL); + sizeof(struct free_nid)); if (!free_nid_slab) { kmem_cache_destroy(nat_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index aef77681e10b..03b28ec4c2dc 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -436,7 +436,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry), NULL); + sizeof(struct fsync_inode_entry)); if (!fsync_entry_slab) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fbb41ba818fd..199c964680c5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1878,7 +1878,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) int __init create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("discard_entry", - sizeof(struct discard_entry), NULL); + sizeof(struct discard_entry)); if (!discard_entry_slab) return -ENOMEM; return 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1bd915362154..72df734764e7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1089,7 +1089,7 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info), NULL); + sizeof(struct f2fs_inode_info)); if (!f2fs_inode_cachep) return -ENOMEM; return 0; From 46c04366bbfd112a74dcfebbe41c9bf3f496ea75 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 7 Mar 2014 18:43:33 +0800 Subject: [PATCH 31/62] f2fs: format segment_info's show for better legibility The original segment_info's show is a bit out-of-format: [root@guz Demoes]# cat /proc/fs/f2fs/loop0/segment_info 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...... 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 [root@guz Demoes]# so we fix it here for better legibility. [root@guz Demoes]# cat /proc/fs/f2fs/loop0/segment_info 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...... 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 [root@guz Demoes]# Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 72df734764e7..6e4851ce029b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -546,11 +546,12 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) for (i = 0; i < total_segs; i++) { seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); - if (i != 0 && (i % 10) == 0) - seq_puts(seq, "\n"); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); else - seq_puts(seq, " "); + seq_putc(seq, ' '); } + return 0; } From d653788a43475eb3cdfcfaa60fb53451878944cf Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 7 Mar 2014 18:43:36 +0800 Subject: [PATCH 32/62] f2fs: optimize restore_node_summary slightly Previously, we ra_sum_pages to pre-read contiguous pages as more as possible, and if we fail to alloc more pages, an ENOMEM error will be reported upstream, even though we have alloced some pages yet. In fact, we can use the available pages to do the job partly, and continue the rest in the following circle. Only reporting ENOMEM upstream if we really can not alloc any available page. And another fix is ignoring dealing with the following pages if an EIO occurs when reading page from page_list. Signed-off-by: Gu Zheng Reviewed-by: Chao Yu [Jaegeuk Kim: modify the flow for better neat code] Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 28 ++++++++++++---------------- fs/f2fs/segment.c | 7 +++++-- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 12c9ded767d9..c415cec041b7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1588,15 +1588,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, for (; page_idx < start + nrpages; page_idx++) { /* alloc temporal page for read node summary info*/ page = alloc_page(GFP_F2FS_ZERO); - if (!page) { - struct page *tmp; - list_for_each_entry_safe(page, tmp, pages, lru) { - list_del(&page->lru); - unlock_page(page); - __free_pages(page, 0); - } - return -ENOMEM; - } + if (!page) + break; lock_page(page); page->index = page_idx; @@ -1607,7 +1600,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, f2fs_submit_page_mbio(sbi, page, page->index, &fio); f2fs_submit_merged_bio(sbi, META, READ); - return 0; + + return page_idx - start; } int restore_node_summary(struct f2fs_sb_info *sbi, @@ -1626,15 +1620,17 @@ int restore_node_summary(struct f2fs_sb_info *sbi, addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { nrpages = min(last_offset - i, bio_blocks); /* read ahead node pages */ - err = ra_sum_pages(sbi, &page_list, addr, nrpages); - if (err) - return err; + nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages); + if (!nrpages) + return -ENOMEM; list_for_each_entry_safe(page, tmp, &page_list, lru) { + if (err) + goto skip; lock_page(page); if (unlikely(!PageUptodate(page))) { @@ -1646,9 +1642,9 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry->ofs_in_node = 0; sum_entry++; } - - list_del(&page->lru); unlock_page(page); +skip: + list_del(&page->lru); __free_pages(page, 0); } } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 199c964680c5..b3f84318b7ed 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1160,9 +1160,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - if (restore_node_summary(sbi, segno, sum)) { + int err; + + err = restore_node_summary(sbi, segno, sum); + if (err) { f2fs_put_page(new, 1); - return -EINVAL; + return err; } } } From 28cdce0459ccea71ea734d7903d39910d1c6a05d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Mar 2014 13:37:38 +0800 Subject: [PATCH 33/62] f2fs: recover inline xattr data in roll-forward process Previously we do not recover inline xattr data of inode after power-cut, so inline xattr data may be lost. We should recover the data during the roll-forward process. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c415cec041b7..e72b2585de68 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1493,6 +1493,37 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, clear_node_page_dirty(page); } +void recover_inline_xattr(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + void *src_addr, *dst_addr; + size_t inline_size; + struct page *ipage; + struct f2fs_inode *ri; + + if (!is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + return; + + if (!IS_INODE(page)) + return; + + ri = F2FS_INODE(page); + if (!(ri->i_inline & F2FS_INLINE_XATTR)) + return; + + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + dst_addr = inline_xattr_addr(ipage); + src_addr = inline_xattr_addr(page); + inline_size = inline_xattr_size(inode); + + memcpy(dst_addr, src_addr, inline_size); + + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); +} + bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -1500,6 +1531,8 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) nid_t new_xnid = nid_of_node(page); struct node_info ni; + recover_inline_xattr(inode, page); + if (ofs_of_node(page) != XATTR_NODE_OFFSET) return false; From 987c7c31123fd36c1f792ff53ff131378475f5c8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Mar 2014 15:59:03 +0800 Subject: [PATCH 34/62] f2fs: introduce f2fs_has_inline_xattr for better readability This patch introduces a help function f2fs_has_inline_xattr for better readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 9 +++++++-- fs/f2fs/node.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f845e9282f5e..1f87a04f1441 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -991,9 +991,14 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, ri->i_inline |= F2FS_INLINE_DATA; } +static inline int f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); +} + static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) { - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(&fi->vfs_inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } @@ -1007,7 +1012,7 @@ static inline void *inline_xattr_addr(struct page *page) static inline int inline_xattr_size(struct inode *inode) { - if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(inode)) return F2FS_INLINE_XATTR_ADDRS << 2; else return 0; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e72b2585de68..c618fad3e6c3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1501,7 +1501,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) struct page *ipage; struct f2fs_inode *ri; - if (!is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + if (!f2fs_has_inline_xattr(inode)) return; if (!IS_INODE(page)) From 910bb12d29cc64144506333bfeaeeee9715c3872 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Mar 2014 17:08:36 +0800 Subject: [PATCH 35/62] f2fs: check upper bound of ino value in f2fs_nfs_get_inode Upper bound checking of ino should be added to f2fs_nfs_get_inode, so unneeded process before do_read_inode in f2fs_iget could be avoided when ino is invalid. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6e4851ce029b..3a51d7a4a6c9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -644,6 +644,8 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, if (unlikely(ino < F2FS_ROOT_INO(sbi))) return ERR_PTR(-ESTALE); + if (unlikely(ino >= NM_I(sbi)->max_nid)) + return ERR_PTR(-ESTALE); /* * f2fs_iget isn't quite right if the inode is currently unallocated! From 90aa6dc9b9dd9b3ee832bb6e91e2d8ba8ebb93b6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 17 Mar 2014 10:31:06 +0800 Subject: [PATCH 36/62] f2fs: print type for each segment in segment_info's show The original segment_info's show looks out-of-format: cat /proc/fs/f2fs/loop0/segment_info 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 512 512 512 512 512 512 512 512 0 0 512 348 0 263 0 0 512 0 0 512 512 512 512 0 512 512 512 512 512 512 512 512 512 511 328 512 512 512 512 512 512 512 512 512 512 512 512 512 0 0 175 Let's fix this and show type for each segment. cat /proc/fs/f2fs/loop0/segment_info format: segment_type|valid_blocks segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN) 0 2|0 1|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 10 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 20 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 30 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 40 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 50 3|0 3|0 3|0 3|0 3|0 3|0 3|0 0|0 3|0 3|0 60 3|0 3|0 3|0 3|0 3|0 3|0 3|0 3|0 3|0 3|512 70 3|512 3|512 3|512 3|512 3|512 3|512 3|512 3|0 3|0 3|512 80 3|0 3|0 3|0 3|0 3|0 3|512 3|0 3|0 3|512 3|512 90 3|512 0|512 3|274 0|512 0|512 0|512 0|512 0|512 0|512 3|512 100 3|512 0|512 3|511 0|328 3|512 0|512 0|512 3|512 0|512 0|512 110 0|512 0|512 0|512 0|512 0|512 0|512 0|512 5|0 4|0 3|512 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a51d7a4a6c9..057a3efb2487 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -544,8 +544,16 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) le32_to_cpu(sbi->raw_super->segment_count_main); int i; + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + for (i = 0; i < total_segs; i++) { - seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-5d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, 1)); if ((i % 10) == 9 || i == (total_segs - 1)) seq_putc(seq, '\n'); else From 4bc8e9bcf50103216a7a316ab66b9bb8e81baa27 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 17 Mar 2014 16:35:06 +0800 Subject: [PATCH 37/62] f2fs: introduce f2fs_has_xattr_block for better readability This patch introduces a help function f2fs_has_xattr_block for better readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/node.c | 4 ++-- fs/f2fs/node.h | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1f87a04f1441..292cc3c25b28 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -637,6 +637,11 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode) return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; } +static inline bool f2fs_has_xattr_block(unsigned int ofs) +{ + return ofs == XATTR_NODE_OFFSET; +} + static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t count) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c618fad3e6c3..3e36240d81c1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -836,7 +836,7 @@ struct page *new_node_page(struct dnode_of_data *dn, SetPageUptodate(page); set_page_dirty(page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) F2FS_I(dn->inode)->i_xattr_nid = dn->nid; dn->node_page = page; @@ -1533,7 +1533,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) recover_inline_xattr(inode, page); - if (ofs_of_node(page) != XATTR_NODE_OFFSET) + if (!f2fs_has_xattr_block(ofs_of_node(page))) return false; /* 1: invalidate the previous xattr nid */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 4dea719766ef..ee6d28684ee8 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -242,7 +242,7 @@ static inline bool IS_DNODE(struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) return false; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || From e4fc5fbfc9e285356be7e5208bb1a2fa377b2656 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 17 Mar 2014 16:36:24 +0800 Subject: [PATCH 38/62] f2fs: avoid to return incorrect errno of read_normal_summaries We should return error number of read_normal_summaries instead of -EINVAL when read_normal_summaries failed. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b3f84318b7ed..6c5a4f0218ca 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1186,6 +1186,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { int type = CURSEG_HOT_DATA; + int err; if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { /* restore for compacted data summary */ @@ -1194,9 +1195,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) type = CURSEG_HOT_NODE; } - for (; type <= CURSEG_COLD_NODE; type++) - if (read_normal_summaries(sbi, type)) - return -EINVAL; + for (; type <= CURSEG_COLD_NODE; type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + return 0; } From 04c0938844695ab97b79a477a9f57748fe97d2f5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 Mar 2014 09:07:59 +0800 Subject: [PATCH 39/62] f2fs: fix incorrect parsing with option string Previously 'background_gc={on***,off***}' is being parsed as correct option, with this patch we cloud fix the trivial bug in mount process. Change log from v1: o need to check length of parameter suggested by Jaegeuk Kim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 057a3efb2487..dbe402b1a4b7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -258,9 +258,9 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (!strncmp(name, "on", 2)) + if (strlen(name) == 2 && !strncmp(name, "on", 2)) set_opt(sbi, BG_GC); - else if (!strncmp(name, "off", 3)) + else if (strlen(name) == 3 && !strncmp(name, "off", 3)) clear_opt(sbi, BG_GC); else { kfree(name); From f8b2c1f940dca2843fe13b55ba5868bac8040551 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 12:33:06 +0900 Subject: [PATCH 40/62] f2fs: introduce get_dirty_dents for readability The get_dirty_dents gives us the number of dirty dentry pages. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/inode.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 911b6f9e9f7b..4c0e98ddf3db 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -619,7 +619,7 @@ void remove_dirty_dir_inode(struct inode *inode) return; spin_lock(&sbi->dir_inode_lock); - if (atomic_read(&F2FS_I(inode)->dirty_dents)) { + if (get_dirty_dents(inode)) { spin_unlock(&sbi->dir_inode_lock); return; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 292cc3c25b28..59fac1a9f0c2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -704,6 +704,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } +static inline int get_dirty_dents(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_dents); +} + static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { unsigned int pages_per_sec = sbi->segs_per_sec * diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d518e37df3a7..0d8e4a2302b7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -273,7 +273,7 @@ void f2fs_evict_inode(struct inode *inode) inode->i_ino == F2FS_META_INO(sbi)) goto no_delete; - f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents)); + f2fs_bug_on(get_dirty_dents(inode)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) From 87d6f890944d092c4ef5b84053f0d0d5d8137b0b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 12:40:49 +0900 Subject: [PATCH 41/62] f2fs: avoid small data writes by skipping writepages This patch introduces nr_pages_to_skip(sbi, type) to determine writepages can be skipped. The dentry, node, and meta pages can be conrolled by F2FS without breaking the FS consistency. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 4 ++++ fs/f2fs/node.c | 8 +------- fs/f2fs/segment.h | 19 +++++++++++++++++++ 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4c0e98ddf3db..1f52b70ff9d1 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -187,7 +187,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int nrpages = nr_pages_to_skip(sbi, META); long written; if (wbc->for_kupdate) @@ -682,7 +682,7 @@ retry: inode = igrab(entry->inode); spin_unlock(&sbi->dir_inode_lock); if (inode) { - filemap_flush(inode->i_mapping); + filemap_fdatawrite(inode->i_mapping); iput(inode); } else { /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 101b4cd4170d..e3b7cfa17b99 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -868,6 +868,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!mapping->a_ops->writepage) return 0; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) + return 0; + if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { desired_nrtw = MAX_DESIRED_PAGES_WP; excess_nrtw = desired_nrtw - wbc->nr_to_write; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3e36240d81c1..cb514f1896ab 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1198,12 +1198,6 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -/* - * It is very important to gather dirty pages and write at once, so that we can - * submit a big bio without interfering other data writes. - * Be default, 512 pages (2MB) * 3 node types, is more reasonable. - */ -#define COLLECT_DIRTY_NODES 1536 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1214,7 +1208,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) + if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) return 0; /* if mounting is failed, skip writing node pages */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index c3d5e3689ffc..bbd976100d14 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -664,3 +664,22 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) struct request_queue *q = bdev_get_queue(bdev); return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); } + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 3 for three types of nodes, and + * max_bio_blocks for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) + return 3 * sbi->blocks_per_seg; + else if (type == META) + return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + else + return 0; +} From d3baf95da5b0bce9fe980eeff6140817d63fabdf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 13:43:05 +0900 Subject: [PATCH 42/62] f2fs: increase pages_skipped when skipping writepages This patch increases pages_skipped when skipping writepages. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 ++++++----- fs/f2fs/data.c | 6 +++++- fs/f2fs/node.c | 6 +++++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1f52b70ff9d1..aef32f36e2f3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -190,12 +190,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, int nrpages = nr_pages_to_skip(sbi, META); long written; - if (wbc->for_kupdate) - return 0; - /* collect a number of dirty meta pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) - return 0; + if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nrpages) + goto skip_write; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); @@ -203,6 +200,10 @@ static int f2fs_write_meta_pages(struct address_space *mapping, mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write -= written; return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + return 0; } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e3b7cfa17b99..3bc380942068 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -870,7 +870,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) - return 0; + goto skip_write; if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { desired_nrtw = MAX_DESIRED_PAGES_WP; @@ -892,6 +892,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, wbc->nr_to_write -= excess_nrtw; return ret; + +skip_write: + wbc->pages_skipped += get_dirty_dents(inode); + return 0; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cb514f1896ab..7cc146bcbfed 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1209,7 +1209,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, /* collect a number of dirty node pages and write together */ if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) - return 0; + goto skip_write; /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = 3 * max_hw_blocks(sbi); @@ -1218,6 +1218,10 @@ static int f2fs_write_node_pages(struct address_space *mapping, wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - wbc->nr_to_write); return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + return 0; } static int f2fs_set_node_page_dirty(struct page *page) From 50c8cdb35ad8016c52fb2326ef9d65542e3a3e1b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 13:47:11 +0900 Subject: [PATCH 43/62] f2fs: introduce nr_pages_to_write for segment alignment This patch introduces nr_pages_to_write to align page writes to the segment or other operational unit size, which can be tuned according to the system environment. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 ++++++----- fs/f2fs/data.c | 12 +++--------- fs/f2fs/node.c | 8 +++----- fs/f2fs/segment.h | 24 ++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index aef32f36e2f3..2a00c94726fb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -187,18 +187,19 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - int nrpages = nr_pages_to_skip(sbi, META); - long written; + long diff, written; /* collect a number of dirty meta pages and write together */ - if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nrpages) + if (wbc->for_kupdate || + get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, nrpages); + diff = nr_pages_to_write(sbi, META, wbc); + written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); - wbc->nr_to_write -= written; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; skip_write: diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3bc380942068..b0c923aef229 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -844,8 +844,6 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -#define MAX_DESIRED_PAGES_WP 4096 - static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -862,7 +860,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); bool locked = false; int ret; - long excess_nrtw = 0, desired_nrtw; + long diff; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -872,11 +870,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) goto skip_write; - if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { - desired_nrtw = MAX_DESIRED_PAGES_WP; - excess_nrtw = desired_nrtw - wbc->nr_to_write; - wbc->nr_to_write = desired_nrtw; - } + diff = nr_pages_to_write(sbi, DATA, wbc); if (!S_ISDIR(inode->i_mode)) { mutex_lock(&sbi->writepages); @@ -890,7 +884,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, remove_dirty_dir_inode(inode); - wbc->nr_to_write -= excess_nrtw; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; skip_write: diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7cc146bcbfed..5e9c38e846a5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1202,7 +1202,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - long nr_to_write = wbc->nr_to_write; + long diff; /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1211,12 +1211,10 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; - /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = 3 * max_hw_blocks(sbi); + diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - - wbc->nr_to_write); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; skip_write: diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index bbd976100d14..9fc46ee27bb8 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -683,3 +683,27 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else return 0; } + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + + if (type == DATA) + desired = 4096; + else if (type == NODE) + desired = 3 * max_hw_blocks(sbi); + else + desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} From 3cb5ad152b54430f3e5f338c15f8cd434e7160c8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Mar 2014 13:29:07 +0900 Subject: [PATCH 44/62] f2fs: call f2fs_wait_on_page_writeback instead of native function If a page is on writeback, f2fs can face with deadlock due to under writepages. This is caused by merging IOs inside f2fs, so if it comes to detect, let's throw merged IOs, which is implemented by f2fs_wait_on_page_writeback. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 ++---- fs/f2fs/dir.c | 9 +++------ fs/f2fs/file.c | 6 +++--- fs/f2fs/node.c | 12 +++++++----- fs/f2fs/recovery.c | 2 +- 5 files changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2a00c94726fb..a80be5121e8a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -33,14 +33,12 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); if (!page) { cond_resched(); goto repeat; } - /* We wait writeback only inside grab_meta_page() */ - wait_on_page_writeback(page); SetPageUptodate(page); return page; } @@ -168,7 +166,7 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) goto no_write; - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); no_write: dec_page_count(sbi, F2FS_DIRTY_META); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f3a80ce9ddf5..7c9b17c03675 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -253,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); kunmap(page); @@ -352,14 +352,11 @@ static struct page *init_inode_metadata(struct inode *inode, err = f2fs_init_security(inode, dir, name, page); if (err) goto put_error; - - wait_on_page_writeback(page); } else { page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); if (IS_ERR(page)) return page; - wait_on_page_writeback(page); set_cold_node(inode, page); } @@ -494,7 +491,7 @@ start: ++level; goto start; add_dentry: - wait_on_page_writeback(dentry_page); + f2fs_wait_on_page_writeback(dentry_page, DATA); page = init_inode_metadata(inode, dir, name); if (IS_ERR(page)) { @@ -543,7 +540,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int i; lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); dentry_blk = (struct f2fs_dentry_block *)kaddr; bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a4cc1d6bdd84..e755ee57e042 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -76,7 +76,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -245,7 +245,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) f2fs_put_page(page, 1); return; } - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); f2fs_put_page(page, 1); @@ -422,7 +422,7 @@ static void fill_zero(struct inode *inode, pgoff_t index, f2fs_unlock_op(sbi); if (!IS_ERR(page)) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5e9c38e846a5..9a6d8bbf0bd7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -725,7 +725,7 @@ skip_partial: f2fs_put_page(page, 1); goto restart; } - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -814,7 +814,8 @@ struct page *new_node_page(struct dnode_of_data *dn, if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), + dn->nid, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -910,7 +911,8 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) struct page *page; int err; repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), + nid, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -1130,7 +1132,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) continue; if (ino && ino_of_node(page) == ino) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); if (TestClearPageError(page)) ret = -EIO; } @@ -1163,7 +1165,7 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(sbi->por_doing)) goto redirty_out; - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); /* get old block addr of this node page */ nid = nid_of_node(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 03b28ec4c2dc..bbef4ed157a7 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -316,7 +316,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; } - wait_on_page_writeback(dn.node_page); + f2fs_wait_on_page_writeback(dn.node_page, NODE); get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(ni.ino != ino_of_node(page)); From 40bb0058c871c6ddcd4aff9fe2f5224e59aba47b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 10:43:59 +0900 Subject: [PATCH 45/62] f2fs: avoid to drop nat entries due to the negative nr_shrink The try_to_free_nats should not receive the negative nr_shrink. Otherwise, it can drop all the nat entries by the while loop. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9a6d8bbf0bd7..d27e65a1fb0b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -208,7 +208,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) + if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD || nr_shrink <= 0) return 0; write_lock(&nm_i->nat_tree_lock); From cdfc41c134d48c1923066bcfa6630b94588ad6bc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 13:31:37 +0900 Subject: [PATCH 46/62] f2fs: throttle the memory footprint with a sysfs entry This patch introduces ram_thresh, a sysfs entry, which controls the memory footprint used by the free nid list and the nat cache. Previously, the free nid list was controlled by MAX_FREE_NIDS, while the nat cache was managed by NM_WOUT_THRESHOLD. However, this approach cannot be applied dynamically according to the system. So, this patch adds ram_thresh that users can specify the threshold, which is in order of 1 / 1024. For example, if the total ram size is 4GB and the value is set to 10 by default, f2fs tries to control the number of free nids and nat caches not to consume over 10 * (4GB / 1024) = 10MB. Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ Documentation/filesystems/f2fs.txt | 4 ++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 23 ++++++++++++++++++++--- fs/f2fs/node.h | 11 ++++++++--- fs/f2fs/super.c | 5 +++++ 6 files changed, 44 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 32b0809203dd..4ac2c25025ea 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -55,3 +55,9 @@ Date: January 2014 Contact: "Jaegeuk Kim" Description: Controls the number of trials to find a victim segment. + +What: /sys/fs/f2fs//ram_thresh +Date: March 2014 +Contact: "Jaegeuk Kim" +Description: + Controls the memory footprint used by f2fs. diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 803784e1e8ef..f415b9fc4cc8 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -202,6 +202,10 @@ Files in /sys/fs/f2fs/ Otherwise, it needs to decrease this value to reduce the space overhead. The default value is 0. + ram_thresh This parameter controls the memory footprint used + by free nids and cached nat entries. By default, + 10 is set, which indicates 10 MB / 1 GB RAM. + ================================================================================ USAGE ================================================================================ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 59fac1a9f0c2..05c652493f04 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -242,6 +242,7 @@ struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + unsigned int ram_thresh; /* control the memory footprint */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d27e65a1fb0b..fec4967fb8d2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -26,6 +26,22 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type) +{ + struct sysinfo val; + unsigned long mem_size = 0; + + si_meminfo(&val); + if (type == FREE_NIDS) + mem_size = nm_i->fcnt * sizeof(struct free_nid); + else if (type == NAT_ENTRIES) + mem_size += nm_i->nat_cnt * sizeof(struct nat_entry); + mem_size >>= 12; + + /* give 50:50 memory for free nids and nat caches respectively */ + return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11)); +} + static void clear_node_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -208,7 +224,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD || nr_shrink <= 0) + if (available_free_memory(nm_i, NAT_ENTRIES) || nr_shrink <= 0) return 0; write_lock(&nm_i->nat_tree_lock); @@ -1288,7 +1304,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) struct nat_entry *ne; bool allocated = false; - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + if (!available_free_memory(nm_i, FREE_NIDS)) return -1; /* 0 nid should not be used */ @@ -1473,7 +1489,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { + if (!available_free_memory(nm_i, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); } else { i->state = NID_NEW; @@ -1836,6 +1852,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3; nm_i->fcnt = 0; nm_i->nat_cnt = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index ee6d28684ee8..c97215432100 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -17,15 +17,15 @@ /* # of pages to perform readahead before building free nids */ #define FREE_NID_PAGES 4 -/* maximum # of free node ids to produce during build_free_nids */ -#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) - /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* maximum cached nat entries to manage memory footprint */ #define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 10 + /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -77,6 +77,11 @@ static inline void node_info_from_raw_nat(struct node_info *ni, ni->version = raw_ne->version; } +enum nid_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES /* indicates the cached nat entry */ +}; + /* * For free nid mangement */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dbe402b1a4b7..34c47b2010bc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -74,6 +74,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ }; @@ -92,6 +93,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; return NULL; @@ -183,6 +186,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); @@ -198,6 +202,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_ipu_util), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), NULL, }; From a5f420101db326e27ef5c2ab737c8c1b0e3559e3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 13:45:52 +0900 Subject: [PATCH 47/62] f2fs: remove unnecessary threshold The NM_WOUT_THRESHOLD is now obsolete since f2fs starts to control on a basis of the memory footprint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 8 ++++---- fs/f2fs/node.c | 5 +---- fs/f2fs/node.h | 3 --- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b7111c44a918..b52c12cf5873 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -250,10 +250,10 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_dent, si->ndirty_dirs); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs: %5d > %lu\n", - si->nats, NM_WOUT_THRESHOLD); - seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", - si->sits, si->fnids); + seq_printf(s, " - NATs: %9d\n - SITs: %9d\n", + si->nats, si->sits); + seq_printf(s, " - free_nids: %9d\n", + si->fnids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index fec4967fb8d2..daf644c57eae 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -224,7 +224,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (available_free_memory(nm_i, NAT_ENTRIES) || nr_shrink <= 0) + if (available_free_memory(nm_i, NAT_ENTRIES)) return 0; write_lock(&nm_i->nat_tree_lock); @@ -1830,9 +1830,6 @@ flush_now: if (!flushed) mutex_unlock(&curseg->curseg_mutex); f2fs_put_page(page, 1); - - /* 2) shrink nat caches if necessary */ - try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); } static int init_node_manager(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c97215432100..c2015b71ff87 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -20,9 +20,6 @@ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 -/* maximum cached nat entries to manage memory footprint */ -#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) - /* control the memory footprint threshold (10MB per 1GB ram) */ #define DEF_RAM_THRESHOLD 10 From 9179682591c231cc77840d4e75929cc1fa573365 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 13:40:09 +0900 Subject: [PATCH 48/62] f2fs: add missing documentation for dir_level This patch adds missing dir_level documentation. Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 4ac2c25025ea..62dd72522d6e 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -56,6 +56,12 @@ Contact: "Jaegeuk Kim" Description: Controls the number of trials to find a victim segment. +What: /sys/fs/f2fs//dir_level +Date: March 2014 +Contact: "Jaegeuk Kim" +Description: + Controls the directory level for large directory. + What: /sys/fs/f2fs//ram_thresh Date: March 2014 Contact: "Jaegeuk Kim" From 58c410351eba3d24f741c85a0eb9eaf15c94047d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Mar 2014 14:17:21 +0900 Subject: [PATCH 49/62] f2fs: change reclaim rate in percentage It is more reasonable to determine the reclaiming rate of prefree segments according to the volume size, which is set to 5% by default. For example, if the volume is 128GB, the prefree segments are reclaimed when the number reaches to 6.4GB. Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 8 +++++--- fs/f2fs/segment.c | 3 ++- fs/f2fs/segment.h | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index f415b9fc4cc8..2f6d0218dd22 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -169,9 +169,11 @@ Files in /sys/fs/f2fs/ reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree - segments is larger than this number, f2fs tries to - conduct checkpoint to reclaim the prefree segments - to free segments. By default, 100 segments, 200MB. + segments is larger than the number of segments + in the proportion to the percentage over total + volume size, f2fs tries to conduct checkpoint to + reclaim the prefree segments to free segments. + By default, 5% over total # of segments. max_small_discards This parameter controls the number of discard commands that consist small blocks less than 2MB. diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6c5a4f0218ca..e7ff23a536a4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1758,7 +1758,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); - sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; sm_info->ipu_policy = F2FS_IPU_DISABLE; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9fc46ee27bb8..7091204680f4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -14,7 +14,7 @@ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) -#define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */ +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) From d928bfbfe77aa457b765c19e9db8cd4cc72b3c89 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Mar 2014 19:10:08 +0900 Subject: [PATCH 50/62] f2fs: introduce fi->i_sem to protect fi's info This patch introduces fi->i_sem to protect fi's info that includes xattr_ver, pino, i_nlink. This enables to remove i_mutex during f2fs_sync_file, resulting in performance improvement when a number of fsync calls are triggered from many concurrent threads. Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 14 ++++++++++---- fs/f2fs/namei.c | 7 +++++++ fs/f2fs/super.c | 1 + fs/f2fs/xattr.c | 3 +++ 6 files changed, 28 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7c9b17c03675..972fd0ef230f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -493,6 +493,7 @@ start: add_dentry: f2fs_wait_on_page_writeback(dentry_page, DATA); + down_write(&F2FS_I(inode)->i_sem); page = init_inode_metadata(inode, dir, name); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -515,6 +516,8 @@ add_dentry: update_parent_metadata(dir, inode, current_depth); fail: + up_write(&F2FS_I(inode)->i_sem); + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { update_inode_page(dir); clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); @@ -559,6 +562,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + down_write(&F2FS_I(inode)->i_sem); + if (S_ISDIR(inode->i_mode)) { drop_nlink(dir); update_inode_page(dir); @@ -569,6 +574,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, drop_nlink(inode); i_size_write(inode, 0); } + up_write(&F2FS_I(inode)->i_sem); update_inode_page(inode); if (inode->i_nlink == 0) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 05c652493f04..469779ab1d77 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -210,6 +210,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ + struct rw_semaphore i_sem; /* protect fi info */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e755ee57e042..a9474cd4e0db 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -111,6 +111,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; + struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int ret = 0; bool need_cp = false; @@ -133,7 +134,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); - mutex_lock(&inode->i_mutex); + down_read(&fi->i_sem); /* * Both of fdatasync() and fsync() are able to be recovered from @@ -150,21 +151,27 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) need_cp = true; + up_read(&fi->i_sem); + if (need_cp) { nid_t pino; - F2FS_I(inode)->xattr_ver = 0; - /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); + + down_write(&fi->i_sem); + F2FS_I(inode)->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { F2FS_I(inode)->i_pino = pino; file_got_pino(inode); + up_write(&fi->i_sem); mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; + } else { + up_write(&fi->i_sem); } } else { /* if there is no written node page, write its inode page */ @@ -180,7 +187,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); } out: - mutex_unlock(&inode->i_mutex); trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 397d459e97bf..0cea87437a60 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -424,12 +424,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); + down_write(&F2FS_I(old_inode)->i_sem); F2FS_I(old_inode)->i_pino = new_dir->i_ino; + up_write(&F2FS_I(old_inode)->i_sem); new_inode->i_ctime = CURRENT_TIME; + down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + mark_inode_dirty(new_inode); if (!new_inode->i_nlink) @@ -459,7 +464,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + down_write(&F2FS_I(old_inode)->i_sem); F2FS_I(old_inode)->i_pino = new_dir->i_ino; + up_write(&F2FS_I(old_inode)->i_sem); update_inode_page(old_inode); } else { kunmap(old_dir_page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 34c47b2010bc..89ea046c846d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -360,6 +360,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; rwlock_init(&fi->ext.ext_lock); + init_rwsem(&fi->i_sem); set_inode_flag(fi, FI_NEW_INODE); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 89d0422a91a8..84191307a670 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -590,7 +590,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, f2fs_balance_fs(sbi); f2fs_lock_op(sbi); + /* protect xattr_ver */ + down_write(&F2FS_I(inode)->i_sem); err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); + up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); return err; From 479f40c44ae30e02642ce0391be707a53852d545 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Mar 2014 21:52:53 +0900 Subject: [PATCH 51/62] f2fs: skip unnecessary node writes during fsync If multiple redundant fsync calls are triggered, we don't need to write its node pages with fsync mark continuously. So, this patch adds FI_NEED_FSYNC to track whether the latest node block is written with the fsync mark or not. If the mark was set, a new fsync doesn't need to write a node block. Otherwise, we should do a new node block with the mark for roll-forward recovery. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 2 ++ fs/f2fs/node.c | 37 ++++++++++++++++++++++++++++--------- fs/f2fs/node.h | 1 + 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 469779ab1d77..f83433e4b043 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1126,6 +1126,7 @@ struct dnode_of_data; struct node_info; int is_checkpointed_node(struct f2fs_sb_info *, nid_t); +bool fsync_mark_done(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a9474cd4e0db..6ba26680c468 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -176,6 +176,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { + if (fsync_mark_done(sbi, inode->i_ino)) + goto out; mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index daf644c57eae..eced8d7bf502 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -133,6 +133,20 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) return is_cp; } +bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool fsync_done = false; + + read_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) + fsync_done = e->fsync_done; + read_unlock(&nm_i->nat_tree_lock); + return fsync_done; +} + static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; @@ -173,7 +187,7 @@ retry: } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, - block_t new_blkaddr) + block_t new_blkaddr, bool fsync_done) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -217,6 +231,11 @@ retry: /* change address */ nat_set_blkaddr(e, new_blkaddr); __set_nat_cache_dirty(nm_i, e); + + /* update fsync_mark if its inode nat entry is still alive */ + e = __lookup_nat_cache(nm_i, ni->ino); + if (e) + e->fsync_done = fsync_done; write_unlock(&nm_i->nat_tree_lock); } @@ -483,7 +502,7 @@ static void truncate_node(struct dnode_of_data *dn) /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode); - set_node_addr(sbi, &ni, NULL_ADDR); + set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); @@ -846,7 +865,7 @@ struct page *new_node_page(struct dnode_of_data *dn, f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); @@ -1202,7 +1221,7 @@ static int f2fs_write_node_page(struct page *page, mutex_lock(&sbi->node_write); set_page_writeback(page); write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); - set_node_addr(sbi, &ni, new_addr); + set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); mutex_unlock(&sbi->node_write); unlock_page(page); @@ -1503,7 +1522,7 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, block_t new_blkaddr) { rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); - set_node_addr(sbi, ni, new_blkaddr); + set_node_addr(sbi, ni, new_blkaddr, false); clear_node_page_dirty(page); } @@ -1559,7 +1578,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) f2fs_bug_on(ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode); - set_node_addr(sbi, &ni, NULL_ADDR); + set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: /* 2: allocate new xattr nid */ @@ -1569,12 +1588,12 @@ recover_xnid: remove_free_nid(NM_I(sbi), new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR); + set_node_addr(sbi, &ni, NEW_ADDR, false); F2FS_I(inode)->i_xattr_nid = new_xnid; /* 3: update xattr blkaddr */ refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr); + set_node_addr(sbi, &ni, blkaddr, false); update_inode_page(inode); return true; @@ -1612,7 +1631,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(!inc_valid_node_count(sbi, NULL))) WARN_ON(1); - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); f2fs_put_page(ipage, 1); return 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c2015b71ff87..5decc1a375f0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -42,6 +42,7 @@ struct node_info { struct nat_entry { struct list_head list; /* for clean or dirty nat list */ bool checkpointed; /* whether it is checkpointed or not */ + bool fsync_done; /* whether the latest node has fsync mark */ struct node_info ni; /* in-memory node information */ }; From 808a1d7490671a74ffa077cf92779c7b0c9f66da Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Mar 2014 22:21:08 +0900 Subject: [PATCH 52/62] f2fs: avoid RECLAIM_FS-ON-W warning This patch should resolve the following possible bug. RECLAIM_FS-ON-W at: mark_held_locks+0xb9/0x140 lockdep_trace_alloc+0x85/0xf0 __kmalloc+0x53/0x1d0 read_all_xattrs+0x3d1/0x3f0 [f2fs] f2fs_getxattr+0x4f/0x100 [f2fs] f2fs_get_acl+0x4c/0x290 [f2fs] get_acl+0x4f/0x80 posix_acl_create+0x72/0x180 f2fs_init_acl+0x29/0xcc [f2fs] __f2fs_add_link+0x259/0x710 [f2fs] f2fs_create+0xad/0x1c0 [f2fs] vfs_create+0xed/0x150 do_last+0xd36/0xed0 path_openat+0xc5/0x680 do_filp_open+0x43/0xa0 do_sys_open+0x13c/0x230 SyS_creat+0x1e/0x20 system_call_fastpath+0x16/0x1b Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- fs/f2fs/xattr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fa8da4cb8c4b..a28571528f24 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -174,7 +174,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) retval = f2fs_getxattr(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, retval); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 84191307a670..0121e4595ccd 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -275,7 +275,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_size = inline_xattr_size(inode); - txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); + txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) return NULL; From 3bb5e2c8fe2296ddd9d864dcfb5ee1b77135f3ec Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 1 Apr 2014 17:38:26 +0900 Subject: [PATCH 53/62] f2fs: return -EIO when node id is not matched During the cleaing of node segments, F2FS can get errored node blocks due to data race between node page lock and its valid bitmap operations. In that case, it needs to return an error to skip such the obsolete block copy. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index eced8d7bf502..065cd99cc723 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -958,7 +958,7 @@ repeat: goto got_it; lock_page(page); - if (unlikely(!PageUptodate(page))) { + if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -967,7 +967,6 @@ repeat: goto repeat; } got_it: - f2fs_bug_on(nid != nid_of_node(page)); mark_page_accessed(page); return page; } From df0f8dc0e154de13e3a54846f384b674dd557c85 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 22 Mar 2014 14:57:23 +0800 Subject: [PATCH 54/62] f2fs: avoid unnecessary bio submit when wait page writeback This patch introduce is_merged_page() to check whether current page is merged in f2fs bio cache. When page is not in cache, we can avoid submitting bio cache, resulting in having more chance to merge pages. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 28 +++++++++++++++++++++++++++- fs/f2fs/super.c | 4 ++-- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b0c923aef229..598bfa617a7e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -134,7 +134,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; - mutex_lock(&io->io_mutex); + down_write(&io->io_rwsem); /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { @@ -142,7 +142,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); - mutex_unlock(&io->io_mutex); + up_write(&io->io_rwsem); } /* @@ -180,7 +180,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, verify_block_addr(sbi, blk_addr); - mutex_lock(&io->io_mutex); + down_write(&io->io_rwsem); if (!is_read) inc_page_count(sbi, F2FS_WRITEBACK); @@ -204,7 +204,7 @@ alloc_new: io->last_block_in_bio = blk_addr; - mutex_unlock(&io->io_mutex); + up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f83433e4b043..1e3d869b60cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -394,7 +394,7 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ - struct mutex io_mutex; /* mutex for bio */ + struct rw_semaphore io_rwsem; /* blocking op for bio */ }; struct f2fs_sb_info { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e7ff23a536a4..570ab9a084c5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1046,12 +1046,38 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, mutex_unlock(&curseg->curseg_mutex); } +static inline bool is_merged_page(struct f2fs_sb_info *sbi, + struct page *page, enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct bio *bio = io->bio; + struct bio_vec *bvec; + int i; + + down_read(&io->io_rwsem); + if (!bio) + goto out; + + bio_for_each_segment_all(bvec, bio, i) { + if (page == bvec->bv_page) { + up_read(&io->io_rwsem); + return true; + } + } + +out: + up_read(&io->io_rwsem); + return false; +} + void f2fs_wait_on_page_writeback(struct page *page, enum page_type type) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); if (PageWriteback(page)) { - f2fs_submit_merged_bio(sbi, type, WRITE); + if (is_merged_page(sbi, page, type)) + f2fs_submit_merged_bio(sbi, type, WRITE); wait_on_page_writeback(page); } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 89ea046c846d..959834066d60 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -920,11 +920,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); - mutex_init(&sbi->read_io.io_mutex); + init_rwsem(&sbi->read_io.io_rwsem); sbi->read_io.sbi = sbi; sbi->read_io.bio = NULL; for (i = 0; i < NR_PAGE_TYPE; i++) { - mutex_init(&sbi->write_io[i].io_mutex); + init_rwsem(&sbi->write_io[i].io_rwsem); sbi->write_io[i].sbi = sbi; sbi->write_io[i].bio = NULL; } From 6e452d69d421e10e99446c6de16a19604149c40f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 22 Mar 2014 14:59:45 +0800 Subject: [PATCH 55/62] f2fs: avoid unneeded lookup when xattr name length is too long In f2fs_setxattr we have limit this attribute name length, so we should also check it in f2fs_getxattr to avoid useless lookup caused by invalid name length. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 0121e4595ccd..503c2451131e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -407,6 +407,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; name_len = strlen(name); + if (name_len > F2FS_NAME_LEN) + return -ERANGE; base_addr = read_all_xattrs(inode, NULL); if (!base_addr) From cf0ee0f09bc09f54b9852dda1088b9cdcd4f8683 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Apr 2014 08:55:00 +0800 Subject: [PATCH 56/62] f2fs: avoid free slab cache under spinlock Move kmem_cache_free out of spinlock protection region for better performance. Change log from v1: o remove spinlock protection for kmem_cache_free in destroy_node_manager suggested by Jaegeuk Kim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 27 +++++++++++++++++---------- fs/f2fs/node.c | 17 ++++++++++++++++- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a80be5121e8a..d877f46c75ed 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -347,10 +347,11 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { list_del(&orphan->list); - kmem_cache_free(orphan_entry_slab, orphan); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - break; + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, orphan); + return; } } spin_unlock(&sbi->orphan_inode_lock); @@ -577,6 +578,7 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct dir_inode_entry *new; + int ret = 0; if (!S_ISDIR(inode->i_mode)) return; @@ -586,12 +588,13 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); - + ret = __add_dirty_inode(inode, new); inode_inc_dirty_dents(inode); SetPagePrivate(page); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void add_dirty_dir_inode(struct inode *inode) @@ -599,20 +602,22 @@ void add_dirty_dir_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct dir_inode_entry *new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + int ret = 0; new->inode = inode; INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); + ret = __add_dirty_inode(inode, new); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *this, *head; if (!S_ISDIR(inode->i_mode)) @@ -630,13 +635,15 @@ void remove_dirty_dir_inode(struct inode *inode) entry = list_entry(this, struct dir_inode_entry, list); if (entry->inode == inode) { list_del(&entry->list); - kmem_cache_free(inode_entry_slab, entry); stat_dec_dirty_dir(sbi); - break; + spin_unlock(&sbi->dir_inode_lock); + kmem_cache_free(inode_entry_slab, entry); + goto done; } } spin_unlock(&sbi->dir_inode_lock); +done: /* Only from the recovery routine */ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 065cd99cc723..4b27e36e40fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1313,7 +1313,6 @@ static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, { list_del(&i->list); radix_tree_delete(&nm_i->free_nid_root, i->nid); - kmem_cache_free(free_nid_slab, i); } static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) @@ -1360,13 +1359,19 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) { struct free_nid *i; + bool need_free = false; + spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + need_free = true; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } static void scan_nat_page(struct f2fs_nm_info *nm_i, @@ -1491,6 +1496,8 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(!i || i->state != NID_ALLOC); __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); + + kmem_cache_free(free_nid_slab, i); } /* @@ -1500,6 +1507,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; + bool need_free = false; if (!nid) return; @@ -1509,11 +1517,15 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(!i || i->state != NID_ALLOC); if (!available_free_memory(nm_i, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); + need_free = true; } else { i->state = NID_NEW; nm_i->fcnt++; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1925,6 +1937,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) f2fs_bug_on(i->state == NID_ALLOC); __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->free_nid_list_lock); } f2fs_bug_on(nm_i->fcnt); spin_unlock(&nm_i->free_nid_list_lock); From 2d7b822ad9daf0ea903accacaa89340ddd3f201f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Mar 2014 11:33:17 +0800 Subject: [PATCH 57/62] f2fs: use list_for_each_entry{_safe} for simplyfying code This patch use list_for_each_entry{_safe} instead of list_for_each{_safe} for simplfying code. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 37 ++++++++++++++----------------------- fs/f2fs/node.c | 16 ++++++---------- fs/f2fs/recovery.c | 6 ++---- fs/f2fs/segment.c | 6 ++---- 4 files changed, 24 insertions(+), 41 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d877f46c75ed..4aa521aa9bc3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -308,16 +308,15 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head, *this; - struct orphan_inode_entry *new = NULL, *orphan = NULL; + struct list_head *head; + struct orphan_inode_entry *new, *orphan; new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); new->ino = ino; spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; - list_for_each(this, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); + list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { spin_unlock(&sbi->orphan_inode_lock); kmem_cache_free(orphan_entry_slab, new); @@ -326,14 +325,10 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (orphan->ino > ino) break; - orphan = NULL; } - /* add new_oentry into list which is sorted by inode number */ - if (orphan) - list_add(&new->list, this->prev); - else - list_add_tail(&new->list, head); + /* add new orphan entry into list which is sorted by inode number */ + list_add_tail(&new->list, &orphan->list); spin_unlock(&sbi->orphan_inode_lock); } @@ -561,14 +556,12 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + struct dir_inode_entry *entry; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) if (unlikely(entry->inode == inode)) return -EEXIST; - } + list_add_tail(&new->list, head); stat_inc_dirty_dir(sbi); return 0; @@ -618,7 +611,8 @@ void add_dirty_dir_inode(struct inode *inode) void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *this, *head; + struct list_head *head; + struct dir_inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; @@ -630,9 +624,7 @@ void remove_dirty_dir_inode(struct inode *inode) } head = &sbi->dir_inode_list; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) { if (entry->inode == inode) { list_del(&entry->list); stat_dec_dirty_dir(sbi); @@ -654,15 +646,14 @@ done: struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *this, *head; + struct list_head *head; struct inode *inode = NULL; + struct dir_inode_entry *entry; spin_lock(&sbi->dir_inode_lock); head = &sbi->dir_inode_list; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) { if (entry->inode->i_ino == ino) { inode = entry->inode; break; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4b27e36e40fc..a161e955c4c8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1451,7 +1451,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; - struct list_head *this; retry: if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) return false; @@ -1461,11 +1460,9 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { f2fs_bug_on(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); + list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) break; - } f2fs_bug_on(i->state != NID_NEW); *nid = i->nid; @@ -1780,7 +1777,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct list_head *cur, *n; + struct nat_entry *ne, *cur; struct page *page = NULL; struct f2fs_nat_block *nat_blk = NULL; nid_t start_nid = 0, end_nid = 0; @@ -1792,18 +1789,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) mutex_lock(&curseg->curseg_mutex); /* 1) flush dirty nat caches */ - list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { - struct nat_entry *ne; + list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { nid_t nid; struct f2fs_nat_entry raw_ne; int offset = -1; block_t new_blkaddr; - ne = list_entry(cur, struct nat_entry, list); - nid = nat_get_nid(ne); - if (nat_get_blkaddr(ne) == NEW_ADDR) continue; + + nid = nat_get_nid(ne); + if (flushed) goto to_nat_page; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index bbef4ed157a7..b1ae89f0f44e 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -27,14 +27,12 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi) static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, nid_t ino) { - struct list_head *this; struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + list_for_each_entry(entry, head, list) if (entry->inode->i_ino == ino) return entry; - } + return NULL; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 570ab9a084c5..cb49e6390ffa 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -340,8 +340,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->discard_list); - struct list_head *this, *next; - struct discard_entry *entry; + struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); @@ -370,8 +369,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ - list_for_each_safe(this, next, head) { - entry = list_entry(this, struct discard_entry, list); + list_for_each_entry_safe(entry, this, head, list) { f2fs_issue_discard(sbi, entry->blkaddr, entry->len); list_del(&entry->list); SM_I(sbi)->nr_discards -= entry->len; From d54c795b495b9bd417e482286c832c9a8eb210ae Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Mar 2014 15:30:40 +0800 Subject: [PATCH 58/62] f2fs: fix error path when fail to read inline data We should unlock page in ->readpage() path and also should unlock & release page in error path of ->write_begin() to avoid deadlock or memory leak. So let's add release code to fix the problem when we fail to read inline data. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 ++++++++++---- fs/f2fs/inline.c | 4 +++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 598bfa617a7e..45abd60e2bff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -942,13 +942,19 @@ inline_data: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_read_inline_data(inode, page); - else + if (err) { + page_cache_release(page); + return err; + } + } else { err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) - return err; + if (err) + return err; + } + lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 31ee5b164ff9..383db1fabcf4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -45,8 +45,10 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + unlock_page(page); return PTR_ERR(ipage); + } zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); From ce23447fe5764391025a67c20c97eaf5c6ac1ec3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 2 Apr 2014 09:04:42 +0900 Subject: [PATCH 59/62] f2fs: fix to cover io->bio with io_rwsem In the f2fs_wait_on_page_writeback, io->bio should be covered by io_rwsem. Otherwise, the bio pointer can become a dangling pointer due to data races. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cb49e6390ffa..f799c6a34c39 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1049,15 +1049,14 @@ static inline bool is_merged_page(struct f2fs_sb_info *sbi, { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = &sbi->write_io[btype]; - struct bio *bio = io->bio; struct bio_vec *bvec; int i; down_read(&io->io_rwsem); - if (!bio) + if (!io->bio) goto out; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, io->bio, i) { if (page == bvec->bv_page) { up_read(&io->io_rwsem); return true; From 6b4afdd794783fe515b50838aa36591e3feea990 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 2 Apr 2014 15:34:36 +0900 Subject: [PATCH 60/62] f2fs: introduce f2fs_issue_flush to avoid redundant flush issue Some storage devices show relatively high latencies to complete cache_flush commands, even though their normal IO speed is prettry much high. In such the case, it needs to merge cache_flush commands as much as possible to avoid issuing them redundantly. So, this patch introduces a mount option, "-o flush_merge", to mitigate such the overhead. If this option is enabled by user, F2FS merges the cache_flush commands and then issues just one cache_flush on behalf of them. Once the single command is finished, F2FS sends a completion signal to all the pending threads. Note that, this option can be used under a workload consisting of very intensive concurrent fsync calls, while the storage handles cache_flush commands slowly. Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 4 ++ fs/f2fs/f2fs.h | 16 ++++++ fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 89 ++++++++++++++++++++++++++++++ fs/f2fs/super.c | 7 +++ 5 files changed, 117 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 2f6d0218dd22..25311e113e75 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -122,6 +122,10 @@ disable_ext_identify Disable the extension list configured by mkfs, so f2fs inline_xattr Enable the inline xattrs feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. +flush_merge Merge concurrent cache_flush commands as much as possible + to eliminate redundant command issues. If the underlying + device handles the cache_flush command relatively slowly, + recommend to enable this option. ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1e3d869b60cd..2ecac8312359 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -40,6 +40,7 @@ #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define F2FS_MOUNT_INLINE_DATA 0x00000100 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -316,6 +317,12 @@ enum { NO_CHECK_TYPE }; +struct flush_cmd { + struct flush_cmd *next; + struct completion wait; + int ret; +}; + struct f2fs_sm_info { struct sit_info *sit_info; /* whole segment information */ struct free_segmap_info *free_info; /* free segment information */ @@ -344,6 +351,14 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ + + /* for flush command control */ + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + struct flush_cmd *issue_list; /* list for command issue */ + struct flush_cmd *dispatch_list; /* list for command dispatch */ + spinlock_t issue_lock; /* for issue list lock */ + struct flush_cmd *issue_tail; /* list tail of issue list */ }; /* @@ -1160,6 +1175,7 @@ void destroy_node_manager_caches(void); */ void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); +int f2fs_issue_flush(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6ba26680c468..302d552afea5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -186,7 +186,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_on_node_pages_writeback(sbi, inode->i_ino); if (ret) goto out; - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); } out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f799c6a34c39..085f548be7a3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -24,6 +25,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *flush_cmd_slab; /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since @@ -195,6 +197,73 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) f2fs_sync_fs(sbi->sb, true); } +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct f2fs_sm_info *sm_i = SM_I(sbi); + wait_queue_head_t *q = &sm_i->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + spin_lock(&sm_i->issue_lock); + if (sm_i->issue_list) { + sm_i->dispatch_list = sm_i->issue_list; + sm_i->issue_list = sm_i->issue_tail = NULL; + } + spin_unlock(&sm_i->issue_lock); + + if (sm_i->dispatch_list) { + struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct flush_cmd *cmd, *next; + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + + for (cmd = sm_i->dispatch_list; cmd; cmd = next) { + cmd->ret = ret; + next = cmd->next; + complete(&cmd->wait); + } + sm_i->dispatch_list = NULL; + } + + wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + struct flush_cmd *cmd; + int ret; + + if (!test_opt(sbi, FLUSH_MERGE)) + return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + + cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC); + cmd->next = NULL; + cmd->ret = 0; + init_completion(&cmd->wait); + + spin_lock(&sm_i->issue_lock); + if (sm_i->issue_list) + sm_i->issue_tail->next = cmd; + else + sm_i->issue_list = cmd; + sm_i->issue_tail = cmd; + spin_unlock(&sm_i->issue_lock); + + if (!sm_i->dispatch_list) + wake_up(&sm_i->flush_wait_queue); + + wait_for_completion(&cmd->wait); + ret = cmd->ret; + kmem_cache_free(flush_cmd_slab, cmd); + return ret; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -1763,6 +1832,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + dev_t dev = sbi->sb->s_bdev->bd_dev; struct f2fs_sm_info *sm_info; int err; @@ -1790,6 +1860,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->nr_discards = 0; sm_info->max_discards = 0; + if (test_opt(sbi, FLUSH_MERGE)) { + spin_lock_init(&sm_info->issue_lock); + init_waitqueue_head(&sm_info->flush_wait_queue); + + sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(sm_info->f2fs_issue_flush)) + return PTR_ERR(sm_info->f2fs_issue_flush); + } + err = build_sit_info(sbi); if (err) return err; @@ -1898,6 +1978,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; + if (sm_info->f2fs_issue_flush) + kthread_stop(sm_info->f2fs_issue_flush); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -1912,10 +1994,17 @@ int __init create_segment_manager_caches(void) sizeof(struct discard_entry)); if (!discard_entry_slab) return -ENOMEM; + flush_cmd_slab = f2fs_kmem_cache_create("flush_command", + sizeof(struct flush_cmd)); + if (!flush_cmd_slab) { + kmem_cache_destroy(discard_entry_slab); + return -ENOMEM; + } return 0; } void destroy_segment_manager_caches(void) { kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(flush_cmd_slab); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 959834066d60..d31b767fde73 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -51,6 +51,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_inline_data, + Opt_flush_merge, Opt_err, }; @@ -67,6 +68,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, + {Opt_flush_merge, "flush_merge"}, {Opt_err, NULL}, }; @@ -334,6 +336,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_data: set_opt(sbi, INLINE_DATA); break; + case Opt_flush_merge: + set_opt(sbi, FLUSH_MERGE); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -537,6 +542,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",disable_ext_identify"); if (test_opt(sbi, INLINE_DATA)) seq_puts(seq, ",inline_data"); + if (test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; From 3a8861e2715e3b985bfaac43bcdfcfebe9b423cb Mon Sep 17 00:00:00 2001 From: ZhangZhen Date: Fri, 4 Apr 2014 09:47:16 +0800 Subject: [PATCH 61/62] f2fs: check the acl's validity before setting Before setting the acl, call posix_acl_valid() to check if it is valid or not. Signed-off-by: zhangzhen Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a28571528f24..e93e4ec7d165 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -203,6 +203,12 @@ static int __f2fs_set_acl(struct inode *inode, int type, size_t size = 0; int error; + if (acl) { + error = posix_acl_valid(acl); + if (error < 0) + return error; + } + switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; From 48b230a583965d33c32b4e3c29a1e5e15d7e55de Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Apr 2014 11:18:34 +0800 Subject: [PATCH 62/62] f2fs: fix wrong statistics of inline data If we remove a file that has inline data after mount, our statistics turns to inaccurate. cat /sys/kernel/debug/f2fs/status - Inline_data Inode: 4294967295 Let's add stat_inc_inline_inode() to stat inline info of the file when lookup. Change log from v1: o stat in f2fs_lookup() instead of in do_read_inode() for excluding wrong stat. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 0cea87437a60..a9409d19dfd4 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -207,6 +207,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, inode = f2fs_iget(dir->i_sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + + stat_inc_inline_inode(inode); } return d_splice_alias(inode, dentry);