From 8018ec083c72443cc74fd2d08eb7c5dddc13af53 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 9 Sep 2014 11:44:46 +1000 Subject: [PATCH 01/56] xfs: mark all internal workqueues as freezable Workqueues must be explicitly set as freezable to ensure they are frozen in the assocated part of the hibernation/suspend sequence. Freezing of workqueues and kernel threads is important to ensure that modifications are not made on-disk after the hibernation image has been created. Otherwise, the in-memory state can become inconsistent with what is on disk and eventually lead to filesystem corruption. We have reports of free space btree corruptions that occur immediately after restore from hibernate that suggest the xfs-eofblocks workqueue could be causing such problems if it races with hibernation. Mark all of the internal XFS workqueues as freezable to ensure nothing changes on-disk once the freezer infrastructure freezes kernel threads and creates the hibernation image. Signed-off-by: Brian Foster Reported-by: Carlos E. R. Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_mru_cache.c | 3 ++- fs/xfs/xfs_super.c | 15 ++++++++------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cd7b8ca9b064..ec6505056b2c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1884,7 +1884,7 @@ xfs_buf_init(void) goto out; xfslogd_workqueue = alloc_workqueue("xfslogd", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1); if (!xfslogd_workqueue) goto out_free_buf_zone; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 1eb6f3df698c..30ecca3037e3 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -304,7 +304,8 @@ _xfs_mru_cache_reap( int xfs_mru_cache_init(void) { - xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); + xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b194652033cd..bc9ec44cae45 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -838,32 +838,32 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", - WQ_MEM_RECLAIM, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_data_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - WQ_MEM_RECLAIM, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_unwritten_workqueue) goto out_destroy_data_iodone_queue; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - WQ_MEM_RECLAIM, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - 0, 0, mp->m_fsname); + WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - 0, 0, mp->m_fsname); + WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_log_workqueue) goto out_destroy_reclaim; mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - 0, 0, mp->m_fsname); + WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_eofblocks_workqueue) goto out_destroy_log; @@ -1715,7 +1715,8 @@ xfs_init_workqueues(void) * AGs in all the filesystems mounted. Hence use the default large * max_active value for this workqueue. */ - xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); + xfs_alloc_wq = alloc_workqueue("xfsalloc", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0); if (!xfs_alloc_wq) return -ENOMEM; From e1b05723ed834090caab56866adc05bce31c9bdd Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Sep 2014 11:47:24 +1000 Subject: [PATCH 02/56] xfs: add a few more verifier tests These were exposed by fsfuzzer runs; without them we fail in various exciting and sometimes convoluted ways when we encounter disk corruption. Without the MAXLEVELS tests we tend to walk off the end of an array in a loop like this: for (i = 0; i < cur->bc_nlevels; i++) { if (cur->bc_bufs[i]) Without the dirblklog test we try to allocate more memory than we could possibly hope for and loop forever: xfs_dabuf_map() nfsb = mp->m_dir_geo->fsbcount; irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP... As for the logbsize check, that's the convoluted one. If logbsize is specified at mount time, it's sanitized in xfs_parseargs; in particular it makes sure that it's not > XLOG_MAX_RECORD_BSIZE. If not specified at mount time, it comes from the superblock via sb_logsunit; this is limited to 256k at mkfs time as well; it's copied into m_logbsize in xfs_finish_flags(). However, if for some reason the on-disk value is corrupt and too large, nothing catches it. It's a circuitous path, but that size eventually finds its way to places that make the kernel very unhappy, leading to oopses in xlog_pack_data() because we use the size as an index into iclog->ic_data, but the array is not necessarily that big. Anyway - bounds checking when we read from disk is a good thing! Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 4 ++++ fs/xfs/libxfs/xfs_ialloc.c | 2 ++ fs/xfs/libxfs/xfs_sb.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 4bffffe038a1..eff34218f405 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2209,6 +2209,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) return false; + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) + return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b62771f1f4b5..d213a2eae95e 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2051,6 +2051,8 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return false; + if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + return false; /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ad525a5623a4..8426e5e2682e 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -279,11 +279,13 @@ xfs_mount_validate_sb( sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_blocksize != (1 << sbp->sb_blocklog) || + sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || From 65b65735fede29b516fed1d8c2391e8bc373b805 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 9 Sep 2014 11:52:42 +1000 Subject: [PATCH 03/56] xfs: add debug sysfs attribute set Create a top-level debug directory for global debug sysfs attributes. This directory is added and removed on XFS module initialization and removal respectively for DEBUG mode kernels only. It typically resides at /sys/fs/xfs/debug. It is located at the top level of the xfs sysfs hierarchy as attributes might define global behavior or behavior that must be configured before an xfs mount is available (e.g., log recovery behavior). Define the global debug kobject that represents the debug sysfs directory and add generic attribute show/store helpers to support future attributes. No debug attributes are exported as of yet. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_super.c | 23 +++++++++++++++++++++-- fs/xfs/xfs_sysfs.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_sysfs.h | 1 + 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bc9ec44cae45..dcd4b93dccdc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,7 @@ #include "xfs_dinode.h" #include "xfs_filestream.h" #include "xfs_quota.h" +#include "xfs_sysfs.h" #include #include @@ -61,7 +62,11 @@ static const struct super_operations xfs_super_operations; static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; -struct kset *xfs_kset; + +struct kset *xfs_kset; /* top-level xfs sysfs dir */ +#ifdef DEBUG +static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ +#endif #define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ #define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ @@ -1769,9 +1774,16 @@ init_xfs_fs(void) goto out_sysctl_unregister;; } - error = xfs_qm_init(); +#ifdef DEBUG + xfs_dbg_kobj.kobject.kset = xfs_kset; + error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); if (error) goto out_kset_unregister; +#endif + + error = xfs_qm_init(); + if (error) + goto out_remove_kobj; error = register_filesystem(&xfs_fs_type); if (error) @@ -1780,7 +1792,11 @@ init_xfs_fs(void) out_qm_exit: xfs_qm_exit(); + out_remove_kobj: +#ifdef DEBUG + xfs_sysfs_del(&xfs_dbg_kobj); out_kset_unregister: +#endif kset_unregister(xfs_kset); out_sysctl_unregister: xfs_sysctl_unregister(); @@ -1803,6 +1819,9 @@ exit_xfs_fs(void) { xfs_qm_exit(); unregister_filesystem(&xfs_fs_type); +#ifdef DEBUG + xfs_sysfs_del(&xfs_dbg_kobj); +#endif kset_unregister(xfs_kset); xfs_sysctl_unregister(); xfs_cleanup_procfs(); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 9835139ce1ec..32ddf0c8c50e 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -51,6 +51,49 @@ struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, }; +#ifdef DEBUG +/* debug */ + +static struct attribute *xfs_dbg_attrs[] = { + NULL, +}; + +STATIC ssize_t +xfs_dbg_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0; +} + +STATIC ssize_t +xfs_dbg_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0; +} + +static struct sysfs_ops xfs_dbg_ops = { + .show = xfs_dbg_show, + .store = xfs_dbg_store, +}; + +struct kobj_type xfs_dbg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_dbg_ops, + .default_attrs = xfs_dbg_attrs, +}; + +#endif /* DEBUG */ + /* xlog */ STATIC ssize_t diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 54a2091183c0..240eee35f342 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -20,6 +20,7 @@ #define __XFS_SYSFS_H__ extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ +extern struct kobj_type xfs_dbg_ktype; /* debug */ extern struct kobj_type xfs_log_ktype; /* xlog */ static inline struct xfs_kobj * From 2e2271787419a12496bf5da5c3028a9c73c9697f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 9 Sep 2014 11:56:13 +1000 Subject: [PATCH 04/56] xfs: export log_recovery_delay to delay mount time log recovery XFS log recovery has been discovered to have race conditions with buffers when I/O errors occur. External tools are available to simulate I/O errors to XFS, but this alone is not sufficient for testing log recovery. XFS unconditionally resets the inactive region of the log prior to log recovery to avoid confusion over processing any partially written log records that might have been written before an unclean shutdown. Therefore, unconditional write I/O failures at mount time are caught by the reset sequence rather than log recovery and hinder the ability to test the latter. The device-mapper dm-flakey module uses an up/down timer to define a cycle for when to fail I/Os. Create a pre log recovery delay tunable that can be used to coordinate XFS log recovery with I/O errors simulated by dm-flakey. This facilitates coordination in userspace that allows the reset of stale log blocks to succeed and writes due to log recovery to fail. For example, define a dm-flakey instance with an uptime long enough to allow log reset to succeed and a log recovery delay long enough to allow the dm-flakey uptime to expire. The 'log_recovery_delay' sysfs tunable is exported under /sys/fs/xfs/debug and is only enabled for kernels compiled in XFS debug mode. The value is exported in units of seconds and allows for a delay of up to 60 seconds. Note that this is for XFS debug and test instrumentation purposes only and should not be used by applications. No delay is enabled by default. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_globals.c | 4 ++++ fs/xfs/xfs_log_recover.c | 12 ++++++++++++ fs/xfs/xfs_sysctl.h | 5 +++++ fs/xfs/xfs_sysfs.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 5399ef222dd7..4d41b241298f 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -43,3 +43,7 @@ xfs_param_t xfs_params = { .fstrm_timer = { 1, 30*100, 3600*100}, .eofb_timer = { 1, 300, 3600*24}, }; + +struct xfs_globals xfs_globals = { + .log_recovery_delay = 0, /* no delay by default */ +}; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1fd5787add99..176c4b3609ab 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4509,6 +4509,18 @@ xlog_recover( return -EINVAL; } + /* + * Delay log recovery if the debug hook is set. This is debug + * instrumention to coordinate simulation of I/O failures with + * log recovery. + */ + if (xfs_globals.log_recovery_delay) { + xfs_notice(log->l_mp, + "Delaying log recovery for %d seconds.", + xfs_globals.log_recovery_delay); + msleep(xfs_globals.log_recovery_delay * 1000); + } + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", log->l_mp->m_logname ? log->l_mp->m_logname : "internal"); diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index bd8e157c20ef..ffef45375754 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -92,6 +92,11 @@ enum { extern xfs_param_t xfs_params; +struct xfs_globals { + int log_recovery_delay; /* log recovery delay (secs) */ +}; +extern struct xfs_globals xfs_globals; + #ifdef CONFIG_SYSCTL extern int xfs_sysctl_register(void); extern void xfs_sysctl_unregister(void); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 32ddf0c8c50e..aa03670851d8 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -54,7 +54,38 @@ struct kobj_type xfs_mp_ktype = { #ifdef DEBUG /* debug */ +STATIC ssize_t +log_recovery_delay_store( + const char *buf, + size_t count, + void *data) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 60) + return -EINVAL; + + xfs_globals.log_recovery_delay = val; + + return count; +} + +STATIC ssize_t +log_recovery_delay_show( + char *buf, + void *data) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay); +} +XFS_SYSFS_ATTR_RW(log_recovery_delay); + static struct attribute *xfs_dbg_attrs[] = { + ATTR_LIST(log_recovery_delay), NULL, }; From 49c69591c80648c14ff87525e97ee6ebe3a343cb Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Sep 2014 11:56:48 +1000 Subject: [PATCH 05/56] xfs: combine xfs_seek_hole & xfs_seek_data xfs_seek_hole & xfs_seek_data are remarkably similar; so much so that they can be combined, saving a fair bit of semi-complex code duplication. The following patch passes generic/285 and generic/286, which specifically test seek behavior. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 182 ++++++++++++++-------------------------------- 1 file changed, 55 insertions(+), 127 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 076b1708d134..1da3b7db5bf7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -964,7 +964,7 @@ xfs_vm_page_mkwrite( /* * This type is designed to indicate the type of offset we would like - * to search from page cache for either xfs_seek_data() or xfs_seek_hole(). + * to search from page cache for xfs_seek_hole_data(). */ enum { HOLE_OFF = 0, @@ -1021,7 +1021,7 @@ xfs_lookup_buffer_offset( /* * This routine is called to find out and return a data or hole offset * from the page cache for unwritten extents according to the desired - * type for xfs_seek_data() or xfs_seek_hole(). + * type for xfs_seek_hole_data(). * * The argument offset is used to tell where we start to search from the * page cache. Map is used to figure out the end points of the range to @@ -1181,110 +1181,10 @@ out: } STATIC loff_t -xfs_seek_data( +xfs_seek_hole_data( struct file *file, - loff_t start) -{ - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - loff_t uninitialized_var(offset); - xfs_fsize_t isize; - xfs_fileoff_t fsbno; - xfs_filblks_t end; - uint lock; - int error; - - lock = xfs_ilock_data_map_shared(ip); - - isize = i_size_read(inode); - if (start >= isize) { - error = -ENXIO; - goto out_unlock; - } - - /* - * Try to read extents from the first block indicated - * by fsbno to the end block of the file. - */ - fsbno = XFS_B_TO_FSBT(mp, start); - end = XFS_B_TO_FSB(mp, isize); - for (;;) { - struct xfs_bmbt_irec map[2]; - int nmap = 2; - unsigned int i; - - error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, - XFS_BMAPI_ENTIRE); - if (error) - goto out_unlock; - - /* No extents at given offset, must be beyond EOF */ - if (nmap == 0) { - error = -ENXIO; - goto out_unlock; - } - - for (i = 0; i < nmap; i++) { - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[i].br_startoff)); - - /* Landed in a data extent */ - if (map[i].br_startblock == DELAYSTARTBLOCK || - (map[i].br_state == XFS_EXT_NORM && - !isnullstartblock(map[i].br_startblock))) - goto out; - - /* - * Landed in an unwritten extent, try to search data - * from page cache. - */ - if (map[i].br_state == XFS_EXT_UNWRITTEN) { - if (xfs_find_get_desired_pgoff(inode, &map[i], - DATA_OFF, &offset)) - goto out; - } - } - - /* - * map[0] is hole or its an unwritten extent but - * without data in page cache. Probably means that - * we are reading after EOF if nothing in map[1]. - */ - if (nmap == 1) { - error = -ENXIO; - goto out_unlock; - } - - ASSERT(i > 1); - - /* - * Nothing was found, proceed to the next round of search - * if reading offset not beyond or hit EOF. - */ - fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; - start = XFS_FSB_TO_B(mp, fsbno); - if (start >= isize) { - error = -ENXIO; - goto out_unlock; - } - } - -out: - offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); - -out_unlock: - xfs_iunlock(ip, lock); - - if (error) - return error; - return offset; -} - -STATIC loff_t -xfs_seek_hole( - struct file *file, - loff_t start) + loff_t start, + int whence) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); @@ -1307,6 +1207,10 @@ xfs_seek_hole( goto out_unlock; } + /* + * Try to read extents from the first block indicated + * by fsbno to the end block of the file. + */ fsbno = XFS_B_TO_FSBT(mp, start); end = XFS_B_TO_FSB(mp, isize); @@ -1330,55 +1234,80 @@ xfs_seek_hole( offset = max_t(loff_t, start, XFS_FSB_TO_B(mp, map[i].br_startoff)); - /* Landed in a hole */ - if (map[i].br_startblock == HOLESTARTBLOCK) + /* Landed in the hole we wanted? */ + if (whence == SEEK_HOLE && + map[i].br_startblock == HOLESTARTBLOCK) + goto out; + + /* Landed in the data extent we wanted? */ + if (whence == SEEK_DATA && + (map[i].br_startblock == DELAYSTARTBLOCK || + (map[i].br_state == XFS_EXT_NORM && + !isnullstartblock(map[i].br_startblock)))) goto out; /* - * Landed in an unwritten extent, try to search hole - * from page cache. + * Landed in an unwritten extent, try to search + * for hole or data from page cache. */ if (map[i].br_state == XFS_EXT_UNWRITTEN) { if (xfs_find_get_desired_pgoff(inode, &map[i], - HOLE_OFF, &offset)) + whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF, + &offset)) goto out; } } /* - * map[0] contains data or its unwritten but contains - * data in page cache, probably means that we are - * reading after EOF. We should fix offset to point - * to the end of the file(i.e., there is an implicit - * hole at the end of any file). + * We only received one extent out of the two requested. This + * means we've hit EOF and didn't find what we are looking for. */ if (nmap == 1) { - offset = isize; - break; + /* + * If we were looking for a hole, set offset to + * the end of the file (i.e., there is an implicit + * hole at the end of any file). + */ + if (whence == SEEK_HOLE) { + offset = isize; + break; + } + /* + * If we were looking for data, it's nowhere to be found + */ + ASSERT(whence == SEEK_DATA); + error = -ENXIO; + goto out_unlock; } ASSERT(i > 1); /* - * Both mappings contains data, proceed to the next round of - * search if the current reading offset not beyond or hit EOF. + * Nothing was found, proceed to the next round of search + * if the next reading offset is not at or beyond EOF. */ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; start = XFS_FSB_TO_B(mp, fsbno); if (start >= isize) { - offset = isize; - break; + if (whence == SEEK_HOLE) { + offset = isize; + break; + } + ASSERT(whence == SEEK_DATA); + error = -ENXIO; + goto out_unlock; } } out: /* - * At this point, we must have found a hole. However, the returned + * If at this point we have found the hole we wanted, the returned * offset may be bigger than the file size as it may be aligned to - * page boundary for unwritten extents, we need to deal with this + * page boundary for unwritten extents. We need to deal with this * situation in particular. */ - offset = min_t(loff_t, offset, isize); + if (whence == SEEK_HOLE) + offset = min_t(loff_t, offset, isize); offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: @@ -1400,10 +1329,9 @@ xfs_file_llseek( case SEEK_CUR: case SEEK_SET: return generic_file_llseek(file, offset, origin); - case SEEK_DATA: - return xfs_seek_data(file, offset); case SEEK_HOLE: - return xfs_seek_hole(file, offset); + case SEEK_DATA: + return xfs_seek_hole_data(file, offset, origin); default: return -EINVAL; } From 59f9c004320704179913fa7c57645017ccf1b5c3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Sep 2014 11:57:10 +1000 Subject: [PATCH 06/56] xfs: lseek: the "whence" argument is called "whence" For some reason, the older commit: 965c8e5 lseek: the "whence" argument is called "whence" lseek: the "whence" argument is called "whence" But the kernel decided to call it "origin" instead. Fix most of the sites. left out xfs. So fix xfs. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1da3b7db5bf7..0fe36e4d5cef 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1322,16 +1322,16 @@ STATIC loff_t xfs_file_llseek( struct file *file, loff_t offset, - int origin) + int whence) { - switch (origin) { + switch (whence) { case SEEK_END: case SEEK_CUR: case SEEK_SET: - return generic_file_llseek(file, offset, origin); + return generic_file_llseek(file, offset, whence); case SEEK_HOLE: case SEEK_DATA: - return xfs_seek_hole_data(file, offset, origin); + return xfs_seek_hole_data(file, offset, whence); default: return -EINVAL; } From 970fd3f04d5949a4b5f6d0a5fea8e4b6797a5992 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Sep 2014 11:57:29 +1000 Subject: [PATCH 07/56] xfs: deduplicate xlog_do_recovery_pass() In xlog_do_recovery_pass(), there are 2 distinct cases: non-wrapped and wrapped log recovery. If we find a wrapped log, we recover around the end of the log, and then handle the rest of recovery exactly as in the non-wrapped case - using exactly the same (duplicated) code. Rather than having the same code in both cases, we can get the wrapped portion out of the way first if needed, and then recover the non-wrapped portion of the log. There should be no functional change here, just code reorganization & deduplication. The patch looks a bit bigger than it really is; the last hunk is whitespace changes (un-indenting). Tested with xfstests "check -g log" on a stock configuration. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 81 ++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 176c4b3609ab..29e101fc32c5 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4132,41 +4132,13 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - if (tail_blk <= head_blk) { - for (blk_no = tail_blk; blk_no < head_blk; ) { - error = xlog_bread(log, blk_no, hblks, hbp, &offset); - if (error) - goto bread_err2; - - rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, blk_no); - if (error) - goto bread_err2; - - /* blocks in data section */ - bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); - error = xlog_bread(log, blk_no + hblks, bblks, dbp, - &offset); - if (error) - goto bread_err2; - - error = xlog_unpack_data(rhead, offset, log); - if (error) - goto bread_err2; - - error = xlog_recover_process_data(log, - rhash, rhead, offset, pass); - if (error) - goto bread_err2; - blk_no += bblks + hblks; - } - } else { + blk_no = tail_blk; + if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. * When the head is not on the same cycle number as the tail, - * we can't do a sequential recovery as above. + * we can't do a sequential recovery. */ - blk_no = tail_blk; while (blk_no < log->l_logBBsize) { /* * Check for header wrapping around physical end-of-log @@ -4280,34 +4252,35 @@ xlog_do_recovery_pass( ASSERT(blk_no >= log->l_logBBsize); blk_no -= log->l_logBBsize; + } - /* read first part of physical log */ - while (blk_no < head_blk) { - error = xlog_bread(log, blk_no, hblks, hbp, &offset); - if (error) - goto bread_err2; + /* read first part of physical log */ + while (blk_no < head_blk) { + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) + goto bread_err2; - rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, blk_no); - if (error) - goto bread_err2; + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, blk_no); + if (error) + goto bread_err2; - bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); - error = xlog_bread(log, blk_no+hblks, bblks, dbp, - &offset); - if (error) - goto bread_err2; + /* blocks in data section */ + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + error = xlog_bread(log, blk_no+hblks, bblks, dbp, + &offset); + if (error) + goto bread_err2; - error = xlog_unpack_data(rhead, offset, log); - if (error) - goto bread_err2; + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; - blk_no += bblks + hblks; - } + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) + goto bread_err2; + blk_no += bblks + hblks; } bread_err2: From 94f3cad555d66048906deade06a764f7ea2c6e4d Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Sep 2014 11:57:52 +1000 Subject: [PATCH 08/56] xfs: check resblks before calling xfs_dir_canenter Move the resblks test out of the xfs_dir_canenter, and into the caller. This makes a little more sense on the face of it; xfs_dir_canenter immediately returns if resblks !=0; and given some of the comments preceding the calls: * Check for ability to enter directory entry, if no space reserved. even more so. It also facilitates the next patch. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2.c | 7 +------ fs/xfs/libxfs/xfs_dir2.h | 2 +- fs/xfs/xfs_inode.c | 24 +++++++++++++++--------- fs/xfs/xfs_symlink.c | 8 +++++--- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 6cef22152fd6..ea84e1c47284 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -535,22 +535,17 @@ out_free: /* * See if this entry can be added to the directory without allocating space. - * First checks that the caller couldn't reserve enough space (resblks = 0). */ int xfs_dir_canenter( xfs_trans_t *tp, xfs_inode_t *dp, - struct xfs_name *name, /* name of entry to add */ - uint resblks) + struct xfs_name *name) /* name of entry to add */ { struct xfs_da_args *args; int rval; int v; /* type-checking value */ - if (resblks) - return 0; - ASSERT(S_ISDIR(dp->i_d.di_mode)); args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index c8e86b0b5e99..4dff261e6ed5 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -136,7 +136,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, uint resblks); + struct xfs_name *name); /* * Direct call from the bmap code, bypassing the generic directory layer. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index fea3c92fb3f0..c92cb48617d1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1153,9 +1153,11 @@ xfs_create( if (error) goto out_trans_cancel; - error = xfs_dir_canenter(tp, dp, name, resblks); - if (error) - goto out_trans_cancel; + if (!resblks) { + error = xfs_dir_canenter(tp, dp, name); + if (error) + goto out_trans_cancel; + } /* * A newly created regular or special file just has one directory @@ -1421,9 +1423,11 @@ xfs_link( goto error_return; } - error = xfs_dir_canenter(tp, tdp, target_name, resblks); - if (error) - goto error_return; + if (!resblks) { + error = xfs_dir_canenter(tp, tdp, target_name); + if (error) + goto error_return; + } xfs_bmap_init(&free_list, &first_block); @@ -2759,9 +2763,11 @@ xfs_rename( * If there's no space reservation, check the entry will * fit before actually inserting it. */ - error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); - if (error) - goto error_return; + if (!spaceres) { + error = xfs_dir_canenter(tp, target_dp, target_name); + if (error) + goto error_return; + } /* * If target does not exist and the rename crosses * directories, adjust the target directory link count diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 6a944a2cd36f..02ae62a998e0 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -269,9 +269,11 @@ xfs_symlink( /* * Check for ability to enter directory entry, if no space reserved. */ - error = xfs_dir_canenter(tp, dp, link_name, resblks); - if (error) - goto error_return; + if (!resblks) { + error = xfs_dir_canenter(tp, dp, link_name); + if (error) + goto error_return; + } /* * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. From b16ed7c114b8cca45fa87b675c431f43ff90c179 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Sep 2014 11:58:07 +1000 Subject: [PATCH 09/56] xfs: combine xfs_dir_canenter into xfs_dir_createname xfs_dir_canenter and xfs_dir_createname are almost identical. Fold the former into the latter, with a helpful wrapper for the former. If createname is called without an inode number, it now only checks for space, and does not actually add the entry. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2.c | 60 ++++++++-------------------------------- 1 file changed, 11 insertions(+), 49 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index ea84e1c47284..7075aaf131f4 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -237,7 +237,8 @@ xfs_dir_init( } /* - Enter a name in a directory. + * Enter a name in a directory, or check for available space. + * If inum is 0, only the available space test is performed. */ int xfs_dir_createname( @@ -254,10 +255,12 @@ xfs_dir_createname( int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - rval = xfs_dir_ino_validate(tp->t_mountp, inum); - if (rval) - return rval; - XFS_STATS_INC(xs_dir_create); + if (inum) { + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) + return rval; + XFS_STATS_INC(xs_dir_create); + } args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); if (!args) @@ -276,6 +279,8 @@ xfs_dir_createname( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + if (!inum) + args->op_flags |= XFS_DA_OP_JUSTCHECK; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_addname(args); @@ -542,50 +547,7 @@ xfs_dir_canenter( xfs_inode_t *dp, struct xfs_name *name) /* name of entry to add */ { - struct xfs_da_args *args; - int rval; - int v; /* type-checking value */ - - ASSERT(S_ISDIR(dp->i_d.di_mode)); - - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); - if (!args) - return -ENOMEM; - - args->geo = dp->i_mount->m_dir_geo; - args->name = name->name; - args->namelen = name->len; - args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); - args->dp = dp; - args->whichfork = XFS_DATA_FORK; - args->trans = tp; - args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | - XFS_DA_OP_OKNOENT; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_addname(args); - goto out_free; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_addname(args); - goto out_free; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_addname(args); - else - rval = xfs_dir2_node_addname(args); -out_free: - kmem_free(args); - return rval; + return xfs_dir_createname(tp, dp, name, 0, NULL, NULL, 0); } /* From afabfd30d05264ff493c24bce310b6a5350f099b Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Sep 2014 11:58:42 +1000 Subject: [PATCH 10/56] xfs: combine xfs_rtmodify_summary and xfs_rtget_summary xfs_rtmodify_summary and xfs_rtget_summary are almost identical; fold them into xfs_rtmodify_summary_int(), with wrappers for each of the original calls. The _int function modifies if a delta is passed, and returns a summary pointer if *sum is passed. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_rtbitmap.c | 45 +++++++++++++++++++++++------ fs/xfs/xfs_rtalloc.c | 55 ++---------------------------------- fs/xfs/xfs_rtalloc.h | 4 +++ 3 files changed, 43 insertions(+), 61 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f4dd697cac08..50e3b9302722 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -424,20 +424,24 @@ xfs_rtfind_forw( } /* - * Read and modify the summary information for a given extent size, + * Read and/or modify the summary information for a given extent size, * bitmap block combination. * Keeps track of a current summary block, so we don't keep reading * it from the buffer cache. + * + * Summary information is returned in *sum if specified. + * If no delta is specified, returns summary only. */ int -xfs_rtmodify_summary( - xfs_mount_t *mp, /* file system mount point */ +xfs_rtmodify_summary_int( + xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ int log, /* log2 of extent size */ xfs_rtblock_t bbno, /* bitmap block number */ int delta, /* change to make to summary info */ xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { xfs_buf_t *bp; /* buffer for the summary block */ int error; /* error value */ @@ -480,15 +484,40 @@ xfs_rtmodify_summary( } } /* - * Point to the summary information, modify and log it. + * Point to the summary information, modify/log it, and/or copy it out. */ sp = XFS_SUMPTR(mp, bp, so); - *sp += delta; - xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), - (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); + if (delta) { + uint first = (uint)((char *)sp - (char *)bp->b_addr); + + *sp += delta; + xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); + } + if (sum) { + /* + * Drop the buffer if we're not asked to remember it. + */ + if (!rbpp) + xfs_trans_brelse(tp, bp); + *sum = *sp; + } return 0; } +int +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + return xfs_rtmodify_summary_int(mp, tp, log, bbno, + delta, rbpp, rsb, NULL); +} + /* * Set the given range of bitmap bits to the given value. * Do whatever I/O and logging is required. diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 909e143b87ae..d1160cce5dad 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -46,7 +46,7 @@ * Keeps track of a current summary block, so we don't keep reading * it from the buffer cache. */ -STATIC int /* error */ +int xfs_rtget_summary( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ @@ -56,60 +56,9 @@ xfs_rtget_summary( xfs_fsblock_t *rsb, /* in/out: summary block number */ xfs_suminfo_t *sum) /* out: summary info for this block */ { - xfs_buf_t *bp; /* buffer for summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ - - /* - * Compute entry number in the summary file. - */ - so = XFS_SUMOFFS(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (rbpp && *rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (rbpp && *rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } - } - /* - * Point to the summary information & copy it out. - */ - sp = XFS_SUMPTR(mp, bp, so); - *sum = *sp; - /* - * Drop the buffer if we're not asked to remember it. - */ - if (!rbpp) - xfs_trans_brelse(tp, bp); - return 0; + return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum); } - /* * Return whether there are any free extents in the size range given * by low and high, for the bitmap block bbno. diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index c642795324af..76c0a4a9bb17 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -111,6 +111,10 @@ int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t *rtblock); int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val); +int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp, + int log, xfs_rtblock_t bbno, int delta, + xfs_buf_t **rbpp, xfs_fsblock_t *rsb, + xfs_suminfo_t *sum); int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp, xfs_fsblock_t *rsb); From ab6978c295b074eb2ba4b06fdf206c7ab4f293e5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Sep 2014 11:59:12 +1000 Subject: [PATCH 11/56] xfs: remove rbpp check from xfs_rtmodify_summary_int rbpp is always passed into xfs_rtmodify_summary and xfs_rtget_summary, so there is no need to test for it in xfs_rtmodify_summary_int. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_rtbitmap.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 50e3b9302722..7c818f1e4484 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -460,7 +460,7 @@ xfs_rtmodify_summary_int( /* * If we have an old buffer, and the block number matches, use that. */ - if (rbpp && *rbpp && *rsb == sb) + if (*rbpp && *rsb == sb) bp = *rbpp; /* * Otherwise we have to get the buffer. @@ -469,7 +469,7 @@ xfs_rtmodify_summary_int( /* * If there was an old one, get rid of it first. */ - if (rbpp && *rbpp) + if (*rbpp) xfs_trans_brelse(tp, *rbpp); error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); if (error) { @@ -478,10 +478,8 @@ xfs_rtmodify_summary_int( /* * Remember this buffer and block for the next call. */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } + *rbpp = bp; + *rsb = sb; } /* * Point to the summary information, modify/log it, and/or copy it out. @@ -493,14 +491,8 @@ xfs_rtmodify_summary_int( *sp += delta; xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); } - if (sum) { - /* - * Drop the buffer if we're not asked to remember it. - */ - if (!rbpp) - xfs_trans_brelse(tp, bp); + if (sum) *sum = *sp; - } return 0; } From 0d085a529b427d97710e6a41f8a4f23e1757cd12 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 23 Sep 2014 15:36:27 +1000 Subject: [PATCH 12/56] xfs: ensure WB_SYNC_ALL writeback handles partial pages correctly XFS has been having trouble with stray delayed allocation extents beyond EOF for a long time. Recent changes to the collapse range code has triggered erroneous EBUSY errors on page invalidtion for block size smaller than page size filesystems. These have been caused by dirty buffers beyond EOF on a partial page which do not get written to disk during a sync. The issue is that write-ahead in xfs_cluster_write() finds such a partial page and handles it by leaving the page dirty but pushing it into a writeback state. This used to work just fine, as the write_cache_pages() code would then find the dirty partial page in the next mapping tree lookup as the dirty tag is still set. Unfortunately, when we moved to a mark and sweep approach to writeback to fix other writeback sync issues, we broken this. THe act of marking the page as under writeback now clears the TOWRITE tag in the radix tree, even though the page is still dirty. This causes the TOWRITE tag to be cleared, and hence the next lookup on the mapping tree does not find the dirty partial page and so doesn't try to write it again. This same writeback bug was found recently in ext4 and fixed in commit 1c8349a ("ext4: fix data integrity sync in ordered mode") without communication to the wider filesystem community. We can use exactly the same fix here so the TOWRITE flag is not cleared on partial page writes. cc: stable@vger.kernel.org # dependent on 1c8349a17137b93f0a83f276c764a6df1b9a116e Root-cause-found-by: Brian Foster Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index b984647c24db..2f502537a39c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -434,10 +434,22 @@ xfs_start_page_writeback( { ASSERT(PageLocked(page)); ASSERT(!PageWriteback(page)); - if (clear_dirty) + + /* + * if the page was not fully cleaned, we need to ensure that the higher + * layers come back to it correctly. That means we need to keep the page + * dirty, and for WB_SYNC_ALL writeback we need to ensure the + * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to + * write this page in this writeback sweep will be made. + */ + if (clear_dirty) { clear_page_dirty_for_io(page); - set_page_writeback(page); + set_page_writeback(page); + } else + set_page_writeback_keepwrite(page); + unlock_page(page); + /* If no buffers on the page are to be written, finish it here */ if (!buffers) end_page_writeback(page); From 2c845f5a5f238f42376b6551a7f7716952c8f509 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 23 Sep 2014 15:37:09 +1000 Subject: [PATCH 13/56] xfs: track collapse via file offset rather than extent index The collapse range implementation uses a transaction per extent shift. The progress of the overall operation is tracked via the current extent index of the in-core extent list. This is racy because the ilock must be dropped and reacquired for each transaction according to locking and log reservation rules. Therefore, writeback to prior regions of the file is possible and can change the extent count. This changes the extent to which the current index refers and causes the collapse to fail mid operation. To avoid this problem, the entire file is currently written back before the collapse operation starts. To eliminate the need to flush the entire file, use the file offset (fsb) to track the progress of the overall extent shift operation rather than the extent index. Modify xfs_bmap_shift_extents() to unconditionally convert the start_fsb parameter to an extent index and return the file offset of the extent where the shift left off, if further extents exist. The bulk of ths function can remain based on extent index as ilock is held by the caller. xfs_collapse_file_space() now uses the fsb output as the starting point for the subsequent shift. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 85 +++++++++++++++++++++------------------- fs/xfs/libxfs/xfs_bmap.h | 7 ++-- fs/xfs/xfs_bmap_util.c | 12 +++--- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 86df952d3e24..4b3f1b92cddd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5406,20 +5406,21 @@ error0: /* * Shift extent records to the left to cover a hole. * - * The maximum number of extents to be shifted in a single operation - * is @num_exts, and @current_ext keeps track of the current extent - * index we have shifted. @offset_shift_fsb is the length by which each - * extent is shifted. If there is no hole to shift the extents - * into, this will be considered invalid operation and we abort immediately. + * The maximum number of extents to be shifted in a single operation is + * @num_exts. @start_fsb specifies the file offset to start the shift and the + * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb + * is the length by which each extent is shifted. If there is no hole to shift + * the extents into, this will be considered invalid operation and we abort + * immediately. */ int xfs_bmap_shift_extents( struct xfs_trans *tp, struct xfs_inode *ip, - int *done, xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, - xfs_extnum_t *current_ext, + int *done, + xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, int num_exts) @@ -5431,6 +5432,7 @@ xfs_bmap_shift_extents( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; xfs_extnum_t nexts = 0; + xfs_extnum_t current_ext; xfs_fileoff_t startoff; int error = 0; int i; @@ -5451,7 +5453,8 @@ xfs_bmap_shift_extents( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ASSERT(current_ext != NULL); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -5462,20 +5465,18 @@ xfs_bmap_shift_extents( } /* - * If *current_ext is 0, we would need to lookup the extent - * from where we would start shifting and store it in gotp. + * Look up the extent index for the fsb where we start shifting. We can + * henceforth iterate with current_ext as extent list changes are locked + * out via ilock. + * + * gotp can be null in 2 cases: 1) if there are no extents or 2) + * start_fsb lies in a hole beyond which there are no extents. Either + * way, we are done. */ - if (!*current_ext) { - gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext); - /* - * gotp can be null in 2 cases: 1) if there are no extents - * or 2) start_fsb lies in a hole beyond which there are - * no extents. Either way, we are done. - */ - if (!gotp) { - *done = 1; - return 0; - } + gotp = xfs_iext_bno_to_ext(ifp, start_fsb, ¤t_ext); + if (!gotp) { + *done = 1; + return 0; } if (ifp->if_flags & XFS_IFBROOT) { @@ -5487,36 +5488,32 @@ xfs_bmap_shift_extents( /* * There may be delalloc extents in the data fork before the range we - * are collapsing out, so we cannot - * use the count of real extents here. Instead we have to calculate it - * from the incore fork. + * are collapsing out, so we cannot use the count of real extents here. + * Instead we have to calculate it from the incore fork. */ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - while (nexts++ < num_exts && *current_ext < total_extents) { + while (nexts++ < num_exts && current_ext < total_extents) { - gotp = xfs_iext_get_ext(ifp, *current_ext); + gotp = xfs_iext_get_ext(ifp, current_ext); xfs_bmbt_get_all(gotp, &got); startoff = got.br_startoff - offset_shift_fsb; /* - * Before shifting extent into hole, make sure that the hole - * is large enough to accomodate the shift. + * Before shifting extent into hole, make sure that the hole is + * large enough to accommodate the shift. */ - if (*current_ext) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, - *current_ext - 1), &left); - + if (current_ext > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, current_ext - 1), + &left); if (startoff < left.br_startoff + left.br_blockcount) error = -EINVAL; } else if (offset_shift_fsb > got.br_startoff) { /* - * When first extent is shifted, offset_shift_fsb - * should be less than the stating offset of - * the first extent. + * When first extent is shifted, offset_shift_fsb should + * be less than the stating offset of the first extent. */ error = -EINVAL; } - if (error) goto del_cursor; @@ -5531,7 +5528,7 @@ xfs_bmap_shift_extents( } /* Check if we can merge 2 adjacent extents */ - if (*current_ext && + if (current_ext && left.br_startoff + left.br_blockcount == startoff && left.br_startblock + left.br_blockcount == got.br_startblock && @@ -5539,7 +5536,7 @@ xfs_bmap_shift_extents( left.br_blockcount + got.br_blockcount <= MAXEXTLEN) { blockcount = left.br_blockcount + got.br_blockcount; - xfs_iext_remove(ip, *current_ext, 1, 0); + xfs_iext_remove(ip, current_ext, 1, 0); logflags |= XFS_ILOG_CORE; if (cur) { error = xfs_btree_delete(cur, &i); @@ -5551,7 +5548,7 @@ xfs_bmap_shift_extents( } XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - gotp = xfs_iext_get_ext(ifp, --*current_ext); + gotp = xfs_iext_get_ext(ifp, --current_ext); xfs_bmbt_get_all(gotp, &got); /* Make cursor point to the extent we will update */ @@ -5585,13 +5582,18 @@ xfs_bmap_shift_extents( logflags |= XFS_ILOG_DEXT; } - (*current_ext)++; + current_ext++; total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } /* Check if we are done */ - if (*current_ext == total_extents) + if (current_ext == total_extents) *done = 1; + else if (next_fsb) { + gotp = xfs_iext_get_ext(ifp, current_ext); + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + } del_cursor: if (cur) @@ -5600,5 +5602,6 @@ del_cursor: if (logflags) xfs_trans_log_inode(tp, ip, logflags); + return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b879ca56a64c..44db6db86402 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -178,9 +178,8 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, - int *done, xfs_fileoff_t start_fsb, - xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext, - xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, - int num_exts); + xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, + int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, int num_exts); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1707980f9a4b..1e96d778bb9e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1456,18 +1456,18 @@ xfs_collapse_file_space( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; - xfs_extnum_t current_ext = 0; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; int committed; xfs_fileoff_t start_fsb; + xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); trace_xfs_collapse_file_space(ip); - start_fsb = XFS_B_TO_FSB(mp, offset + len); + next_fsb = XFS_B_TO_FSB(mp, offset + len); shift_fsb = XFS_B_TO_FSB(mp, len); /* @@ -1525,10 +1525,10 @@ xfs_collapse_file_space( * We are using the write transaction in which max 2 bmbt * updates are allowed */ - error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb, - shift_fsb, ¤t_ext, - &first_block, &free_list, - XFS_BMAP_MAX_SHIFT_EXTENTS); + start_fsb = next_fsb; + error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb, + &done, &next_fsb, &first_block, &free_list, + XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) goto out; From ddb19e3180fa42362a04e86771d758be1de0bb13 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 23 Sep 2014 15:38:09 +1000 Subject: [PATCH 14/56] xfs: refactor shift-by-merge into xfs_bmse_merge() helper The extent shift mechanism in xfs_bmap_shift_extents() is complicated and handles several different, non-deterministic scenarios. These include extent shifts, extent merges and potential btree updates in either of the former scenarios. Refactor the code to be more linear and readable. The loop logic in xfs_bmap_shift_extents() and some initial error checking is adjusted slightly. The associated btree lookup and update/delete operations are condensed into single blocks of code. This reduces the number of btree-specific blocks and facilitates the separation of the merge operation into a new xfs_bmse_merge() and xfs_bmse_can_merge() helpers. This is a code refactor only. The behavior of extent shift and collapse range is not modified. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 246 +++++++++++++++++++++++++++------------ 1 file changed, 169 insertions(+), 77 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4b3f1b92cddd..532c4aab20f2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5403,6 +5403,120 @@ error0: return error; } +/* + * Determine whether an extent shift can be accomplished by a merge with the + * extent that precedes the target hole of the shift. + */ +STATIC bool +xfs_bmse_can_merge( + struct xfs_bmbt_irec *left, /* preceding extent */ + struct xfs_bmbt_irec *got, /* current extent to shift */ + xfs_fileoff_t shift) /* shift fsb */ +{ + xfs_fileoff_t startoff; + + startoff = got->br_startoff - shift; + + /* + * The extent, once shifted, must be adjacent in-file and on-disk with + * the preceding extent. + */ + if ((left->br_startoff + left->br_blockcount != startoff) || + (left->br_startblock + left->br_blockcount != got->br_startblock) || + (left->br_state != got->br_state) || + (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + return false; + + return true; +} + +/* + * A bmap extent shift adjusts the file offset of an extent to fill a preceding + * hole in the file. If an extent shift would result in the extent being fully + * adjacent to the extent that currently precedes the hole, we can merge with + * the preceding extent rather than do the shift. + * + * This function assumes the caller has verified a shift-by-merge is possible + * with the provided extents via xfs_bmse_can_merge(). + */ +STATIC int +xfs_bmse_merge( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t shift, /* shift fsb */ + int current_ext, /* idx of gotp */ + struct xfs_bmbt_rec_host *gotp, /* extent to shift */ + struct xfs_bmbt_rec_host *leftp, /* preceding extent */ + struct xfs_btree_cur *cur, + int *logflags) /* output */ +{ + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec left; + xfs_filblks_t blockcount; + int error, i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_bmbt_get_all(gotp, &got); + xfs_bmbt_get_all(leftp, &left); + blockcount = left.br_blockcount + got.br_blockcount; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_bmse_can_merge(&left, &got, shift)); + + /* + * Merge the in-core extents. Note that the host record pointers and + * current_ext index are invalid once the extent has been removed via + * xfs_iext_remove(). + */ + xfs_bmbt_set_blockcount(leftp, blockcount); + xfs_iext_remove(ip, current_ext, 1, 0); + + /* + * Update the on-disk extent count, the btree if necessary and log the + * inode. + */ + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + *logflags |= XFS_ILOG_CORE; + if (!cur) { + *logflags |= XFS_ILOG_DEXT; + return 0; + } + + /* lookup and remove the extent to merge */ + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + + error = xfs_btree_delete(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + + /* lookup and update size of the previous extent */ + error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, + left.br_blockcount, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + + left.br_blockcount = blockcount; + + error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, + left.br_blockcount, left.br_state); + if (error) + goto out_error; + + return 0; + +out_error: + return error; +} + /* * Shift extent records to the left to cover a hole. * @@ -5427,6 +5541,7 @@ xfs_bmap_shift_extents( { struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_rec_host *leftp; struct xfs_bmbt_irec got; struct xfs_bmbt_irec left; struct xfs_mount *mp = ip->i_mount; @@ -5438,7 +5553,6 @@ xfs_bmap_shift_extents( int i; int whichfork = XFS_DATA_FORK; int logflags = 0; - xfs_filblks_t blockcount = 0; int total_extents; if (unlikely(XFS_TEST_ERROR( @@ -5464,6 +5578,13 @@ xfs_bmap_shift_extents( return error; } + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } + /* * Look up the extent index for the fsb where we start shifting. We can * henceforth iterate with current_ext as extent list changes are locked @@ -5476,14 +5597,17 @@ xfs_bmap_shift_extents( gotp = xfs_iext_bno_to_ext(ifp, start_fsb, ¤t_ext); if (!gotp) { *done = 1; - return 0; + goto del_cursor; } + xfs_bmbt_get_all(gotp, &got); - if (ifp->if_flags & XFS_IFBROOT) { - cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; - cur->bc_private.b.flags = 0; + /* + * If the first extent is shifted, offset_shift_fsb cannot be larger + * than the starting offset of the first extent. + */ + if (current_ext == 0 && got.br_startoff < offset_shift_fsb) { + error = -EINVAL; + goto del_cursor; } /* @@ -5493,30 +5617,41 @@ xfs_bmap_shift_extents( */ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); while (nexts++ < num_exts && current_ext < total_extents) { - - gotp = xfs_iext_get_ext(ifp, current_ext); - xfs_bmbt_get_all(gotp, &got); startoff = got.br_startoff - offset_shift_fsb; - /* - * Before shifting extent into hole, make sure that the hole is - * large enough to accommodate the shift. - */ + /* grab the left extent and check for a potential merge */ if (current_ext > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, current_ext - 1), - &left); - if (startoff < left.br_startoff + left.br_blockcount) - error = -EINVAL; - } else if (offset_shift_fsb > got.br_startoff) { - /* - * When first extent is shifted, offset_shift_fsb should - * be less than the stating offset of the first extent. - */ - error = -EINVAL; - } - if (error) - goto del_cursor; + leftp = xfs_iext_get_ext(ifp, current_ext - 1); + xfs_bmbt_get_all(leftp, &left); + /* make sure hole is large enough for shift */ + if (startoff < left.br_startoff + left.br_blockcount) { + error = -EINVAL; + goto del_cursor; + } + + if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { + error = xfs_bmse_merge(ip, whichfork, + offset_shift_fsb, current_ext, gotp, + leftp, cur, &logflags); + if (error) + goto del_cursor; + + /* + * The extent was merged so adjust the extent + * index and move onto the next. + */ + current_ext--; + goto next; + } + } + + /* + * We didn't merge the extent so do the shift. Update the start + * offset in the in-core extent and btree, if necessary. + */ + xfs_bmbt_set_startoff(gotp, startoff); + logflags |= XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, @@ -5525,53 +5660,8 @@ xfs_bmap_shift_extents( if (error) goto del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); - } - /* Check if we can merge 2 adjacent extents */ - if (current_ext && - left.br_startoff + left.br_blockcount == startoff && - left.br_startblock + left.br_blockcount == - got.br_startblock && - left.br_state == got.br_state && - left.br_blockcount + got.br_blockcount <= MAXEXTLEN) { - blockcount = left.br_blockcount + - got.br_blockcount; - xfs_iext_remove(ip, current_ext, 1, 0); - logflags |= XFS_ILOG_CORE; - if (cur) { - error = xfs_btree_delete(cur, &i); - if (error) - goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); - } else { - logflags |= XFS_ILOG_DEXT; - } - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - gotp = xfs_iext_get_ext(ifp, --current_ext); - xfs_bmbt_get_all(gotp, &got); - - /* Make cursor point to the extent we will update */ - if (cur) { - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - &i); - if (error) - goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); - } - - xfs_bmbt_set_blockcount(gotp, blockcount); - got.br_blockcount = blockcount; - } else { - /* We have to update the startoff */ - xfs_bmbt_set_startoff(gotp, startoff); got.br_startoff = startoff; - } - - logflags |= XFS_ILOG_CORE; - if (cur) { error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, got.br_blockcount, @@ -5582,18 +5672,20 @@ xfs_bmap_shift_extents( logflags |= XFS_ILOG_DEXT; } - current_ext++; +next: + /* update total extent count and grab the next record */ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (++current_ext >= total_extents) + break; + gotp = xfs_iext_get_ext(ifp, current_ext); + xfs_bmbt_get_all(gotp, &got); } /* Check if we are done */ if (current_ext == total_extents) *done = 1; - else if (next_fsb) { - gotp = xfs_iext_get_ext(ifp, current_ext); - xfs_bmbt_get_all(gotp, &got); + else if (next_fsb) *next_fsb = got.br_startoff; - } del_cursor: if (cur) From a979bdfea10a61dce0055b4d416d640f4f5f495e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 23 Sep 2014 15:39:04 +1000 Subject: [PATCH 15/56] xfs: refactor single extent shift into xfs_bmse_shift_one() helper xfs_bmap_shift_extents() has a variety of conditions and error checks that make the logic difficult to follow and indent heavy. Refactor the loop body of this function into a new xfs_bmse_shift_one() helper. This simplifies the error checks, eliminates index decrement on merge hack by pushing the index increment down into the helper, and makes the code more readable by reducing multiple levels of indentation. This is a code refactor only. The behavior of extent shift and collapse range is not modified. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 164 ++++++++++++++++++++++----------------- 1 file changed, 91 insertions(+), 73 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 532c4aab20f2..69bf8d868b64 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5517,6 +5517,88 @@ out_error: return error; } +/* + * Shift a single extent. + */ +STATIC int +xfs_bmse_shift_one( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t offset_shift_fsb, + int *current_ext, + struct xfs_bmbt_rec_host *gotp, + struct xfs_btree_cur *cur, + int *logflags) +{ + struct xfs_ifork *ifp; + xfs_fileoff_t startoff; + struct xfs_bmbt_rec_host *leftp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec left; + int error; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + + xfs_bmbt_get_all(gotp, &got); + startoff = got.br_startoff - offset_shift_fsb; + + /* + * If this is the first extent in the file, make sure there's enough + * room at the start of the file and jump right to the shift as there's + * no left extent to merge. + */ + if (*current_ext == 0) { + if (got.br_startoff < offset_shift_fsb) + return -EINVAL; + goto shift_extent; + } + + /* grab the left extent and check for a large enough hole */ + leftp = xfs_iext_get_ext(ifp, *current_ext - 1); + xfs_bmbt_get_all(leftp, &left); + + if (startoff < left.br_startoff + left.br_blockcount) + return -EINVAL; + + /* check whether to merge the extent or shift it down */ + if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) + goto shift_extent; + + return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext, + gotp, leftp, cur, logflags); + +shift_extent: + /* + * Increment the extent index for the next iteration, update the start + * offset of the in-core extent and update the btree if applicable. + */ + (*current_ext)++; + xfs_bmbt_set_startoff(gotp, startoff); + *logflags |= XFS_ILOG_CORE; + if (!cur) { + *logflags |= XFS_ILOG_DEXT; + return 0; + } + + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + + got.br_startoff = startoff; + error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, got.br_state); + if (error) + return error; + + return 0; + +out_error: + return error; +} + /* * Shift extent records to the left to cover a hole. * @@ -5541,16 +5623,12 @@ xfs_bmap_shift_extents( { struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_rec_host *gotp; - struct xfs_bmbt_rec_host *leftp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; xfs_extnum_t nexts = 0; xfs_extnum_t current_ext; - xfs_fileoff_t startoff; int error = 0; - int i; int whichfork = XFS_DATA_FORK; int logflags = 0; int total_extents; @@ -5599,16 +5677,6 @@ xfs_bmap_shift_extents( *done = 1; goto del_cursor; } - xfs_bmbt_get_all(gotp, &got); - - /* - * If the first extent is shifted, offset_shift_fsb cannot be larger - * than the starting offset of the first extent. - */ - if (current_ext == 0 && got.br_startoff < offset_shift_fsb) { - error = -EINVAL; - goto del_cursor; - } /* * There may be delalloc extents in the data fork before the range we @@ -5617,75 +5685,25 @@ xfs_bmap_shift_extents( */ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); while (nexts++ < num_exts && current_ext < total_extents) { - startoff = got.br_startoff - offset_shift_fsb; + error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, + ¤t_ext, gotp, cur, &logflags); + if (error) + goto del_cursor; - /* grab the left extent and check for a potential merge */ - if (current_ext > 0) { - leftp = xfs_iext_get_ext(ifp, current_ext - 1); - xfs_bmbt_get_all(leftp, &left); - - /* make sure hole is large enough for shift */ - if (startoff < left.br_startoff + left.br_blockcount) { - error = -EINVAL; - goto del_cursor; - } - - if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { - error = xfs_bmse_merge(ip, whichfork, - offset_shift_fsb, current_ext, gotp, - leftp, cur, &logflags); - if (error) - goto del_cursor; - - /* - * The extent was merged so adjust the extent - * index and move onto the next. - */ - current_ext--; - goto next; - } - } - - /* - * We didn't merge the extent so do the shift. Update the start - * offset in the in-core extent and btree, if necessary. - */ - xfs_bmbt_set_startoff(gotp, startoff); - logflags |= XFS_ILOG_CORE; - if (cur) { - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - &i); - if (error) - goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); - - got.br_startoff = startoff; - error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state); - if (error) - goto del_cursor; - } else { - logflags |= XFS_ILOG_DEXT; - } - -next: /* update total extent count and grab the next record */ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - if (++current_ext >= total_extents) + if (current_ext >= total_extents) break; gotp = xfs_iext_get_ext(ifp, current_ext); - xfs_bmbt_get_all(gotp, &got); } /* Check if we are done */ - if (current_ext == total_extents) + if (current_ext == total_extents) { *done = 1; - else if (next_fsb) + } else if (next_fsb) { + xfs_bmbt_get_all(gotp, &got); *next_fsb = got.br_startoff; + } del_cursor: if (cur) From f71721d061e872a39b2680d13f309c1eb6893438 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 23 Sep 2014 15:39:05 +1000 Subject: [PATCH 16/56] xfs: writeback and inval. file range to be shifted by collapse The collapse range operation currently writes the entire file before starting the collapse to avoid changes in the in-core extent list due to writeback causing the extent count to change. Now that collapse range is fsb based rather than extent index based it can sustain changes in the extent list during the shift sequence without disruption. Modify xfs_collapse_file_space() to writeback and invalidate pages associated with the range of the file to be shifted. xfs_free_file_space() currently has similar behavior, but the space free need only affect the region of the file that is freed and this could change in the future. Also update the comments to reflect the current implementation. We retain the eofblocks trim permanently as a best option for dealing with delalloc extents. We don't shift delalloc extents because this scenario only occurs with post-eof preallocation (since data must be flushed such that the cache can be invalidated and data can be shifted). That means said space must also be initialized before being shifted into the accessible region of the file only to be immediately truncated off as the last part of the collapse. In other words, the eofblocks trim will happen anyways, we just run it first to ensure the file remains in a consistent state throughout the collapse. Finally, detect and fail explicitly in the event of a delalloc extent during the extent shift. The implementation does not support delalloc extents and the caller is expected to prevent this scenario in advance as is done by collapse. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 4 ++++ fs/xfs/xfs_bmap_util.c | 32 +++++++++++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 69bf8d868b64..79c981984dca 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5543,6 +5543,10 @@ xfs_bmse_shift_one( xfs_bmbt_get_all(gotp, &got); startoff = got.br_startoff - offset_shift_fsb; + /* delalloc extents should be prevented by caller */ + XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock), + out_error); + /* * If this is the first extent in the file, make sure there's enough * room at the start of the file and jump right to the shift as there's diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1e96d778bb9e..eae763f4600e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1470,27 +1470,33 @@ xfs_collapse_file_space( next_fsb = XFS_B_TO_FSB(mp, offset + len); shift_fsb = XFS_B_TO_FSB(mp, len); - /* - * Writeback the entire file and force remove any post-eof blocks. The - * writeback prevents changes to the extent list via concurrent - * writeback and the eofblocks trim prevents the extent shift algorithm - * from running into a post-eof delalloc extent. - * - * XXX: This is a temporary fix until the extent shift loop below is - * converted to use offsets and lookups within the ILOCK rather than - * carrying around the index into the extent list for the next - * iteration. - */ - error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + error = xfs_free_file_space(ip, offset, len); if (error) return error; + + /* + * Trim eofblocks to avoid shifting uninitialized post-eof preallocation + * into the accessible region of the file. + */ if (xfs_can_free_eofblocks(ip, true)) { error = xfs_free_eofblocks(mp, ip, false); if (error) return error; } - error = xfs_free_file_space(ip, offset, len); + /* + * Writeback and invalidate cache for the remainder of the file as we're + * about to shift down every extent from the collapse range to EOF. The + * free of the collapse range above might have already done some of + * this, but we shouldn't rely on it to do anything outside of the range + * that was freed. + */ + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + offset + len, -1); + if (error) + return error; + error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + (offset + len) >> PAGE_CACHE_SHIFT, -1); if (error) return error; From 8b5279e33f241a074a9c8649bba8f77a2167b798 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 23 Sep 2014 15:39:05 +1000 Subject: [PATCH 17/56] xfs: only writeback and truncate pages for the freed range xfs_free_file_space() only affects the range of the file for which space is being freed. It currently writes and truncates the page cache from the start offset of the free to EOF. Modify xfs_free_file_space() to write back and truncate page cache of just the range being freed. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index eae763f4600e..809ae7d395c3 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1205,6 +1205,7 @@ xfs_free_file_space( xfs_bmap_free_t free_list; xfs_bmbt_irec_t imap; xfs_off_t ioffset; + xfs_off_t iendoffset; xfs_extlen_t mod=0; xfs_mount_t *mp; int nimap; @@ -1233,12 +1234,13 @@ xfs_free_file_space( inode_dio_wait(VFS_I(ip)); rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); - ioffset = offset & ~(rounding - 1); - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - ioffset, -1); + ioffset = round_down(offset, rounding); + iendoffset = round_up(offset + len, rounding) - 1; + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, + iendoffset); if (error) goto out; - truncate_pagecache_range(VFS_I(ip), ioffset, -1); + truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); /* * Need to zero the stuff we're not freeing, on disk. From 8af3dcd3c89aef10375bdd79d5f336b22b57487c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 23 Sep 2014 15:57:59 +1000 Subject: [PATCH 18/56] xfs: xlog_cil_force_lsn doesn't always wait correctly When running a tight mount/unmount loop on an older kernel, RedHat QE found that unmount would occasionally hang in xfs_buf_unpin_wait() on the superblock buffer. Tracing and other debug work by Eric Sandeen indicated that it was hanging on the writing of the superblock during unmount immediately after logging the superblock counters in a synchronous transaction. Further debug indicated that the synchronous transaction was not waiting for completion correctly, and we narrowed it down to xlog_cil_force_lsn() returning NULLCOMMITLSN and hence not pushing the transaction in the iclog buffer to disk correctly. While this unmount superblock write code is now very different in mainline kernels, the xlog_cil_force_lsn() code is identical, and it was bisected to the backport of commit f876e44 ("xfs: always do log forces via the workqueue"). This commit made the CIL push asynchronous for log forces and hence exposed a race condition that couldn't occur on a synchronous push. Essentially, the xlog_cil_force_lsn() relied implicitly on the fact that the sequence push would be complete by the time xlog_cil_push_now() returned, resulting in the context being pushed being in the committing list. When it was made asynchronous, it was recognised that there was a race condition in detecting whether an asynchronous push has started or not and code was added to handle it. Unfortunately, the fix was not quite right and left a race condition where it it would detect an empty CIL while a push was in progress before the context had been added to the committing list. This was incorrectly seen as a "nothing to do" condition and so would tell xfs_log_force_lsn() that there is nothing to wait for, and hence it would push the iclogbufs in memory. The fix is simple, but explaining the logic and the race condition is a lot more complex. The fix is to add the context to the committing list before we start emptying the CIL. This allows us to detect the difference between an empty "do nothing" push and a push that has not started by adding a discrete "emptying the CIL" state to avoid the transient, incorrect "empty" condition that the (unchanged) waiting code was seeing. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_cil.c | 47 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f6b79e5325dd..f506c457011e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -463,12 +463,40 @@ xlog_cil_push( spin_unlock(&cil->xc_push_lock); goto out_skip; } - spin_unlock(&cil->xc_push_lock); /* check for a previously pushed seqeunce */ - if (push_seq < cil->xc_ctx->sequence) + if (push_seq < cil->xc_ctx->sequence) { + spin_unlock(&cil->xc_push_lock); goto out_skip; + } + + /* + * We are now going to push this context, so add it to the committing + * list before we do anything else. This ensures that anyone waiting on + * this push can easily detect the difference between a "push in + * progress" and "CIL is empty, nothing to do". + * + * IOWs, a wait loop can now check for: + * the current sequence not being found on the committing list; + * an empty CIL; and + * an unchanged sequence number + * to detect a push that had nothing to do and therefore does not need + * waiting on. If the CIL is not empty, we get put on the committing + * list before emptying the CIL and bumping the sequence number. Hence + * an empty CIL and an unchanged sequence number means we jumped out + * above after doing nothing. + * + * Hence the waiter will either find the commit sequence on the + * committing list or the sequence number will be unchanged and the CIL + * still dirty. In that latter case, the push has not yet started, and + * so the waiter will have to continue trying to check the CIL + * committing list until it is found. In extreme cases of delay, the + * sequence may fully commit between the attempts the wait makes to wait + * on the commit sequence. + */ + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_push_lock); /* * pull all the log vectors off the items in the CIL, and @@ -532,7 +560,6 @@ xlog_cil_push( */ spin_lock(&cil->xc_push_lock); cil->xc_current_sequence = new_ctx->sequence; - list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); @@ -855,13 +882,15 @@ restart: * Hence by the time we have got here it our sequence may not have been * pushed yet. This is true if the current sequence still matches the * push sequence after the above wait loop and the CIL still contains - * dirty objects. + * dirty objects. This is guaranteed by the push code first adding the + * context to the committing list before emptying the CIL. * - * When the push occurs, it will empty the CIL and atomically increment - * the currect sequence past the push sequence and move it into the - * committing list. Of course, if the CIL is clean at the time of the - * push, it won't have pushed the CIL at all, so in that case we should - * try the push for this sequence again from the start just in case. + * Hence if we don't find the context in the committing list and the + * current sequence number is unchanged then the CIL contents are + * significant. If the CIL is empty, if means there was nothing to push + * and that means there is nothing to wait for. If the CIL is not empty, + * it means we haven't yet started the push, because if it had started + * we would have found the context on the committing list. */ if (sequence == cil->xc_current_sequence && !list_empty(&cil->xc_cil)) { From fb040131561a6b34cefb92cdf598218104abeda0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 23 Sep 2014 16:05:32 +1000 Subject: [PATCH 19/56] xfs: don't ASSERT on corrupt ftype xfs_dir3_data_get_ftype() gets the file type off disk, but ASSERTs if it's invalid: ASSERT(type < XFS_DIR3_FT_MAX); We shouldn't ASSERT on bad values read from disk. V3 dirs are CRC-protected, but V2 dirs + ftype are not. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_da_format.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index c9aee52a37e2..7e42fdfd2f1d 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -270,7 +270,6 @@ xfs_dir3_data_get_ftype( { __uint8_t ftype = dep->name[dep->namelen]; - ASSERT(ftype < XFS_DIR3_FT_MAX); if (ftype >= XFS_DIR3_FT_MAX) return XFS_DIR3_FT_UNKNOWN; return ftype; From e3cf17962a757e59fed2cbcbda6373c1b35a35dd Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 23 Sep 2014 16:05:55 +1000 Subject: [PATCH 20/56] xfs: remove second xfs_quota.h inclusion in xfs_icache.c xfs_quota.h was included twice. Signed-off-by: Fabian Frederick Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 981b2cf51985..b45f7b27b5df 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -33,7 +33,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_bmap_util.h" -#include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" From ea95961df714f7fc446aa4bedfc61510ed1b59cc Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Tue, 23 Sep 2014 16:11:43 +1000 Subject: [PATCH 21/56] xfs: xfs_rtget_summary can be static Fix sparse warning introduced by commit afabfd3 ("xfs: combine xfs_rtmodify_summary and xfs_rtget_summary"). Signed-off-by: Fengguang Wu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_rtalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index d1160cce5dad..d45aebe04dde 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -46,7 +46,7 @@ * Keeps track of a current summary block, so we don't keep reading * it from the buffer cache. */ -int +static int xfs_rtget_summary( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ From 02cc18764c753befcdc163d1bc668a6599a54585 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 23 Sep 2014 16:15:45 +1000 Subject: [PATCH 22/56] xfs: xfs_buf_write_fail_rl_state can be static Fix sparse warning introduced by commit ac8809f9 ("xfs: abort metadata writeback on permanent errors"). Signed-off-by: Fengguang Wu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 76007deed31f..30fa5db9aea8 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -501,7 +501,7 @@ xfs_buf_item_unpin( * buffer being bad.. */ -DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); +static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); STATIC uint xfs_buf_item_push( From 7abbb8f928e5b7cea1edd077131b2ace665c6712 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 23 Sep 2014 16:20:11 +1000 Subject: [PATCH 23/56] xfs: xfs_swap_extent_flush can be static Fix sparse warning introduced by commit 4ef897a ("xfs: flush both inodes in xfs_swap_extents"). Signed-off-by: Fengguang Wu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2f1e30d39a35..1cb345e7c181 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1618,7 +1618,7 @@ xfs_swap_extents_check_format( return 0; } -int +static int xfs_swap_extent_flush( struct xfs_inode *ip) { From 2ebff7bbd785c86e12956388b9e6f6bb8ea5d21e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 23 Sep 2014 22:55:00 +1000 Subject: [PATCH 24/56] xfs: flush entire last page of old EOF on truncate up On a sub-page sized filesystem, truncating a mapped region down leaves us in a world of hurt. We truncate the pagecache, zeroing the newly unused tail, then punch blocks out from under the page. If we then truncate the file back up immediately, we expose that unmapped hole to a dirty page mapped into the user application, and that's where it all goes wrong. In truncating the page cache, we avoid unmapping the tail page of the cache because it still contains valid data. The problem is that it also contains a hole after the truncate, but nobody told the mm subsystem that. Therefore, if the page is dirty before the truncate, we'll never get a .page_mkwrite callout after we extend the file and the application writes data into the hole on the page. Hence when we come to writing that region of the page, it has no blocks and no delayed allocation reservation and hence we toss the data away. This patch adds code to the truncate up case to solve it, by ensuring the partial page at the old EOF is always cleaned after we do any zeroing and move the EOF upwards. We can't actually serialise the page writeback and truncate against page faults (yes, that problem AGAIN) so this is really just a best effort and assumes it is extremely unlikely that someone is concurrently writing to the page at the EOF while extending the file. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 72129493e9d3..ec6dcdc181ee 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -849,6 +849,36 @@ xfs_setattr_size( return error; truncate_setsize(inode, newsize); + /* + * The "we can't serialise against page faults" pain gets worse. + * + * If the file is mapped then we have to clean the page at the old EOF + * when extending the file. Extending the file can expose changes the + * underlying page mapping (e.g. from beyond EOF to a hole or + * unwritten), and so on the next attempt to write to that page we need + * to remap it for write. i.e. we need .page_mkwrite() to be called. + * Hence we need to clean the page to clean the pte and so a new write + * fault will be triggered appropriately. + * + * If we do it before we change the inode size, then we can race with a + * page fault that maps the page with exactly the same problem. If we do + * it after we change the file size, then a new page fault can come in + * and allocate space before we've run the rest of the truncate + * transaction. That's kinda grotesque, but it's better than have data + * over a hole, and so that's the lesser evil that has been chosen here. + * + * The real solution, however, is to have some mechanism for locking out + * page faults while a truncate is in progress. + */ + if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) { + error = filemap_write_and_wait_range( + VFS_I(ip)->i_mapping, + round_down(oldsize, PAGE_CACHE_SIZE), + round_up(oldsize, PAGE_CACHE_SIZE) - 1); + if (error) + return error; + } + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) From eeb1168810d8a140f6834f8c4975f7bb3277d790 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Sep 2014 09:45:03 +1000 Subject: [PATCH 25/56] xfs: refactor xlog_recover_process_data() Clean up xlog_recover_process_data() structure in preparation for fixing the allocation and freeing context of the transaction being recovered. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 208 +++++++++++++++++++++++++-------------- 1 file changed, 132 insertions(+), 76 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1fd5787add99..8105b8571979 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3535,12 +3535,124 @@ out: } STATIC int -xlog_recover_unmount_trans( - struct xlog *log) +xlog_recovery_process_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + unsigned int len, + unsigned int flags, + int pass) { - /* Do nothing now */ - xfs_warn(log->l_mp, "%s: Unmount LR", __func__); - return 0; + int error = -EIO; + + /* mask off ophdr transaction container flags */ + flags &= ~XLOG_END_TRANS; + if (flags & XLOG_WAS_CONT_TRANS) + flags &= ~XLOG_CONTINUE_TRANS; + + switch (flags) { + /* expected flag values */ + case 0: + case XLOG_CONTINUE_TRANS: + error = xlog_recover_add_to_trans(log, trans, dp, len); + break; + case XLOG_WAS_CONT_TRANS: + error = xlog_recover_add_to_cont_trans(log, trans, dp, len); + break; + case XLOG_COMMIT_TRANS: + error = xlog_recover_commit_trans(log, trans, pass); + break; + + /* unexpected flag values */ + case XLOG_UNMOUNT_TRANS: + xfs_warn(log->l_mp, "%s: Unmount LR", __func__); + error = 0; /* just skip trans */ + break; + case XLOG_START_TRANS: + xfs_warn(log->l_mp, "%s: bad transaction", __func__); + ASSERT(0); + break; + default: + xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); + ASSERT(0); + break; + } + return error; +} + +STATIC struct xlog_recover * +xlog_recover_ophdr_to_trans( + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + struct xlog_op_header *ohead) +{ + struct xlog_recover *trans; + xlog_tid_t tid; + struct hlist_head *rhp; + + tid = be32_to_cpu(ohead->oh_tid); + rhp = &rhash[XLOG_RHASH(tid)]; + trans = xlog_recover_find_tid(rhp, tid); + if (trans) + return trans; + + /* + * If this is a new transaction, the ophdr only contains the + * start record. In that case, the only processing we need to do + * on this opheader is allocate a new recovery container to hold + * the recovery ops that will follow. + */ + if (ohead->oh_flags & XLOG_START_TRANS) { + ASSERT(be32_to_cpu(ohead->oh_len) == 0); + xlog_recover_new_tid(rhp, tid, be64_to_cpu(rhead->h_lsn)); + } + return NULL; +} + +STATIC int +xlog_recover_process_ophdr( + struct xlog *log, + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + struct xlog_op_header *ohead, + xfs_caddr_t dp, + xfs_caddr_t end, + int pass) +{ + struct xlog_recover *trans; + int error; + unsigned int len; + + /* Do we understand who wrote this op? */ + if (ohead->oh_clientid != XFS_TRANSACTION && + ohead->oh_clientid != XFS_LOG) { + xfs_warn(log->l_mp, "%s: bad clientid 0x%x", + __func__, ohead->oh_clientid); + ASSERT(0); + return -EIO; + } + + /* + * Check the ophdr contains all the data it is supposed to contain. + */ + len = be32_to_cpu(ohead->oh_len); + if (dp + len > end) { + xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); + WARN_ON(1); + return -EIO; + } + + trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); + if (!trans) { + /* nothing to do, so skip over this ophdr */ + return 0; + } + + error = xlog_recovery_process_trans(log, trans, dp, len, + ohead->oh_flags, pass); + if (error) + xlog_recover_free_trans(trans); + return error; } /* @@ -3560,86 +3672,30 @@ xlog_recover_process_data( xfs_caddr_t dp, int pass) { - xfs_caddr_t lp; + struct xlog_op_header *ohead; + xfs_caddr_t end; int num_logops; - xlog_op_header_t *ohead; - xlog_recover_t *trans; - xlog_tid_t tid; int error; - unsigned long hash; - uint flags; - lp = dp + be32_to_cpu(rhead->h_len); + end = dp + be32_to_cpu(rhead->h_len); num_logops = be32_to_cpu(rhead->h_num_logops); /* check the log format matches our own - else we can't recover */ if (xlog_header_check_recover(log->l_mp, rhead)) return -EIO; - while ((dp < lp) && num_logops) { - ASSERT(dp + sizeof(xlog_op_header_t) <= lp); - ohead = (xlog_op_header_t *)dp; - dp += sizeof(xlog_op_header_t); - if (ohead->oh_clientid != XFS_TRANSACTION && - ohead->oh_clientid != XFS_LOG) { - xfs_warn(log->l_mp, "%s: bad clientid 0x%x", - __func__, ohead->oh_clientid); - ASSERT(0); - return -EIO; - } - tid = be32_to_cpu(ohead->oh_tid); - hash = XLOG_RHASH(tid); - trans = xlog_recover_find_tid(&rhash[hash], tid); - if (trans == NULL) { /* not found; add new tid */ - if (ohead->oh_flags & XLOG_START_TRANS) - xlog_recover_new_tid(&rhash[hash], tid, - be64_to_cpu(rhead->h_lsn)); - } else { - if (dp + be32_to_cpu(ohead->oh_len) > lp) { - xfs_warn(log->l_mp, "%s: bad length 0x%x", - __func__, be32_to_cpu(ohead->oh_len)); - WARN_ON(1); - return -EIO; - } - flags = ohead->oh_flags & ~XLOG_END_TRANS; - if (flags & XLOG_WAS_CONT_TRANS) - flags &= ~XLOG_CONTINUE_TRANS; - switch (flags) { - case XLOG_COMMIT_TRANS: - error = xlog_recover_commit_trans(log, - trans, pass); - break; - case XLOG_UNMOUNT_TRANS: - error = xlog_recover_unmount_trans(log); - break; - case XLOG_WAS_CONT_TRANS: - error = xlog_recover_add_to_cont_trans(log, - trans, dp, - be32_to_cpu(ohead->oh_len)); - break; - case XLOG_START_TRANS: - xfs_warn(log->l_mp, "%s: bad transaction", - __func__); - ASSERT(0); - error = -EIO; - break; - case 0: - case XLOG_CONTINUE_TRANS: - error = xlog_recover_add_to_trans(log, trans, - dp, be32_to_cpu(ohead->oh_len)); - break; - default: - xfs_warn(log->l_mp, "%s: bad flag 0x%x", - __func__, flags); - ASSERT(0); - error = -EIO; - break; - } - if (error) { - xlog_recover_free_trans(trans); - return error; - } - } + while ((dp < end) && num_logops) { + + ohead = (struct xlog_op_header *)dp; + dp += sizeof(*ohead); + ASSERT(dp <= end); + + /* errors will abort recovery */ + error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, + dp, end, pass); + if (error) + return error; + dp += be32_to_cpu(ohead->oh_len); num_logops--; } From e9131e50f9d0a632e3011d73f283ba69be0cc682 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Sep 2014 09:45:18 +1000 Subject: [PATCH 26/56] xfs: recovery of XLOG_UNMOUNT_TRANS leaks memory The XLOG_UNMOUNT_TRANS case skips the transaction, despite the fact an unmount record is always in a standalone transaction. Hence whenever we come across one of these we need to free the transaction structure associated with it as there is no commit record that follows it. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8105b8571979..6d1c78378c31 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3534,6 +3534,9 @@ out: return error ? error : error2; } +/* + * On error or completion, trans is freed. + */ STATIC int xlog_recovery_process_trans( struct xlog *log, @@ -3543,7 +3546,8 @@ xlog_recovery_process_trans( unsigned int flags, int pass) { - int error = -EIO; + int error = 0; + bool freeit = false; /* mask off ophdr transaction container flags */ flags &= ~XLOG_END_TRANS; @@ -3565,18 +3569,19 @@ xlog_recovery_process_trans( /* unexpected flag values */ case XLOG_UNMOUNT_TRANS: + /* just skip trans */ xfs_warn(log->l_mp, "%s: Unmount LR", __func__); - error = 0; /* just skip trans */ + freeit = true; break; case XLOG_START_TRANS: - xfs_warn(log->l_mp, "%s: bad transaction", __func__); - ASSERT(0); - break; default: xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); ASSERT(0); + error = -EIO; break; } + if (error || freeit) + xlog_recover_free_trans(trans); return error; } @@ -3620,7 +3625,6 @@ xlog_recover_process_ophdr( int pass) { struct xlog_recover *trans; - int error; unsigned int len; /* Do we understand who wrote this op? */ @@ -3648,11 +3652,8 @@ xlog_recover_process_ophdr( return 0; } - error = xlog_recovery_process_trans(log, trans, dp, len, - ohead->oh_flags, pass); - if (error) - xlog_recover_free_trans(trans); - return error; + return xlog_recovery_process_trans(log, trans, dp, len, + ohead->oh_flags, pass); } /* From 88b863db97a18a04c90ebd57d84e1b7863114dcb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Sep 2014 09:45:32 +1000 Subject: [PATCH 27/56] xfs: fix double free in xlog_recover_commit_trans When an error occurs during buffer submission in xlog_recover_commit_trans(), we free the trans structure twice. Fix it by only freeing the structure in the caller regardless of the success or failure of the function. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 6d1c78378c31..89574ff2566e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3528,8 +3528,6 @@ out: if (!list_empty(&done_list)) list_splice_init(&done_list, &trans->r_itemq); - xlog_recover_free_trans(trans); - error2 = xfs_buf_delwri_submit(&buffer_list); return error ? error : error2; } @@ -3554,6 +3552,10 @@ xlog_recovery_process_trans( if (flags & XLOG_WAS_CONT_TRANS) flags &= ~XLOG_CONTINUE_TRANS; + /* + * Callees must not free the trans structure. We'll decide if we need to + * free it or not based on the operation being done and it's result. + */ switch (flags) { /* expected flag values */ case 0: @@ -3565,6 +3567,8 @@ xlog_recovery_process_trans( break; case XLOG_COMMIT_TRANS: error = xlog_recover_commit_trans(log, trans, pass); + /* success or fail, we are now done with this transaction. */ + freeit = true; break; /* unexpected flag values */ From 76560669868d3b4d650d91d9bf467a8d81171766 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Sep 2014 09:45:42 +1000 Subject: [PATCH 28/56] xfs: reorganise transaction recovery item code The code for managing transactions anf the items for recovery is spread across 3 different locations in the file. Move them all together so that it is easy to read the code without needing to jump long distances in the file. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 358 +++++++++++++++++++-------------------- 1 file changed, 179 insertions(+), 179 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 89574ff2566e..6af9e1a74155 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1445,160 +1445,6 @@ xlog_clear_stale_blocks( ****************************************************************************** */ -STATIC xlog_recover_t * -xlog_recover_find_tid( - struct hlist_head *head, - xlog_tid_t tid) -{ - xlog_recover_t *trans; - - hlist_for_each_entry(trans, head, r_list) { - if (trans->r_log_tid == tid) - return trans; - } - return NULL; -} - -STATIC void -xlog_recover_new_tid( - struct hlist_head *head, - xlog_tid_t tid, - xfs_lsn_t lsn) -{ - xlog_recover_t *trans; - - trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); - trans->r_log_tid = tid; - trans->r_lsn = lsn; - INIT_LIST_HEAD(&trans->r_itemq); - - INIT_HLIST_NODE(&trans->r_list); - hlist_add_head(&trans->r_list, head); -} - -STATIC void -xlog_recover_add_item( - struct list_head *head) -{ - xlog_recover_item_t *item; - - item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); - INIT_LIST_HEAD(&item->ri_list); - list_add_tail(&item->ri_list, head); -} - -STATIC int -xlog_recover_add_to_cont_trans( - struct xlog *log, - struct xlog_recover *trans, - xfs_caddr_t dp, - int len) -{ - xlog_recover_item_t *item; - xfs_caddr_t ptr, old_ptr; - int old_len; - - if (list_empty(&trans->r_itemq)) { - /* finish copying rest of trans header */ - xlog_recover_add_item(&trans->r_itemq); - ptr = (xfs_caddr_t) &trans->r_theader + - sizeof(xfs_trans_header_t) - len; - memcpy(ptr, dp, len); /* d, s, l */ - return 0; - } - /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); - - old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; - old_len = item->ri_buf[item->ri_cnt-1].i_len; - - ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); - memcpy(&ptr[old_len], dp, len); /* d, s, l */ - item->ri_buf[item->ri_cnt-1].i_len += len; - item->ri_buf[item->ri_cnt-1].i_addr = ptr; - trace_xfs_log_recover_item_add_cont(log, trans, item, 0); - return 0; -} - -/* - * The next region to add is the start of a new region. It could be - * a whole region or it could be the first part of a new region. Because - * of this, the assumption here is that the type and size fields of all - * format structures fit into the first 32 bits of the structure. - * - * This works because all regions must be 32 bit aligned. Therefore, we - * either have both fields or we have neither field. In the case we have - * neither field, the data part of the region is zero length. We only have - * a log_op_header and can throw away the header since a new one will appear - * later. If we have at least 4 bytes, then we can determine how many regions - * will appear in the current log item. - */ -STATIC int -xlog_recover_add_to_trans( - struct xlog *log, - struct xlog_recover *trans, - xfs_caddr_t dp, - int len) -{ - xfs_inode_log_format_t *in_f; /* any will do */ - xlog_recover_item_t *item; - xfs_caddr_t ptr; - - if (!len) - return 0; - if (list_empty(&trans->r_itemq)) { - /* we need to catch log corruptions here */ - if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { - xfs_warn(log->l_mp, "%s: bad header magic number", - __func__); - ASSERT(0); - return -EIO; - } - if (len == sizeof(xfs_trans_header_t)) - xlog_recover_add_item(&trans->r_itemq); - memcpy(&trans->r_theader, dp, len); /* d, s, l */ - return 0; - } - - ptr = kmem_alloc(len, KM_SLEEP); - memcpy(ptr, dp, len); - in_f = (xfs_inode_log_format_t *)ptr; - - /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); - if (item->ri_total != 0 && - item->ri_total == item->ri_cnt) { - /* tail item is in use, get a new one */ - xlog_recover_add_item(&trans->r_itemq); - item = list_entry(trans->r_itemq.prev, - xlog_recover_item_t, ri_list); - } - - if (item->ri_total == 0) { /* first region to be added */ - if (in_f->ilf_size == 0 || - in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { - xfs_warn(log->l_mp, - "bad number of regions (%d) in inode log format", - in_f->ilf_size); - ASSERT(0); - kmem_free(ptr); - return -EIO; - } - - item->ri_total = in_f->ilf_size; - item->ri_buf = - kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), - KM_SLEEP); - } - ASSERT(item->ri_total > item->ri_cnt); - /* Description region is ri_buf[0] */ - item->ri_buf[item->ri_cnt].i_addr = ptr; - item->ri_buf[item->ri_cnt].i_len = len; - item->ri_cnt++; - trace_xfs_log_recover_item_add(log, trans, item, 0); - return 0; -} - /* * Sort the log items in the transaction. * @@ -3254,31 +3100,6 @@ xlog_recover_do_icreate_pass2( return 0; } -/* - * Free up any resources allocated by the transaction - * - * Remember that EFIs, EFDs, and IUNLINKs are handled later. - */ -STATIC void -xlog_recover_free_trans( - struct xlog_recover *trans) -{ - xlog_recover_item_t *item, *n; - int i; - - list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { - /* Free the regions in the item. */ - list_del(&item->ri_list); - for (i = 0; i < item->ri_cnt; i++) - kmem_free(item->ri_buf[i].i_addr); - /* Free the item itself */ - kmem_free(item->ri_buf); - kmem_free(item); - } - /* Free the transaction recover structure */ - kmem_free(trans); -} - STATIC void xlog_recover_buffer_ra_pass2( struct xlog *log, @@ -3532,6 +3353,185 @@ out: return error ? error : error2; } + +STATIC xlog_recover_t * +xlog_recover_find_tid( + struct hlist_head *head, + xlog_tid_t tid) +{ + xlog_recover_t *trans; + + hlist_for_each_entry(trans, head, r_list) { + if (trans->r_log_tid == tid) + return trans; + } + return NULL; +} + +STATIC void +xlog_recover_new_tid( + struct hlist_head *head, + xlog_tid_t tid, + xfs_lsn_t lsn) +{ + xlog_recover_t *trans; + + trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = lsn; + INIT_LIST_HEAD(&trans->r_itemq); + + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, head); +} + +STATIC void +xlog_recover_add_item( + struct list_head *head) +{ + xlog_recover_item_t *item; + + item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); +} + +STATIC int +xlog_recover_add_to_cont_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + int len) +{ + xlog_recover_item_t *item; + xfs_caddr_t ptr, old_ptr; + int old_len; + + if (list_empty(&trans->r_itemq)) { + /* finish copying rest of trans header */ + xlog_recover_add_item(&trans->r_itemq); + ptr = (xfs_caddr_t) &trans->r_theader + + sizeof(xfs_trans_header_t) - len; + memcpy(ptr, dp, len); + return 0; + } + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + + old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; + old_len = item->ri_buf[item->ri_cnt-1].i_len; + + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); + memcpy(&ptr[old_len], dp, len); + item->ri_buf[item->ri_cnt-1].i_len += len; + item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); + return 0; +} + +/* + * The next region to add is the start of a new region. It could be + * a whole region or it could be the first part of a new region. Because + * of this, the assumption here is that the type and size fields of all + * format structures fit into the first 32 bits of the structure. + * + * This works because all regions must be 32 bit aligned. Therefore, we + * either have both fields or we have neither field. In the case we have + * neither field, the data part of the region is zero length. We only have + * a log_op_header and can throw away the header since a new one will appear + * later. If we have at least 4 bytes, then we can determine how many regions + * will appear in the current log item. + */ +STATIC int +xlog_recover_add_to_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + int len) +{ + xfs_inode_log_format_t *in_f; /* any will do */ + xlog_recover_item_t *item; + xfs_caddr_t ptr; + + if (!len) + return 0; + if (list_empty(&trans->r_itemq)) { + /* we need to catch log corruptions here */ + if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { + xfs_warn(log->l_mp, "%s: bad header magic number", + __func__); + ASSERT(0); + return -EIO; + } + if (len == sizeof(xfs_trans_header_t)) + xlog_recover_add_item(&trans->r_itemq); + memcpy(&trans->r_theader, dp, len); + return 0; + } + + ptr = kmem_alloc(len, KM_SLEEP); + memcpy(ptr, dp, len); + in_f = (xfs_inode_log_format_t *)ptr; + + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ + xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); + } + + if (item->ri_total == 0) { /* first region to be added */ + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xfs_warn(log->l_mp, + "bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + kmem_free(ptr); + return -EIO; + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); + } + ASSERT(item->ri_total > item->ri_cnt); + /* Description region is ri_buf[0] */ + item->ri_buf[item->ri_cnt].i_addr = ptr; + item->ri_buf[item->ri_cnt].i_len = len; + item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); + return 0; +} +/* + * Free up any resources allocated by the transaction + * + * Remember that EFIs, EFDs, and IUNLINKs are handled later. + */ +STATIC void +xlog_recover_free_trans( + struct xlog_recover *trans) +{ + xlog_recover_item_t *item, *n; + int i; + + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); + /* Free the item itself */ + kmem_free(item->ri_buf); + kmem_free(item); + } + /* Free the transaction recover structure */ + kmem_free(trans); +} + /* * On error or completion, trans is freed. */ From b818cca1976d1a01754033ac08724e05d07cce8f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Sep 2014 09:45:54 +1000 Subject: [PATCH 29/56] xfs: refactor recovery transaction start handling Rework the transaction lookup and allocation code in xlog_recovery_process_ophdr() to fold two related call-once helper functions into a single helper. Then fold in all the XLOG_START_TRANS logic to that helper to clean up the remaining logic in xlog_recovery_process_ophdr(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 77 ++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 6af9e1a74155..5019f52e4cc2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3353,38 +3353,6 @@ out: return error ? error : error2; } - -STATIC xlog_recover_t * -xlog_recover_find_tid( - struct hlist_head *head, - xlog_tid_t tid) -{ - xlog_recover_t *trans; - - hlist_for_each_entry(trans, head, r_list) { - if (trans->r_log_tid == tid) - return trans; - } - return NULL; -} - -STATIC void -xlog_recover_new_tid( - struct hlist_head *head, - xlog_tid_t tid, - xfs_lsn_t lsn) -{ - xlog_recover_t *trans; - - trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); - trans->r_log_tid = tid; - trans->r_lsn = lsn; - INIT_LIST_HEAD(&trans->r_itemq); - - INIT_HLIST_NODE(&trans->r_list); - hlist_add_head(&trans->r_list, head); -} - STATIC void xlog_recover_add_item( struct list_head *head) @@ -3507,6 +3475,7 @@ xlog_recover_add_to_trans( trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; } + /* * Free up any resources allocated by the transaction * @@ -3589,6 +3558,13 @@ xlog_recovery_process_trans( return error; } +/* + * Lookup the transaction recovery structure associated with the ID in the + * current ophdr. If the transaction doesn't exist and the start flag is set in + * the ophdr, then allocate a new transaction for future ID matches to find. + * Either way, return what we found during the lookup - an existing transaction + * or nothing. + */ STATIC struct xlog_recover * xlog_recover_ophdr_to_trans( struct hlist_head rhash[], @@ -3601,20 +3577,35 @@ xlog_recover_ophdr_to_trans( tid = be32_to_cpu(ohead->oh_tid); rhp = &rhash[XLOG_RHASH(tid)]; - trans = xlog_recover_find_tid(rhp, tid); - if (trans) - return trans; + hlist_for_each_entry(trans, rhp, r_list) { + if (trans->r_log_tid == tid) + return trans; + } /* - * If this is a new transaction, the ophdr only contains the - * start record. In that case, the only processing we need to do - * on this opheader is allocate a new recovery container to hold - * the recovery ops that will follow. + * skip over non-start transaction headers - we could be + * processing slack space before the next transaction starts + */ + if (!(ohead->oh_flags & XLOG_START_TRANS)) + return NULL; + + ASSERT(be32_to_cpu(ohead->oh_len) == 0); + + /* + * This is a new transaction so allocate a new recovery container to + * hold the recovery ops that will follow. + */ + trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = be64_to_cpu(rhead->h_lsn); + INIT_LIST_HEAD(&trans->r_itemq); + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, rhp); + + /* + * Nothing more to do for this ophdr. Items to be added to this new + * transaction will be in subsequent ophdr containers. */ - if (ohead->oh_flags & XLOG_START_TRANS) { - ASSERT(be32_to_cpu(ohead->oh_len) == 0); - xlog_recover_new_tid(rhp, tid, be64_to_cpu(rhead->h_lsn)); - } return NULL; } From e68ed77521f695d165cbae070f6dda8a4778438f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Sep 2014 10:43:15 +1000 Subject: [PATCH 30/56] xfs: fix use of agi_newino in finobt lookup Sparse warns that we are passing the big-endian valueo f agi_newino to the initial btree lookup function when trying to find a new inode. This is wrong - we need to pass the host order value, not the disk order value. This will adversely affect the next inode allocated, but given that the free inode btree is usually much smaller than the allocated inode btree it is much less likely to be a performance issue if we start the search in the wrong place. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_ialloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d213a2eae95e..23dcb72fc5e6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1076,8 +1076,8 @@ xfs_dialloc_ag_finobt_newino( int i; if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { - error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ, - &i); + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); if (error) return error; if (i == 1) { @@ -1085,7 +1085,6 @@ xfs_dialloc_ag_finobt_newino( if (error) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); - return 0; } } From bf1ed3833078e3bb0ba8cd03468090b9359d0912 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Sep 2014 10:43:40 +1000 Subject: [PATCH 31/56] xfs: xfs_qm_dquot_isolate needs locking annotations for sparse To remove noise from the build. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_qm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 10232102b4a6..d68f23021af3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -434,6 +434,7 @@ xfs_qm_dquot_isolate( struct list_head *item, spinlock_t *lru_lock, void *arg) + __releases(lru_lock) __acquires(lru_lock) { struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); From e3aed1a08190c038c4ea41b73ea6f07bc0e3290c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Sep 2014 10:46:08 +1000 Subject: [PATCH 32/56] xfs: xfs_kset should be static As it is accessed through the struct xfs_mount and can be set up entirely from fs/xfs/xfs_super.c Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 3 --- fs/xfs/xfs_super.c | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fbf0384a466f..d36bdbc9eeb2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -61,8 +61,6 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; -extern struct kset *xfs_kset; - /* * See if the UUID is unique among mounted XFS filesystems. * Mount fails if UUID is nil or a FS with the same UUID is already mounted. @@ -729,7 +727,6 @@ xfs_mountfs( xfs_set_maxicount(mp); - mp->m_kobj.kobject.kset = xfs_kset; error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) goto out; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index dcd4b93dccdc..9f622feda6a4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -63,7 +63,7 @@ static const struct super_operations xfs_super_operations; static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; -struct kset *xfs_kset; /* top-level xfs sysfs dir */ +static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif @@ -1411,6 +1411,7 @@ xfs_fs_fill_super( atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + mp->m_kobj.kobject.kset = xfs_kset; mp->m_super = sb; sb->s_fs_info = mp; From b972d0797180d8414351d9dc8ff65071c692d058 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 29 Sep 2014 10:46:22 +1000 Subject: [PATCH 33/56] xfs: annotate user variables passed as void Some argument callbacks can contain user buffers, and sparse warns about passing them as void pointers. Cast appropriately to remove the sparse warnings. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3799695b9249..7a6b406ce6c9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1349,7 +1349,7 @@ xfs_ioc_setxflags( STATIC int xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) { - struct getbmap __user *base = *ap; + struct getbmap __user *base = (struct getbmap __user *)*ap; /* copy only getbmap portion (not getbmapx) */ if (copy_to_user(base, bmv, sizeof(struct getbmap))) @@ -1380,7 +1380,7 @@ xfs_ioc_getbmap( bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, - (struct getbmap *)arg+1); + (__force struct getbmap *)arg+1); if (error) return error; @@ -1393,7 +1393,7 @@ xfs_ioc_getbmap( STATIC int xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) { - struct getbmapx __user *base = *ap; + struct getbmapx __user *base = (struct getbmapx __user *)*ap; if (copy_to_user(base, bmv, sizeof(struct getbmapx))) return -EFAULT; @@ -1420,7 +1420,7 @@ xfs_ioc_getbmapx( return -EINVAL; error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, - (struct getbmapx *)arg+1); + (__force struct getbmapx *)arg+1); if (error) return error; From a870fe6dfaba1cc67424cde4cfd2cd3eee62bf35 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:02:28 +1000 Subject: [PATCH 34/56] xfs: force the log before shutting down When we have marked the filesystem for shutdown, we want to prevent any further buffer IO from being submitted. However, we currently force the log after marking the filesystem as shut down, hence allowing IO to the log *after* we have marked both the filesystem and the log as in an error state. Clean this up by forcing the log before we mark the filesytem with an error. This replaces the pure CIL flush that we currently have which works around this same issue (i.e the CIL can't be flushed once the shutdown flags are set) and hence enables us to clean up the logic substantially. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 55 +++++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ca4fd5bd8522..85f36f212641 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3867,18 +3867,17 @@ xlog_state_ioerror( * This is called from xfs_force_shutdown, when we're forcibly * shutting down the filesystem, typically because of an IO error. * Our main objectives here are to make sure that: - * a. the filesystem gets marked 'SHUTDOWN' for all interested + * a. if !logerror, flush the logs to disk. Anything modified + * after this is ignored. + * b. the filesystem gets marked 'SHUTDOWN' for all interested * parties to find out, 'atomically'. - * b. those who're sleeping on log reservations, pinned objects and + * c. those who're sleeping on log reservations, pinned objects and * other resources get woken up, and be told the bad news. - * c. nothing new gets queued up after (a) and (b) are done. - * d. if !logerror, flush the iclogs to disk, then seal them off - * for business. + * d. nothing new gets queued up after (b) and (c) are done. * - * Note: for delayed logging the !logerror case needs to flush the regions - * held in memory out to the iclogs before flushing them to disk. This needs - * to be done before the log is marked as shutdown, otherwise the flush to the - * iclogs will fail. + * Note: for the !logerror case we need to flush the regions held in memory out + * to disk first. This needs to be done before the log is marked as shutdown, + * otherwise the iclog writes will fail. */ int xfs_log_force_umount( @@ -3910,16 +3909,16 @@ xfs_log_force_umount( ASSERT(XLOG_FORCED_SHUTDOWN(log)); return 1; } - retval = 0; /* - * Flush the in memory commit item list before marking the log as - * being shut down. We need to do it in this order to ensure all the - * completed transactions are flushed to disk with the xfs_log_force() - * call below. + * Flush all the completed transactions to disk before marking the log + * being shut down. We need to do it in this order to ensure that + * completed operations are safely on disk before we shut down, and that + * we don't have to issue any buffer IO after the shutdown flags are set + * to guarantee this. */ if (!logerror) - xlog_cil_force(log); + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); /* * mark the filesystem and the as in a shutdown state and wake @@ -3931,18 +3930,11 @@ xfs_log_force_umount( XFS_BUF_DONE(mp->m_sb_bp); /* - * This flag is sort of redundant because of the mount flag, but - * it's good to maintain the separation between the log and the rest - * of XFS. + * Mark the log and the iclogs with IO error flags to prevent any + * further log IO from being issued or completed. */ log->l_flags |= XLOG_IO_ERROR; - - /* - * If we hit a log error, we want to mark all the iclogs IOERROR - * while we're still holding the loglock. - */ - if (logerror) - retval = xlog_state_ioerror(log); + retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); /* @@ -3955,19 +3947,6 @@ xfs_log_force_umount( xlog_grant_head_wake_all(&log->l_reserve_head); xlog_grant_head_wake_all(&log->l_write_head); - if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { - ASSERT(!logerror); - /* - * Force the incore logs to disk before shutting the - * log down completely. - */ - _xfs_log_force(mp, XFS_LOG_SYNC, NULL); - - spin_lock(&log->l_icloglock); - retval = xlog_state_ioerror(log); - spin_unlock(&log->l_icloglock); - } - /* * Wake up everybody waiting on xfs_log_force. Wake the CIL push first * as if the log writes were completed. The abort handling in the log From cf53e99d192171a58791136d33fd3fea5d8bab35 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:01 +1000 Subject: [PATCH 35/56] xfs: Don't use xfs_buf_iowait in the delwri buffer code For the special case of delwri buffer submission and waiting, we don't need to issue IO synchronously at all. The second pass to call xfs_buf_iowait() can be replaced with blocking on xfs_buf_lock() - the buffer will be unlocked when the async IO is complete. This formalises a sane the method of waiting for async IO - take an extra reference, submit the IO, call xfs_buf_lock() when you want to wait for IO completion. i.e.: bp = xfs_buf_find(); xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_iosubmit(bp); xfs_buf_lock(bp) error = bp->b_error; .... xfs_buf_relse(bp); While this is somewhat racy for gathering IO errors, none of the code that calls xfs_buf_delwri_submit() will race against other users of the buffers being submitted. Even if they do, we don't really care if the error is detected by the delwri code or the user we raced against. Either way, the error will be detected and handled. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cd7b8ca9b064..9dc4c2223035 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1813,12 +1813,17 @@ __xfs_buf_delwri_submit( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, io_list, b_list) { bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); - bp->b_flags |= XBF_WRITE; + bp->b_flags |= XBF_WRITE | XBF_ASYNC; - if (!wait) { - bp->b_flags |= XBF_ASYNC; + /* + * we do all Io submission async. This means if we need to wait + * for IO completion we need to take an extra reference so the + * buffer is still valid on the other side. + */ + if (wait) + xfs_buf_hold(bp); + else list_del_init(&bp->b_list); - } xfs_bdstrat_cb(bp); } blk_finish_plug(&plug); @@ -1866,7 +1871,10 @@ xfs_buf_delwri_submit( bp = list_first_entry(&io_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); - error2 = xfs_buf_iowait(bp); + + /* locking the buffer will wait for async IO completion. */ + xfs_buf_lock(bp); + error2 = bp->b_error; xfs_buf_relse(bp); if (!error) error = error2; From e11bb8052c3f500e66142f33579cc054d691a8fb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:11 +1000 Subject: [PATCH 36/56] xfs: synchronous buffer IO needs a reference When synchronous IO runs IO completion work, it does so without an IO reference or a hold reference on the buffer. The IO "hold reference" is owned by the submitter, and released when the submission is complete. The IO reference is released when both the submitter and the bio end_io processing is run, and so if the io completion work is run from IO completion context, it is run without an IO reference. Hence we can get the situation where the submitter can submit the IO, see an error on the buffer and unlock and free the buffer while there is still IO in progress. This leads to use-after-free and memory corruption. Fix this by taking a "sync IO hold" reference that is owned by the IO and not released until after the buffer completion calls are run to wake up synchronous waiters. This means that the buffer will not be freed in any circumstance until all IO processing is completed. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 53 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9dc4c2223035..48b1e2989ea4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1019,6 +1019,9 @@ xfs_buf_iodone_work( else { ASSERT(read && bp->b_ops); complete(&bp->b_iowait); + + /* release the !XBF_ASYNC ref now we are done. */ + xfs_buf_rele(bp); } } @@ -1044,6 +1047,7 @@ xfs_buf_ioend( } else { bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); complete(&bp->b_iowait); + xfs_buf_rele(bp); } } @@ -1086,8 +1090,11 @@ xfs_bioerror( xfs_buf_ioerror(bp, -EIO); /* - * We're calling xfs_buf_ioend, so delete XBF_DONE flag. + * We're calling xfs_buf_ioend, so delete XBF_DONE flag. For + * sync IO, xfs_buf_ioend is going to remove a ref here. */ + if (!(bp->b_flags & XBF_ASYNC)) + xfs_buf_hold(bp); XFS_BUF_UNREAD(bp); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); @@ -1383,22 +1390,48 @@ xfs_buf_iorequest( if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); - xfs_buf_hold(bp); /* - * Set the count to 1 initially, this will stop an I/O - * completion callout which happens before we have started - * all the I/O from calling xfs_buf_ioend too early. + * Take references to the buffer. For XBF_ASYNC buffers, holding a + * reference for as long as submission takes is all that is necessary + * here. The IO inherits the lock and hold count from the submitter, + * and these are release during IO completion processing. Taking a hold + * over submission ensures that the buffer is not freed until we have + * completed all processing, regardless of when IO errors occur or are + * reported. + * + * However, for synchronous IO, the IO does not inherit the submitters + * reference count, nor the buffer lock. Hence we need to take an extra + * reference to the buffer for the for the IO context so that we can + * guarantee the buffer is not freed until all IO completion processing + * is done. Otherwise the caller can drop their reference while the IO + * is still in progress and hence trigger a use-after-free situation. + */ + xfs_buf_hold(bp); + if (!(bp->b_flags & XBF_ASYNC)) + xfs_buf_hold(bp); + + + /* + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); + /* - * If _xfs_buf_ioapply failed, we'll get back here with - * only the reference we took above. _xfs_buf_ioend will - * drop it to zero, so we'd better not queue it for later, - * or we'll free it before it's done. + * If _xfs_buf_ioapply failed or we are doing synchronous IO that + * completes extremely quickly, we can get back here with only the IO + * reference we took above. _xfs_buf_ioend will drop it to zero. Run + * completion processing synchronously so that we don't return to the + * caller with completion still pending. This avoids unnecessary context + * switches associated with the end_io workqueue. */ - _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); + if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) + _xfs_buf_ioend(bp, 0); + else + _xfs_buf_ioend(bp, 1); xfs_buf_rele(bp); } From e8aaba9a783c8e5d2c58ebe69650ea31b91bb745 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:22 +1000 Subject: [PATCH 37/56] xfs: xfs_buf_ioend and xfs_buf_iodone_work duplicate functionality We do some work in xfs_buf_ioend, and some work in xfs_buf_iodone_work, but much of that functionality is the same. This work can all be done in a single function, leaving xfs_buf_iodone just a wrapper to determine if we should execute it by workqueue or directly. hence rename xfs_buf_iodone_work to xfs_buf_ioend(), and add a new xfs_buf_ioend_async() for places that need async processing. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 88 ++++++++++++++++++---------------------- fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_buf_item.c | 4 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- 6 files changed, 45 insertions(+), 55 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 48b1e2989ea4..a046149e6099 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -998,26 +998,30 @@ xfs_buf_wait_unpin( * Buffer Utility Routines */ -STATIC void -xfs_buf_iodone_work( - struct work_struct *work) +void +xfs_buf_ioend( + struct xfs_buf *bp) { - struct xfs_buf *bp = - container_of(work, xfs_buf_t, b_iodone_work); - bool read = !!(bp->b_flags & XBF_READ); + bool read = bp->b_flags & XBF_READ; + + trace_xfs_buf_iodone(bp, _RET_IP_); bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - /* only validate buffers that were read without errors */ - if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) + /* Only validate buffers that were read without errors */ + if (read && !bp->b_error && bp->b_ops) { + ASSERT(!bp->b_iodone); bp->b_ops->verify_read(bp); + } + + if (!bp->b_error) + bp->b_flags |= XBF_DONE; if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); else { - ASSERT(read && bp->b_ops); complete(&bp->b_iowait); /* release the !XBF_ASYNC ref now we are done. */ @@ -1025,30 +1029,22 @@ xfs_buf_iodone_work( } } -void -xfs_buf_ioend( - struct xfs_buf *bp, - int schedule) +static void +xfs_buf_ioend_work( + struct work_struct *work) { - bool read = !!(bp->b_flags & XBF_READ); + struct xfs_buf *bp = + container_of(work, xfs_buf_t, b_iodone_work); - trace_xfs_buf_iodone(bp, _RET_IP_); + xfs_buf_ioend(bp); +} - if (bp->b_error == 0) - bp->b_flags |= XBF_DONE; - - if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { - if (schedule) { - INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); - queue_work(xfslogd_workqueue, &bp->b_iodone_work); - } else { - xfs_buf_iodone_work(&bp->b_iodone_work); - } - } else { - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - complete(&bp->b_iowait); - xfs_buf_rele(bp); - } +void +xfs_buf_ioend_async( + struct xfs_buf *bp) +{ + INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work); + queue_work(xfslogd_workqueue, &bp->b_iodone_work); } void @@ -1099,7 +1095,7 @@ xfs_bioerror( XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); return -EIO; } @@ -1185,15 +1181,6 @@ xfs_bwrite( return error; } -STATIC void -_xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) -{ - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) - xfs_buf_ioend(bp, schedule); -} - STATIC void xfs_buf_bio_end_io( struct bio *bio, @@ -1211,7 +1198,8 @@ xfs_buf_bio_end_io( if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); - _xfs_buf_ioend(bp, 1); + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend_async(bp); bio_put(bio); } @@ -1423,15 +1411,17 @@ xfs_buf_iorequest( /* * If _xfs_buf_ioapply failed or we are doing synchronous IO that * completes extremely quickly, we can get back here with only the IO - * reference we took above. _xfs_buf_ioend will drop it to zero. Run - * completion processing synchronously so that we don't return to the - * caller with completion still pending. This avoids unnecessary context - * switches associated with the end_io workqueue. + * reference we took above. If we drop it to zero, run completion + * processing synchronously so that we don't return to the caller with + * completion still pending. This avoids unnecessary context switches + * associated with the end_io workqueue. */ - if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) - _xfs_buf_ioend(bp, 0); - else - _xfs_buf_ioend(bp, 1); + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) + xfs_buf_ioend(bp); + else + xfs_buf_ioend_async(bp); + } xfs_buf_rele(bp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c753183900b3..4585c1595a98 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -286,7 +286,7 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); -extern void xfs_buf_ioend(xfs_buf_t *, int); +extern void xfs_buf_ioend(struct xfs_buf *bp); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); extern void xfs_buf_iorequest(xfs_buf_t *); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 76007deed31f..4fd41b58e6d2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -491,7 +491,7 @@ xfs_buf_item_unpin( xfs_buf_ioerror(bp, -EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } } @@ -1115,7 +1115,7 @@ do_callbacks: xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index fea3c92fb3f0..00d210bbf3c3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3056,7 +3056,7 @@ cluster_corrupt_out: XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } else { xfs_buf_stale(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 85f36f212641..3567396f4428 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1678,7 +1678,7 @@ xlog_bdstrat( if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, -EIO); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1fd5787add99..4ba19bf7da1f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -383,7 +383,7 @@ xlog_recover_iodone( SHUTDOWN_META_IO_ERROR); } bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } /* From 61be9c529a4a715ab8679e9ca82bc3790c7ab66c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:31 +1000 Subject: [PATCH 38/56] xfs: rework xfs_buf_bio_endio error handling Currently the report of a bio error from completion immediately marks the buffer with an error. The issue is that this is racy w.r.t. synchronous IO - the submitter can see b_error being set before the IO is complete, and hence we cannot differentiate between submission failures and completion failures. Add an internal b_io_error field protected by the b_lock to catch IO completion errors, and only propagate that to the buffer during final IO completion handling. Hence we can tell in xfs_buf_iorequest if we've had a submission failure bey checking bp->b_error before dropping our b_io_remaining reference - that reference will prevent b_io_error values from being propagated to b_error in the event that completion races with submission. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 18 ++++++++++++++++-- fs/xfs/xfs_buf.h | 1 + 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a046149e6099..170d6c0afe71 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1008,6 +1008,13 @@ xfs_buf_ioend( bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + /* + * Pull in IO completion errors now. We are guaranteed to be running + * single threaded, so we don't need the lock to read b_io_error. + */ + if (!bp->b_error && bp->b_io_error) + xfs_buf_ioerror(bp, bp->b_io_error); + /* Only validate buffers that were read without errors */ if (read && !bp->b_error && bp->b_ops) { ASSERT(!bp->b_iodone); @@ -1192,8 +1199,12 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (!bp->b_error) - xfs_buf_ioerror(bp, error); + if (error) { + spin_lock(&bp->b_lock); + if (!bp->b_io_error) + bp->b_io_error = error; + spin_unlock(&bp->b_lock); + } if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -1379,6 +1390,9 @@ xfs_buf_iorequest( if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + /* * Take references to the buffer. For XBF_ASYNC buffers, holding a * reference for as long as submission takes is all that is necessary diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 4585c1595a98..44db8cd67bda 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -158,6 +158,7 @@ typedef struct xfs_buf { struct list_head b_lru; /* lru list */ spinlock_t b_lock; /* internal state lock */ unsigned int b_state; /* internal state flags */ + int b_io_error; /* internal IO error state */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ From 8dac39219827113f14e97507646a610ca426b69e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:40 +1000 Subject: [PATCH 39/56] xfs: kill xfs_bdstrat_cb Only has two callers, and is just a shutdown check and error handler around xfs_buf_iorequest. However, the error handling is a mess of read and write semantics, and both internal callers only call it for writes. Hence kill the wrapper, and follow up with a patch to sanitise the error handling. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 170d6c0afe71..0eee0f1f099e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1146,27 +1146,6 @@ xfs_bioerror_relse( return -EIO; } -STATIC int -xfs_bdstrat_cb( - struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - /* - * Metadata write that didn't get logged but - * written delayed anyway. These aren't associated - * with a transaction, and can be ignored. - */ - if (!bp->b_iodone && !XFS_BUF_ISREAD(bp)) - return xfs_bioerror_relse(bp); - else - return xfs_bioerror(bp); - } - - xfs_buf_iorequest(bp); - return 0; -} - int xfs_bwrite( struct xfs_buf *bp) @@ -1178,7 +1157,20 @@ xfs_bwrite( bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); - xfs_bdstrat_cb(bp); + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + + /* + * Metadata write that didn't get logged but written anyway. + * These aren't associated with a transaction, and can be + * ignored. + */ + if (!bp->b_iodone) + return xfs_bioerror_relse(bp); + return xfs_bioerror(bp); + } + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { @@ -1861,7 +1853,17 @@ __xfs_buf_delwri_submit( xfs_buf_hold(bp); else list_del_init(&bp->b_list); - xfs_bdstrat_cb(bp); + + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + + if (!bp->b_iodone) + xfs_bioerror_relse(bp); + else + xfs_bioerror(bp); + continue; + } + xfs_buf_iorequest(bp); } blk_finish_plug(&plug); From 2718775469a521c8b35442db5d665ac0c8c3c8ac Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:04:56 +1000 Subject: [PATCH 40/56] xfs: xfs_bioerror can die. Internal buffer write error handling is a mess due to the unnatural split between xfs_bioerror and xfs_bioerror_relse(). xfs_bwrite() only does sync IO and determines the handler to call based on b_iodone, so for this caller the only difference between xfs_bioerror() and xfs_bioerror_release() is the XBF_DONE flag. We don't care what the XBF_DONE flag state is because we stale the buffer in both paths - the next buffer lookup will clear XBF_DONE because XBF_STALE is set. Hence we can use common error handling for xfs_bwrite(). __xfs_buf_delwri_submit() is a similar - it's only ever called on writes - all sync or async - and again there's no reason to handle them any differently at all. Clean up the nasty error handling and remove xfs_bioerror(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 58 ++++++++++-------------------------------------- 1 file changed, 12 insertions(+), 46 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0eee0f1f099e..409a8a06642a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1074,39 +1074,6 @@ xfs_buf_ioerror_alert( (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); } -/* - * Called when we want to stop a buffer from getting written or read. - * We attach the EIO error, muck with its flags, and call xfs_buf_ioend - * so that the proper iodone callbacks get called. - */ -STATIC int -xfs_bioerror( - xfs_buf_t *bp) -{ -#ifdef XFSERRORDEBUG - ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); -#endif - - /* - * No need to wait until the buffer is unpinned, we aren't flushing it. - */ - xfs_buf_ioerror(bp, -EIO); - - /* - * We're calling xfs_buf_ioend, so delete XBF_DONE flag. For - * sync IO, xfs_buf_ioend is going to remove a ref here. - */ - if (!(bp->b_flags & XBF_ASYNC)) - xfs_buf_hold(bp); - XFS_BUF_UNREAD(bp); - XFS_BUF_UNDONE(bp); - xfs_buf_stale(bp); - - xfs_buf_ioend(bp); - - return -EIO; -} - /* * Same as xfs_bioerror, except that we are releasing the buffer * here ourselves, and avoiding the xfs_buf_ioend call. @@ -1155,19 +1122,19 @@ xfs_bwrite( ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | + XBF_WRITE_FAIL | XBF_DONE); if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { trace_xfs_bdstrat_shut(bp, _RET_IP_); - /* - * Metadata write that didn't get logged but written anyway. - * These aren't associated with a transaction, and can be - * ignored. - */ - if (!bp->b_iodone) - return xfs_bioerror_relse(bp); - return xfs_bioerror(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + + /* sync IO, xfs_buf_ioend is going to remove a ref here */ + xfs_buf_hold(bp); + xfs_buf_ioend(bp); + return -EIO; } xfs_buf_iorequest(bp); @@ -1857,10 +1824,9 @@ __xfs_buf_delwri_submit( if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { trace_xfs_bdstrat_shut(bp, _RET_IP_); - if (!bp->b_iodone) - xfs_bioerror_relse(bp); - else - xfs_bioerror(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + xfs_buf_ioend(bp); continue; } xfs_buf_iorequest(bp); From 8b131973d1628f1a0c5a36fe02269d696bbe60a3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:05:05 +1000 Subject: [PATCH 41/56] xfs: kill xfs_bioerror_relse There is only one caller now - xfs_trans_read_buf_map() - and it has very well defined call semantics - read, synchronous, and b_iodone is NULL. Hence it's pretty clear what error handling is necessary for this case. The bigger problem of untangling xfs_trans_read_buf_map error handling is left to a future patch. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 39 --------------------------------------- fs/xfs/xfs_buf.h | 2 -- fs/xfs/xfs_trans_buf.c | 9 ++++++--- 3 files changed, 6 insertions(+), 44 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 409a8a06642a..108eba7ad5c1 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1074,45 +1074,6 @@ xfs_buf_ioerror_alert( (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); } -/* - * Same as xfs_bioerror, except that we are releasing the buffer - * here ourselves, and avoiding the xfs_buf_ioend call. - * This is meant for userdata errors; metadata bufs come with - * iodone functions attached, so that we can track down errors. - */ -int -xfs_bioerror_relse( - struct xfs_buf *bp) -{ - int64_t fl = bp->b_flags; - /* - * No need to wait until the buffer is unpinned. - * We aren't flushing it. - * - * chunkhold expects B_DONE to be set, whether - * we actually finish the I/O or not. We don't want to - * change that interface. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_DONE(bp); - xfs_buf_stale(bp); - bp->b_iodone = NULL; - if (!(fl & XBF_ASYNC)) { - /* - * Mark b_error and B_ERROR _both_. - * Lot's of chunkcache code assumes that. - * There's no reason to mark error for - * ASYNC buffers. - */ - xfs_buf_ioerror(bp, -EIO); - complete(&bp->b_iowait); - } else { - xfs_buf_relse(bp); - } - - return -EIO; -} - int xfs_bwrite( struct xfs_buf *bp) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 44db8cd67bda..d8f57f654f92 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -297,8 +297,6 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, #define xfs_buf_zero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) -extern int xfs_bioerror_relse(struct xfs_buf *); - /* Buffer Utility Routines */ extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 96c898e7ac9a..db4be5b1d732 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -324,11 +324,14 @@ xfs_trans_read_buf_map( */ if (XFS_FORCED_SHUTDOWN(mp)) { trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); - } else { - xfs_buf_iorequest(bp); + bp->b_flags &= ~(XBF_READ | XBF_DONE); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + xfs_buf_relse(bp); + return -EIO; } + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); From 595bff75dce51e0d6d94877b4b6d11b4747a63fd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:05:14 +1000 Subject: [PATCH 42/56] xfs: introduce xfs_buf_submit[_wait] There is a lot of cookie-cutter code that looks like: if (shutdown) handle buffer error xfs_buf_iorequest(bp) error = xfs_buf_iowait(bp) if (error) handle buffer error spread through XFS. There's significant complexity now in xfs_buf_iorequest() to specifically handle this sort of synchronous IO pattern, but there's all sorts of nasty surprises in different error handling code dependent on who owns the buffer references and the locks. Pull this pattern into a single helper, where we can hide all the synchronous IO warts and hence make the error handling for all the callers much saner. This removes the need for a special extra reference to protect IO completion processing, as we can now hold a single reference across dispatch and waiting, simplifying the sync IO smeantics and error handling. In doing this, also rename xfs_buf_iorequest to xfs_buf_submit and make it explicitly handle on asynchronous IO. This forces all users to be switched specifically to one interface or the other and removes any ambiguity between how the interfaces are to be used. It also means that xfs_buf_iowait() goes away. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 14 +--- fs/xfs/xfs_buf.c | 171 +++++++++++++++++++++------------------ fs/xfs/xfs_buf.h | 4 +- fs/xfs/xfs_buf_item.c | 4 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 30 +++---- fs/xfs/xfs_trace.h | 3 +- fs/xfs/xfs_trans_buf.c | 19 +---- 8 files changed, 118 insertions(+), 129 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2f1e30d39a35..c53cc036d6e7 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1157,12 +1157,7 @@ xfs_zero_remaining_bytes( XFS_BUF_READ(bp); XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - break; - } - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { xfs_buf_ioerror_alert(bp, "xfs_zero_remaining_bytes(read)"); @@ -1175,12 +1170,7 @@ xfs_zero_remaining_bytes( XFS_BUF_UNREAD(bp); XFS_BUF_WRITE(bp); - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - break; - } - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { xfs_buf_ioerror_alert(bp, "xfs_zero_remaining_bytes(write)"); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 108eba7ad5c1..d99ec8335750 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -623,10 +623,11 @@ _xfs_buf_read( bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - xfs_buf_iorequest(bp); - if (flags & XBF_ASYNC) + if (flags & XBF_ASYNC) { + xfs_buf_submit(bp); return 0; - return xfs_buf_iowait(bp); + } + return xfs_buf_submit_wait(bp); } xfs_buf_t * @@ -708,12 +709,7 @@ xfs_buf_read_uncached( bp->b_flags |= XBF_READ; bp->b_ops = ops; - if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { - xfs_buf_relse(bp); - return NULL; - } - xfs_buf_iorequest(bp); - xfs_buf_iowait(bp); + xfs_buf_submit_wait(bp); return bp; } @@ -1028,12 +1024,8 @@ xfs_buf_ioend( (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); - else { + else complete(&bp->b_iowait); - - /* release the !XBF_ASYNC ref now we are done. */ - xfs_buf_rele(bp); - } } static void @@ -1086,21 +1078,7 @@ xfs_bwrite( bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL | XBF_DONE); - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - - xfs_buf_ioerror(bp, -EIO); - xfs_buf_stale(bp); - - /* sync IO, xfs_buf_ioend is going to remove a ref here */ - xfs_buf_hold(bp); - xfs_buf_ioend(bp); - return -EIO; - } - - xfs_buf_iorequest(bp); - - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); @@ -1209,7 +1187,7 @@ next_chunk: } else { /* * This is guaranteed not to be the last io reference count - * because the caller (xfs_buf_iorequest) holds a count itself. + * because the caller (xfs_buf_submit) holds a count itself. */ atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, -EIO); @@ -1299,13 +1277,29 @@ _xfs_buf_ioapply( blk_finish_plug(&plug); } +/* + * Asynchronous IO submission path. This transfers the buffer lock ownership and + * the current reference to the IO. It is not safe to reference the buffer after + * a call to this function unless the caller holds an additional reference + * itself. + */ void -xfs_buf_iorequest( - xfs_buf_t *bp) +xfs_buf_submit( + struct xfs_buf *bp) { - trace_xfs_buf_iorequest(bp, _RET_IP_); + trace_xfs_buf_submit(bp, _RET_IP_); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + ASSERT(bp->b_flags & XBF_ASYNC); + + /* on shutdown we stale and complete the buffer immediately */ + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioend(bp); + return; + } if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); @@ -1314,25 +1308,14 @@ xfs_buf_iorequest( bp->b_io_error = 0; /* - * Take references to the buffer. For XBF_ASYNC buffers, holding a - * reference for as long as submission takes is all that is necessary - * here. The IO inherits the lock and hold count from the submitter, - * and these are release during IO completion processing. Taking a hold - * over submission ensures that the buffer is not freed until we have - * completed all processing, regardless of when IO errors occur or are - * reported. - * - * However, for synchronous IO, the IO does not inherit the submitters - * reference count, nor the buffer lock. Hence we need to take an extra - * reference to the buffer for the for the IO context so that we can - * guarantee the buffer is not freed until all IO completion processing - * is done. Otherwise the caller can drop their reference while the IO - * is still in progress and hence trigger a use-after-free situation. + * The caller's reference is released during I/O completion. + * This occurs some time after the last b_io_remaining reference is + * released, so after we drop our Io reference we have to have some + * other reference to ensure the buffer doesn't go away from underneath + * us. Take a direct reference to ensure we have safe access to the + * buffer until we are finished with it. */ xfs_buf_hold(bp); - if (!(bp->b_flags & XBF_ASYNC)) - xfs_buf_hold(bp); - /* * Set the count to 1 initially, this will stop an I/O completion @@ -1343,40 +1326,82 @@ xfs_buf_iorequest( _xfs_buf_ioapply(bp); /* - * If _xfs_buf_ioapply failed or we are doing synchronous IO that - * completes extremely quickly, we can get back here with only the IO - * reference we took above. If we drop it to zero, run completion - * processing synchronously so that we don't return to the caller with - * completion still pending. This avoids unnecessary context switches - * associated with the end_io workqueue. + * If _xfs_buf_ioapply failed, we can get back here with only the IO + * reference we took above. If we drop it to zero, run completion so + * that we don't return to the caller with completion still pending. */ if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { - if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) + if (bp->b_error) xfs_buf_ioend(bp); else xfs_buf_ioend_async(bp); } xfs_buf_rele(bp); + /* Note: it is not safe to reference bp now we've dropped our ref */ } /* - * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer, in which - * case nothing will ever complete. It returns the I/O error code, if any, or - * 0 if there was no error. + * Synchronous buffer IO submission path, read or write. */ int -xfs_buf_iowait( - xfs_buf_t *bp) +xfs_buf_submit_wait( + struct xfs_buf *bp) { + int error; + + trace_xfs_buf_submit_wait(bp, _RET_IP_); + + ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC))); + + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + bp->b_flags &= ~XBF_DONE; + return -EIO; + } + + if (bp->b_flags & XBF_WRITE) + xfs_buf_wait_unpin(bp); + + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + + /* + * For synchronous IO, the IO does not inherit the submitters reference + * count, nor the buffer lock. Hence we cannot release the reference we + * are about to take until we've waited for all IO completion to occur, + * including any xfs_buf_ioend_async() work that may be pending. + */ + xfs_buf_hold(bp); + + /* + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. + */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + + /* + * make sure we run completion synchronously if it raced with us and is + * already complete. + */ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend(bp); + + /* wait for completion before gathering the error from the buffer */ trace_xfs_buf_iowait(bp, _RET_IP_); - - if (!bp->b_error) - wait_for_completion(&bp->b_iowait); - + wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); - return bp->b_error; + error = bp->b_error; + + /* + * all done now, we can release the hold that keeps the buffer + * referenced for the entire IO. + */ + xfs_buf_rele(bp); + return error; } xfs_caddr_t @@ -1782,15 +1807,7 @@ __xfs_buf_delwri_submit( else list_del_init(&bp->b_list); - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - - xfs_buf_ioerror(bp, -EIO); - xfs_buf_stale(bp); - xfs_buf_ioend(bp); - continue; - } - xfs_buf_iorequest(bp); + xfs_buf_submit(bp); } blk_finish_plug(&plug); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d8f57f654f92..0acfc30ec0fd 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -290,8 +290,8 @@ extern int xfs_bwrite(struct xfs_buf *bp); extern void xfs_buf_ioend(struct xfs_buf *bp); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); -extern void xfs_buf_iorequest(xfs_buf_t *); -extern int xfs_buf_iowait(xfs_buf_t *); +extern void xfs_buf_submit(struct xfs_buf *bp); +extern int xfs_buf_submit_wait(struct xfs_buf *bp); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); #define xfs_buf_zero(bp, off, len) \ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 4fd41b58e6d2..cbea9099b843 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1081,7 +1081,7 @@ xfs_buf_iodone_callbacks( * a way to shut the filesystem down if the writes keep failing. * * In practice we'll shut the filesystem down soon as non-transient - * erorrs tend to affect the whole device and a failing log write + * errors tend to affect the whole device and a failing log write * will make us give up. But we really ought to do better here. */ if (XFS_BUF_ISASYNC(bp)) { @@ -1094,7 +1094,7 @@ xfs_buf_iodone_callbacks( if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE | XBF_WRITE_FAIL; - xfs_buf_iorequest(bp); + xfs_buf_submit(bp); } else { xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3567396f4428..fe88ef67f93a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1688,7 +1688,7 @@ xlog_bdstrat( return 0; } - xfs_buf_iorequest(bp); + xfs_buf_submit(bp); return 0; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4ba19bf7da1f..980e2968b907 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -193,12 +193,8 @@ xlog_bread_noalign( bp->b_io_length = nbblks; bp->b_error = 0; - if (XFS_FORCED_SHUTDOWN(log->l_mp)) - return -EIO; - - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); - if (error) + error = xfs_buf_submit_wait(bp); + if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) xfs_buf_ioerror_alert(bp, __func__); return error; } @@ -378,9 +374,11 @@ xlog_recover_iodone( * We're not going to bother about retrying * this during recovery. One strike! */ - xfs_buf_ioerror_alert(bp, __func__); - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_META_IO_ERROR); + if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } } bp->b_iodone = NULL; xfs_buf_ioend(bp); @@ -4427,16 +4425,12 @@ xlog_do_recover( XFS_BUF_UNASYNC(bp); bp->b_ops = &xfs_sb_buf_ops; - if (XFS_FORCED_SHUTDOWN(log->l_mp)) { - xfs_buf_relse(bp); - return -EIO; - } - - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { - xfs_buf_ioerror_alert(bp, __func__); - ASSERT(0); + if (!XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_ioerror_alert(bp, __func__); + ASSERT(0); + } xfs_buf_relse(bp); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 152f82782630..51372e34d988 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -349,7 +349,8 @@ DEFINE_BUF_EVENT(xfs_buf_free); DEFINE_BUF_EVENT(xfs_buf_hold); DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); -DEFINE_BUF_EVENT(xfs_buf_iorequest); +DEFINE_BUF_EVENT(xfs_buf_submit); +DEFINE_BUF_EVENT(xfs_buf_submit_wait); DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index db4be5b1d732..e2b2216b1635 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -318,23 +318,10 @@ xfs_trans_read_buf_map( XFS_BUF_READ(bp); bp->b_ops = ops; - /* - * XXX(hch): clean up the error handling here to be less - * of a mess.. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_DONE); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_stale(bp); - xfs_buf_relse(bp); - return -EIO; - } - - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { - xfs_buf_ioerror_alert(bp, __func__); + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* * We can gracefully recover from most read From ba3726742c1712c43c5a18245476f3fe9fe74773 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:05:32 +1000 Subject: [PATCH 43/56] xfs: check xfs_buf_read_uncached returns correctly xfs_buf_read_uncached() has two failure modes. If can either return NULL or bp->b_error != 0 depending on the type of failure, and not all callers check for both. Fix it so that xfs_buf_read_uncached() always returns the error status, and the buffer is returned as a function parameter. The buffer will only be returned on success. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 18 ++++++++++---- fs/xfs/xfs_buf.h | 6 ++--- fs/xfs/xfs_fsops.c | 11 +++------ fs/xfs/xfs_mount.c | 57 +++++++++++++++++++++----------------------- fs/xfs/xfs_rtalloc.c | 30 ++++++++++------------- 5 files changed, 59 insertions(+), 63 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d99ec8335750..6fbcbbfb4d1a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -688,29 +688,39 @@ xfs_buf_readahead_map( * Read an uncached buffer from disk. Allocates and returns a locked * buffer containing the disk contents or nothing. */ -struct xfs_buf * +int xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, + struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; + *bpp = NULL; + bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) - return NULL; + return -ENOMEM; /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); - bp->b_bn = daddr; + bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; bp->b_ops = ops; xfs_buf_submit_wait(bp); - return bp; + if (bp->b_error) { + int error = bp->b_error; + xfs_buf_relse(bp); + return error; + } + + *bpp = bp; + return 0; } /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 0acfc30ec0fd..82002c00af90 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -269,9 +269,9 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); -struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t numblks, int flags, - const struct xfs_buf_ops *ops); +int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, + size_t numblks, int flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f91de1ef05e1..c05ac8b70fa9 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -172,16 +172,11 @@ xfs_growfs_data_private( if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; dpct = pct - mp->m_sb.sb_imax_pct; - bp = xfs_buf_read_uncached(mp->m_ddev_targp, + error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, NULL); - if (!bp) - return -EIO; - if (bp->b_error) { - error = bp->b_error; - xfs_buf_relse(bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + if (error) return error; - } xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fbf0384a466f..142c460fc64e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -302,21 +302,15 @@ xfs_readsb( * access to the superblock. */ reread: - bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, buf_ops); - if (!bp) { - if (loud) - xfs_warn(mp, "SB buffer read failed"); - return -EIO; - } - if (bp->b_error) { - error = bp->b_error; + error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), 0, &bp, buf_ops); + if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; - goto release_buf; + return error; } /* @@ -546,40 +540,43 @@ xfs_set_inoalignment(xfs_mount_t *mp) * Check that the data (and log if separate) is an ok size. */ STATIC int -xfs_check_sizes(xfs_mount_t *mp) +xfs_check_sizes( + struct xfs_mount *mp) { - xfs_buf_t *bp; + struct xfs_buf *bp; xfs_daddr_t d; + int error; d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { xfs_warn(mp, "filesystem size mismatch detected"); return -EFBIG; } - bp = xfs_buf_read_uncached(mp->m_ddev_targp, + error = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, NULL); - if (!bp) { + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { xfs_warn(mp, "last sector read failed"); - return -EIO; + return error; } xfs_buf_relse(bp); - if (mp->m_logdev_targp != mp->m_ddev_targp) { - d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { - xfs_warn(mp, "log size mismatch detected"); - return -EFBIG; - } - bp = xfs_buf_read_uncached(mp->m_logdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp) { - xfs_warn(mp, "log device read failed"); - return -EIO; - } - xfs_buf_relse(bp); + if (mp->m_logdev_targp == mp->m_ddev_targp) + return 0; + + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { + xfs_warn(mp, "log size mismatch detected"); + return -EFBIG; } + error = xfs_buf_read_uncached(mp->m_logdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { + xfs_warn(mp, "log device read failed"); + return error; + } + xfs_buf_relse(bp); return 0; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 909e143b87ae..1ad00937b485 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -972,16 +972,11 @@ xfs_growfs_rt( /* * Read in the last block of the device, make sure it exists. */ - bp = xfs_buf_read_uncached(mp->m_rtdev_targp, + error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp) - return -EIO; - if (bp->b_error) { - error = bp->b_error; - xfs_buf_relse(bp); + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) return error; - } xfs_buf_relse(bp); /* @@ -1235,11 +1230,12 @@ xfs_rtallocate_extent( */ int /* error */ xfs_rtmount_init( - xfs_mount_t *mp) /* file system mount structure */ + struct xfs_mount *mp) /* file system mount structure */ { - xfs_buf_t *bp; /* buffer for last block of subvolume */ - xfs_daddr_t d; /* address of last block of subvolume */ - xfs_sb_t *sbp; /* filesystem superblock copy in mount */ + struct xfs_buf *bp; /* buffer for last block of subvolume */ + struct xfs_sb *sbp; /* filesystem superblock copy in mount */ + xfs_daddr_t d; /* address of last block of subvolume */ + int error; sbp = &mp->m_sb; if (sbp->sb_rblocks == 0) @@ -1265,14 +1261,12 @@ xfs_rtmount_init( (unsigned long long) mp->m_sb.sb_rblocks); return -EFBIG; } - bp = xfs_buf_read_uncached(mp->m_rtdev_targp, + error = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp || bp->b_error) { + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { xfs_warn(mp, "realtime device size check failed"); - if (bp) - xfs_buf_relse(bp); - return -EIO; + return error; } xfs_buf_relse(bp); return 0; From 8c15612546bce1ecafb7dee3cce8a2a9b560e15e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Oct 2014 09:05:44 +1000 Subject: [PATCH 44/56] xfs: simplify xfs_zero_remaining_bytes xfs_zero_remaining_bytes() open codes a log of buffer manupulations to do a read forllowed by a write. It can simply be replaced by an uncached read followed by a xfs_bwrite() call. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 44 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c53cc036d6e7..aa70620e0e62 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1122,14 +1122,6 @@ xfs_zero_remaining_bytes( if (endoff > XFS_ISIZE(ip)) endoff = XFS_ISIZE(ip); - bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp, - BTOBB(mp->m_sb.sb_blocksize), 0); - if (!bp) - return -ENOMEM; - - xfs_buf_unlock(bp); - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { uint lock_mode; @@ -1152,32 +1144,24 @@ xfs_zero_remaining_bytes( ASSERT(imap.br_startblock != DELAYSTARTBLOCK); if (imap.br_state == XFS_EXT_UNWRITTEN) continue; - XFS_BUF_UNDONE(bp); - XFS_BUF_UNWRITE(bp); - XFS_BUF_READ(bp); - XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); - error = xfs_buf_submit_wait(bp); - if (error) { - xfs_buf_ioerror_alert(bp, - "xfs_zero_remaining_bytes(read)"); - break; - } + error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp, + xfs_fsb_to_db(ip, imap.br_startblock), + BTOBB(mp->m_sb.sb_blocksize), + 0, &bp, NULL); + if (error) + return error; + memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); - XFS_BUF_UNDONE(bp); - XFS_BUF_UNREAD(bp); - XFS_BUF_WRITE(bp); + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); - error = xfs_buf_submit_wait(bp); - if (error) { - xfs_buf_ioerror_alert(bp, - "xfs_zero_remaining_bytes(write)"); - break; - } + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + return error; } - xfs_buf_free(bp); return error; } From b1d6cc02f2f6a590c4d8dc2c3bcf7be3b9419945 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:17:58 +1000 Subject: [PATCH 45/56] xfs: compat_xfs_bstat does not have forkoff struct compat_xfs_bstat is missing the di_forkoff field and so does not fully translate the structure correctly. Fix it. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl32.c | 2 ++ fs/xfs/xfs_ioctl32.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index a554646ff141..94ce027e28e3 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -160,6 +160,7 @@ xfs_ioctl32_bstat_copyin( get_user(bstat->bs_gen, &bstat32->bs_gen) || get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || + get_user(bstat->bs_forkoff, &bstat32->bs_forkoff) || get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || get_user(bstat->bs_aextents, &bstat32->bs_aextents)) @@ -214,6 +215,7 @@ xfs_bulkstat_one_fmt_compat( put_user(buffer->bs_gen, &p32->bs_gen) || put_user(buffer->bs_projid, &p32->bs_projid) || put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || + put_user(buffer->bs_forkoff, &p32->bs_forkoff) || put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || put_user(buffer->bs_dmstate, &p32->bs_dmstate) || put_user(buffer->bs_aextents, &p32->bs_aextents)) diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 80f4060e8970..b1bb45444df8 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -67,8 +67,9 @@ typedef struct compat_xfs_bstat { __u32 bs_gen; /* generation count */ __u16 bs_projid_lo; /* lower part of project id */ #define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ __u16 bs_projid_hi; /* high part of project id */ - unsigned char bs_pad[12]; /* pad space, unused */ + unsigned char bs_pad[10]; /* pad space, unused */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ From e076b0f3a5c472e77c0a0e163188f2761e8b4fed Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:18:13 +1000 Subject: [PATCH 46/56] xfs: kill time.h The typedef for timespecs and nanotime() are completely unnecessary, and delay() can be moved to fs/xfs/linux.h, which means this file can go away. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/kmem.c | 1 - fs/xfs/time.h | 36 ------------------------------------ fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_linux.h | 6 +++++- fs/xfs/xfs_trans_inode.c | 2 +- 5 files changed, 8 insertions(+), 41 deletions(-) delete mode 100644 fs/xfs/time.h diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 844e288b9576..53e95b2a1369 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -21,7 +21,6 @@ #include #include #include -#include "time.h" #include "kmem.h" #include "xfs_message.h" diff --git a/fs/xfs/time.h b/fs/xfs/time.h deleted file mode 100644 index 387e695a184c..000000000000 --- a/fs/xfs/time.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_TIME_H__ -#define __XFS_SUPPORT_TIME_H__ - -#include -#include - -typedef struct timespec timespec_t; - -static inline void delay(long ticks) -{ - schedule_timeout_uninterruptible(ticks); -} - -static inline void nanotime(struct timespec *tvp) -{ - *tvp = CURRENT_TIME; -} - -#endif /* __XFS_SUPPORT_TIME_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c92cb48617d1..4c130ffcf0cf 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -654,7 +654,7 @@ xfs_ialloc( xfs_inode_t *ip; uint flags; int error; - timespec_t tv; + struct timespec tv; /* * Call the space management code to pick @@ -720,7 +720,7 @@ xfs_ialloc( ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); - nanotime(&tv); + tv = current_fs_time(mp->m_super); ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; ip->i_d.di_atime = ip->i_d.di_mtime; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index d10dc8f397c9..6a51619d8690 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -56,7 +56,6 @@ typedef __uint64_t __psunsigned_t; #include "kmem.h" #include "mrlock.h" -#include "time.h" #include "uuid.h" #include @@ -179,6 +178,11 @@ typedef __uint64_t __psunsigned_t; #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) +static inline void delay(long ticks) +{ + schedule_timeout_uninterruptible(ticks); +} + /* * XFS wrapper structure for sysfs support. It depends on external data * structures and is embedded in various internal data structures to implement diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 50c3f5614288..cdb4d86520e1 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -70,7 +70,7 @@ xfs_trans_ichgtime( int flags) { struct inode *inode = VFS_I(ip); - timespec_t tv; + struct timespec tv; ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); From 9336e3a765b68d4a7fdd8256f393ebce95ecb0a7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:18:40 +1000 Subject: [PATCH 47/56] xfs: project id inheritance is a directory only flag xfs_set_diflags() allows it to be set on non-directory inodes, and this flags errors in xfs_repair. Further, inode allocation allows the same directory-only flag to be inherited to non-directories. Make sure directory inode flags don't appear on other types of inodes. This fixes several xfstests scratch fileystem corruption reports (e.g. xfs/050) now that xfstests checks scratch filesystems after test completion. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_ioctl.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4c130ffcf0cf..2f63742a25f3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -769,6 +769,8 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_d.di_extsize = pip->i_d.di_extsize; } + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_REALTIME; @@ -789,8 +791,6 @@ xfs_ialloc( if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && xfs_inherit_nosymlinks) di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && xfs_inherit_nodefrag) di_flags |= XFS_DIFLAG_NODEFRAG; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3799695b9249..05a19554efbc 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -968,8 +968,6 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NOATIME; if (xflags & XFS_XFLAG_NODUMP) di_flags |= XFS_DIFLAG_NODUMP; - if (xflags & XFS_XFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; if (xflags & XFS_XFLAG_NODEFRAG) di_flags |= XFS_DIFLAG_NODEFRAG; if (xflags & XFS_XFLAG_FILESTREAM) @@ -981,6 +979,8 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NOSYMLINKS; if (xflags & XFS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; + if (xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; From a872703f34cd6033d0b174fa598f63f1a57145bb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Oct 2014 09:20:30 +1000 Subject: [PATCH 48/56] xfs: only set extent size hint when asked Currently the extent size hint is set unconditionally in xfs_ioctl_setattr() when the FSX_EXTSIZE flag is set. Hence we can set hints when the inode flags indicating the hint should be used are not set. Hence only set the extent size hint from userspace when the inode has the XFS_DIFLAG_EXTSIZE flag set to indicate that we should have an extent size hint set on the inode. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 05a19554efbc..d6afc9f144b1 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1231,13 +1231,25 @@ xfs_ioctl_setattr( } - if (mask & FSX_EXTSIZE) - ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; if (mask & FSX_XFLAGS) { xfs_set_diflags(ip, fa->fsx_xflags); xfs_diflags_to_linux(ip); } + /* + * Only set the extent size hint if we've already determined that the + * extent size hint should be set on the inode. If no extent size flags + * are set on the inode then unconditionally clear the extent size hint. + */ + if (mask & FSX_EXTSIZE) { + int extsize = 0; + + if (ip->i_d.di_flags & + (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) + extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + ip->i_d.di_extsize = extsize; + } + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); From ce57bcf6b81caf1e9f780e98e8d23d3555746d74 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 2 Oct 2014 09:21:53 +1000 Subject: [PATCH 49/56] xfs: check for inode size overflow in xfs_new_eof() If we write to the maximum file offset (2^63-2), XFS fails to log the inode size update when the page is flushed. For example: $ xfs_io -fc "pwrite `echo "2^63-1-1" | bc` 1" /mnt/file wrote 1/1 bytes at offset 9223372036854775806 1.000000 bytes, 1 ops; 0.0000 sec (22.711 KiB/sec and 23255.8140 ops/sec) $ stat -c %s /mnt/file 9223372036854775807 $ umount /mnt ; mount /mnt/ $ stat -c %s /mnt/file 0 This occurs because XFS calculates the new file size as io_offset + io_size, I/O occurs in block sized requests, and the maximum supported file size is not block aligned. Therefore, a write to the max allowable offset on a 4k blocksize fs results in a write of size 4k to offset 2^63-4096 (e.g., equivalent to round_down(2^63-1, 4096), or IOW the offset of the block that contains the max file size). The offset plus size calculation (2^63 - 4096 + 4096 == 2^63) overflows the signed 64-bit variable which goes negative and causes the > comparison to the on-disk inode size to fail. This returns 0 from xfs_new_eof() and results in no change to the inode on-disk. Update xfs_new_eof() to explicitly detect overflow of the local calculation and use the VFS inode size in this scenario. The VFS inode size is capped to the maximum and thus XFS writes the correct inode size to disk. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c10e3fadd9af..9af2882e1f4c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -102,7 +102,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) { xfs_fsize_t i_size = i_size_read(VFS_I(ip)); - if (new_size > i_size) + if (new_size > i_size || new_size < 0) new_size = i_size; return new_size > ip->i_d.di_size ? new_size : 0; } From 6ee49a20c13b4b4e79a3bba406df8106cff284a1 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Oct 2014 09:23:49 +1000 Subject: [PATCH 50/56] xfs: don't send null bp to xfs_trans_brelse() In this case, if bp is NULL, error is set, and we send a NULL bp to xfs_trans_brelse, which will try to dereference it. Test whether we actually have a buffer before we try to free it. Coverity spotted this. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_da_btree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 2c42ae28d027..fd827530afec 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2563,7 +2563,8 @@ xfs_da_get_buf( mapp, nmap, 0); error = bp ? bp->b_error : -EIO; if (error) { - xfs_trans_brelse(trans, bp); + if (bp) + xfs_trans_brelse(trans, bp); goto out_free; } From 04dd1a0d4b17a71220eae4fb313218f15a49bcdd Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Oct 2014 09:24:11 +1000 Subject: [PATCH 51/56] xfs: fix crc field handling in xfs_sb_to/from_disk I discovered this in userspace, but the same change applies to the kernel. If we xfs_mdrestore an image from a non-crc filesystem, lo and behold the restored image has gained a CRC: # db/xfs_metadump.sh -o /dev/sdc1 - | xfs_mdrestore - test.img # xfs_db -c "sb 0" -c "p crc" /dev/sdc1 crc = 0 (correct) # xfs_db -c "sb 0" -c "p crc" test.img crc = 0xb6f8d6a0 (correct) This is because xfs_sb_from_disk doesn't fill in sb_crc, but xfs_sb_to_disk(XFS_SB_ALL_BITS) does write the in-memory CRC to disk - so we get uninitialized memory on disk. Fix this by always initializing sb_crc to 0 when we read the superblock, and masking out the CRC bit from ALL_BITS when we write it. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 8426e5e2682e..5f902fa7913f 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -445,6 +445,8 @@ __xfs_sb_from_disk( to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); to->sb_features_log_incompat = be32_to_cpu(from->sb_features_log_incompat); + /* crc is only used on disk, not in memory; just init to 0 here. */ + to->sb_crc = 0; to->sb_pad = 0; to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); @@ -550,6 +552,9 @@ xfs_sb_to_disk( if (!fields) return; + /* We should never write the crc here, it's updated in the IO path */ + fields &= ~XFS_SB_CRC; + xfs_sb_quota_to_disk(to, from, &fields); while (fields) { f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); From 5cca3f611d159e5a4a5ec60413bd09948ef40aea Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Oct 2014 09:27:09 +1000 Subject: [PATCH 52/56] xfs: check for null dquot in xfs_quota_calc_throttle() Coverity spotted this. Granted, we *just* checked xfs_inod_dquot() in the caller (by calling xfs_quota_need_throttle). However, this is the only place we don't check the return value but the check is cheap and future-proof so add it. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index e9c47b6f5e5a..afcf3c926565 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -404,8 +404,8 @@ xfs_quota_calc_throttle( int shift = 0; struct xfs_dquot *dq = xfs_inode_dquot(ip, type); - /* over hi wmark, squash the prealloc completely */ - if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + /* no dq, or over hi wmark, squash the prealloc completely */ + if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { *qblocks = 0; *qfreesp = 0; return; From 07d08681d26e99d8ba3bc4e56380f2cc04d3ff3b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 2 Oct 2014 09:42:06 +1000 Subject: [PATCH 53/56] xfs: restore buffer_head unwritten bit on ioend cancel xfs_vm_writepage() walks each buffer_head on the page, maps to the block on disk and attaches to a running ioend structure that represents the I/O submission. A new ioend is created when the type of I/O (unwritten, delayed allocation or overwrite) required for a particular buffer_head differs from the previous. If a buffer_head is a delalloc or unwritten buffer, the associated bits are cleared by xfs_map_at_offset() once the buffer_head is added to the ioend. The process of mapping each buffer_head occurs in xfs_map_blocks() and acquires the ilock in blocking or non-blocking mode, depending on the type of writeback in progress. If the lock cannot be acquired for non-blocking writeback, we cancel the ioend, redirty the page and return. Writeback will revisit the page at some later point. Note that we acquire the ilock for each buffer on the page. Therefore during non-blocking writeback, it is possible to add an unwritten buffer to the ioend, clear the unwritten state, fail to acquire the ilock when mapping a subsequent buffer and cancel the ioend. If this occurs, the unwritten status of the buffer sitting in the ioend has been lost. The page will eventually hit writeback again, but xfs_vm_writepage() submits overwrite I/O instead of unwritten I/O and does not perform unwritten extent conversion at I/O completion. This leads to data corruption because unwritten extents are treated as holes on reads and zeroes are returned instead of reading from disk. Modify xfs_cancel_ioend() to restore the buffer unwritten bit for ioends of type XFS_IO_UNWRITTEN. This ensures that unwritten extent conversion occurs once the page is eventually written back. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 11e9b4caa54f..dc3e10882d1f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -548,6 +548,13 @@ xfs_cancel_ioend( do { next_bh = bh->b_private; clear_buffer_async_write(bh); + /* + * The unwritten flag is cleared when added to the + * ioend. We're not submitting for I/O so mark the + * buffer unwritten again for next time around. + */ + if (ioend->io_type == XFS_IO_UNWRITTEN) + set_buffer_unwritten(bh); unlock_buffer(bh); } while ((bh = next_bh) != NULL); From da5f10969d54006a24777a84ed3eaeeb2a21047f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 2 Oct 2014 09:44:54 +1000 Subject: [PATCH 54/56] xfs: flush the range before zero range conversion XFS currently discards delalloc blocks within the target range of a zero range request. Unaligned start and end offsets are zeroed through the page cache and the internal, aligned blocks are converted to unwritten extents. If EOF is page aligned and covered by a delayed allocation extent. The inode size is not updated until I/O completion. If a zero range request discards a delalloc range that covers page aligned EOF as such, the inode size update never occurs. For example: $ rm -f /mnt/file $ xfs_io -fc "pwrite 0 64k" -c "zero 60k 4k" /mnt/file $ stat -c "%s" /mnt/file 65536 $ umount /mnt $ mount /mnt $ stat -c "%s" /mnt/file 61440 Update xfs_zero_file_space() to flush the range rather than discard delalloc blocks to ensure that inode size updates occur appropriately. [dchinner: Note that this is really a workaround to avoid the underlying problems. More work is needed (and ongoing) to fix those issues so this fix is being added as a temporary stop-gap measure. ] Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1cb345e7c181..6f5cb63bed17 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1392,14 +1392,14 @@ xfs_zero_file_space( if (start_boundary < end_boundary - 1) { /* - * punch out delayed allocation blocks and the page cache over - * the conversion range + * Writeback the range to ensure any inode size updates due to + * appending writes make it to disk (otherwise we could just + * punch out the delalloc blocks). */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, - XFS_B_TO_FSBT(mp, start_boundary), - XFS_B_TO_FSB(mp, end_boundary - start_boundary)); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + start_boundary, end_boundary - 1); + if (error) + goto out; truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); From 52177937e9ac4573391143065b250403d3a6ae4b Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Fri, 3 Oct 2014 09:09:50 +1000 Subject: [PATCH 55/56] xfs: xfs_iflush_done checks the wrong log item callback Commit 3013683 ("xfs: remove all the inodes on a buffer from the AIL in bulk") made the xfs inode flush callback more efficient by combining all the inode writes on the buffer and the deletions of the inode log item from AIL. The initial loop in this patch should be looping through all the log items on the buffer to see which items have xfs_iflush_done as their callback function. But currently, only the log item passed to the function has its callback compared to xfs_iflush_done. If the log item pointer passed to the function does have the xfs_iflush_done callback function, then all the log items on the buffer are removed from the li_bio_list on the buffer b_fspriv and could be removed from the AIL even though they may have not been written yet. This problem is masked by the fact that currently all inodes on a buffer will have the same calback function - either xfs_iflush_done or xfs_istale_done - and hence the bug cannot manifest in any way. Still, we need to remove the landmine so that if we add new callbacks in future this doesn't cause us problems. Signed-off-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index de5a7be36e60..63de0b0acc32 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -615,7 +615,7 @@ xfs_iflush_done( blip = bp->b_fspriv; prev = NULL; while (blip != NULL) { - if (lip->li_cb != xfs_iflush_done) { + if (blip->li_cb != xfs_iflush_done) { prev = blip; blip = blip->li_bio_list; continue; From a8b1ee8bafc765ebf029d03c5479a69aebff9693 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 13 Oct 2014 10:21:53 +1100 Subject: [PATCH 56/56] xfs: fix agno increment in xfs_inumbers() loop caused a regression in xfs_inumbers, which in turn broke xfsdump, causing incomplete dumps. The loop in xfs_inumbers() needs to fill the user-supplied buffers, and iterates via xfs_btree_increment, reading new ags as needed. But the first time through the loop, if xfs_btree_increment() succeeds, we continue, which triggers the ++agno at the bottom of the loop, and we skip to soon to the next ag - without the proper setup under next_ag to read the next ag. Fix this by removing the agno increment from the loop conditional, and only increment agno if we have actually hit the code under the next_ag: target. Cc: stable@vger.kernel.org Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f71be9c68017..f1deb961a296 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -639,7 +639,8 @@ next_ag: xfs_buf_relse(agbp); agbp = NULL; agino = 0; - } while (++agno < mp->m_sb.sb_agcount); + agno++; + } while (agno < mp->m_sb.sb_agcount); if (!error) { if (bufidx) {