linux/fs/ubifs/dir.c

1729 lines
44 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/* * This file is part of UBIFS.
*
* Copyright (C) 2006-2008 Nokia Corporation.
* Copyright (C) 2006, 2007 University of Szeged, Hungary
*
* Authors: Artem Bityutskiy (Битюцкий Артём)
* Adrian Hunter
* Zoltan Sogor
*/
/*
* This file implements directory operations.
*
* All FS operations in this file allocate budget before writing anything to the
* media. If they fail to allocate it, the error is returned. The only
* exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even
* if they unable to allocate the budget, because deletion %-ENOSPC failure is
* not what users are usually ready to get. UBIFS budgeting subsystem has some
* space reserved for these purposes.
*
* All operations in this file write all inodes which they change straight
* away, instead of marking them dirty. For example, 'ubifs_link()' changes
* @i_size of the parent inode and writes the parent inode together with the
* target inode. This was done to simplify file-system recovery which would
* otherwise be very difficult to do. The only exception is rename which marks
* the re-named inode dirty (because its @i_ctime is updated) but does not
* write it, but just marks it as dirty.
*/
#include "ubifs.h"
/**
* inherit_flags - inherit flags of the parent inode.
* @dir: parent inode
* @mode: new inode mode flags
*
* This is a helper function for 'ubifs_new_inode()' which inherits flag of the
* parent directory inode @dir. UBIFS inodes inherit the following flags:
* o %UBIFS_COMPR_FL, which is useful to switch compression on/of on
* sub-directory basis;
* o %UBIFS_SYNC_FL - useful for the same reasons;
* o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories.
*
* This function returns the inherited flags.
*/
static int inherit_flags(const struct inode *dir, umode_t mode)
{
int flags;
const struct ubifs_inode *ui = ubifs_inode(dir);
if (!S_ISDIR(dir->i_mode))
/*
* The parent is not a directory, which means that an extended
* attribute inode is being created. No flags.
*/
return 0;
flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL);
if (!S_ISDIR(mode))
/* The "DIRSYNC" flag only applies to directories */
flags &= ~UBIFS_DIRSYNC_FL;
return flags;
}
/**
* ubifs_new_inode - allocate new UBIFS inode object.
* @c: UBIFS file-system description object
* @dir: parent directory inode
* @mode: inode mode flags
ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2022-07-19 08:00:17 +00:00
* @is_xattr: whether the inode is xattr inode
*
* This function finds an unused inode number, allocates new inode and
* initializes it. Returns new inode in case of success and an error code in
* case of failure.
*/
struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2022-07-19 08:00:17 +00:00
umode_t mode, bool is_xattr)
{
int err;
struct inode *inode;
struct ubifs_inode *ui;
bool encrypted = false;
inode = new_inode(c->vfs_sb);
ui = ubifs_inode(inode);
if (!inode)
return ERR_PTR(-ENOMEM);
/*
* Set 'S_NOCMTIME' to prevent VFS form updating [mc]time of inodes and
* marking them dirty in file write path (see 'file_update_time()').
* UBIFS has to fully control "clean <-> dirty" transitions of inodes
* to make budgeting work.
*/
inode->i_flags |= S_NOCMTIME;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
simple_inode_init_ts(inode);
inode->i_mapping->nrpages = 0;
ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2022-07-19 08:00:17 +00:00
if (!is_xattr) {
err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
if (err) {
ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
goto out_iput;
}
}
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &ubifs_file_address_operations;
inode->i_op = &ubifs_file_inode_operations;
inode->i_fop = &ubifs_file_operations;
break;
case S_IFDIR:
inode->i_op = &ubifs_dir_inode_operations;
inode->i_fop = &ubifs_dir_operations;
inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ;
break;
case S_IFLNK:
inode->i_op = &ubifs_symlink_inode_operations;
break;
case S_IFSOCK:
case S_IFIFO:
case S_IFBLK:
case S_IFCHR:
inode->i_op = &ubifs_file_inode_operations;
break;
default:
BUG();
}
ui->flags = inherit_flags(dir, mode);
ubifs_set_inode_flags(inode);
if (S_ISREG(mode))
ui->compr_type = c->default_compr;
else
ui->compr_type = UBIFS_COMPR_NONE;
ui->synced_i_size = 0;
spin_lock(&c->cnt_lock);
/* Inode number overflow is currently not supported */
if (c->highest_inum >= INUM_WARN_WATERMARK) {
if (c->highest_inum >= INUM_WATERMARK) {
spin_unlock(&c->cnt_lock);
UBIFS: extend debug/message capabilities In the case where we have more than one volumes on different UBI devices, it may be not that easy to tell which volume prints the messages. Add ubi number and volume id in ubifs_msg/warn/error to help debug. These two values are passed by struct ubifs_info. For those where ubifs_info is not initialized yet, ubifs_* is replaced by pr_*. For those where ubifs_info is not avaliable, ubifs_info is passed to the calling function as a const parameter. The output looks like, [ 95.444879] UBIFS (ubi0:1): background thread "ubifs_bgt0_1" started, PID 696 [ 95.484688] UBIFS (ubi0:1): UBIFS: mounted UBI device 0, volume 1, name "test1" [ 95.484694] UBIFS (ubi0:1): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.484699] UBIFS (ubi0:1): FS size: 30220288 bytes (28 MiB, 238 LEBs), journal size 1523712 bytes (1 MiB, 12 LEBs) [ 95.484703] UBIFS (ubi0:1): reserved for root: 1427378 bytes (1393 KiB) [ 95.484709] UBIFS (ubi0:1): media format: w4/r0 (latest is w4/r0), UUID 40DFFC0E-70BE-4193-8905-F7D6DFE60B17, small LPT model [ 95.489875] UBIFS (ubi1:0): background thread "ubifs_bgt1_0" started, PID 699 [ 95.529713] UBIFS (ubi1:0): UBIFS: mounted UBI device 1, volume 0, name "test2" [ 95.529718] UBIFS (ubi1:0): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.529724] UBIFS (ubi1:0): FS size: 19808256 bytes (18 MiB, 156 LEBs), journal size 1015809 bytes (0 MiB, 8 LEBs) [ 95.529727] UBIFS (ubi1:0): reserved for root: 935592 bytes (913 KiB) [ 95.529733] UBIFS (ubi1:0): media format: w4/r0 (latest is w4/r0), UUID EEB7779D-F419-4CA9-811B-831CAC7233D4, small LPT model [ 954.264767] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node type (255 but expected 6) [ 954.367030] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node at LEB 0:0, LEB mapping status 1 Signed-off-by: Sheng Yong <shengyong1@huawei.com> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
2015-03-20 10:39:42 +00:00
ubifs_err(c, "out of inode numbers");
err = -EINVAL;
goto out_iput;
}
ubifs_warn(c, "running out of inode numbers (current %lu, max %u)",
UBIFS: fix compilation warnings We print 'ino_t' type using '%lu' printk() placeholder, but this results in many warnings when compiling for Alpha platform. Fix this by adding (unsingned long) casts. Fixes these warnings: fs/ubifs/journal.c:693: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/journal.c:1131: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/dir.c:163: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/tnc.c:2680: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/tnc.c:2700: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t' fs/ubifs/replay.c:1066: warning: format '%lu' expects type 'long unsigned int', but argument 7 has type 'ino_t' fs/ubifs/orphan.c:108: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/orphan.c:135: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/orphan.c:142: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/orphan.c:154: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/orphan.c:159: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/orphan.c:451: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/orphan.c:539: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/orphan.c:612: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/orphan.c:843: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/orphan.c:856: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/recovery.c:1438: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/recovery.c:1443: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/recovery.c:1475: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/recovery.c:1495: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:105: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t' fs/ubifs/debug.c:105: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t' fs/ubifs/debug.c:110: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t' fs/ubifs/debug.c:110: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t' fs/ubifs/debug.c:114: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t' fs/ubifs/debug.c:114: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t' fs/ubifs/debug.c:118: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t' fs/ubifs/debug.c:118: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t' fs/ubifs/debug.c:1591: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1671: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1674: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t' fs/ubifs/debug.c:1680: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1699: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t' fs/ubifs/debug.c:1788: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t' fs/ubifs/debug.c:1821: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t' fs/ubifs/debug.c:1833: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t' fs/ubifs/debug.c:1924: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1932: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1938: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1945: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1953: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1960: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1967: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1973: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1988: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t' fs/ubifs/debug.c:1991: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t' fs/ubifs/debug.c:2009: warning: format '%lu' expects type 'long unsigned int', but argument 2 has type 'ino_t' Reported-by: Randy Dunlap <randy.dunlap@oracle.com> Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
2008-10-29 10:08:43 +00:00
(unsigned long)c->highest_inum, INUM_WATERMARK);
}
inode->i_ino = ++c->highest_inum;
/*
* The creation sequence number remains with this inode for its
* lifetime. All nodes for this inode have a greater sequence number,
* and so it is possible to distinguish obsolete nodes belonging to a
* previous incarnation of the same inode number - for example, for the
* purpose of rebuilding the index.
*/
ui->creat_sqnum = ++c->max_sqnum;
spin_unlock(&c->cnt_lock);
if (encrypted) {
err = fscrypt_set_context(inode, NULL);
if (err) {
ubifs_err(c, "fscrypt_set_context failed: %i", err);
goto out_iput;
}
}
return inode;
out_iput:
make_bad_inode(inode);
iput(inode);
return ERR_PTR(err);
}
static int dbg_check_name(const struct ubifs_info *c,
const struct ubifs_dent_node *dent,
const struct fscrypt_name *nm)
{
if (!dbg_is_chk_gen(c))
return 0;
if (le16_to_cpu(dent->nlen) != fname_len(nm))
return -EINVAL;
if (memcmp(dent->name, fname_name(nm), fname_len(nm)))
return -EINVAL;
return 0;
}
static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
int err;
union ubifs_key key;
struct inode *inode = NULL;
struct ubifs_dent_node *dent = NULL;
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct fscrypt_name nm;
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
fscrypt: fix race where ->lookup() marks plaintext dentry as ciphertext ->lookup() in an encrypted directory begins as follows: 1. fscrypt_prepare_lookup(): a. Try to load the directory's encryption key. b. If the key is unavailable, mark the dentry as a ciphertext name via d_flags. 2. fscrypt_setup_filename(): a. Try to load the directory's encryption key. b. If the key is available, encrypt the name (treated as a plaintext name) to get the on-disk name. Otherwise decode the name (treated as a ciphertext name) to get the on-disk name. But if the key is concurrently added, it may be found at (2a) but not at (1a). In this case, the dentry will be wrongly marked as a ciphertext name even though it was actually treated as plaintext. This will cause the dentry to be wrongly invalidated on the next lookup, potentially causing problems. For example, if the racy ->lookup() was part of sys_mount(), then the new mount will be detached when anything tries to access it. This is despite the mountpoint having a plaintext path, which should remain valid now that the key was added. Of course, this is only possible if there's a userspace race. Still, the additional kernel-side race is confusing and unexpected. Close the kernel-side race by changing fscrypt_prepare_lookup() to also set the on-disk filename (step 2b), consistent with the d_flags update. Fixes: 28b4c263961c ("ext4 crypto: revalidate dentry after adding or removing the key") Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-03-20 18:39:13 +00:00
err = fscrypt_prepare_lookup(dir, dentry, &nm);
generic_set_encrypted_ci_d_ops(dentry);
fscrypt: fix race where ->lookup() marks plaintext dentry as ciphertext ->lookup() in an encrypted directory begins as follows: 1. fscrypt_prepare_lookup(): a. Try to load the directory's encryption key. b. If the key is unavailable, mark the dentry as a ciphertext name via d_flags. 2. fscrypt_setup_filename(): a. Try to load the directory's encryption key. b. If the key is available, encrypt the name (treated as a plaintext name) to get the on-disk name. Otherwise decode the name (treated as a ciphertext name) to get the on-disk name. But if the key is concurrently added, it may be found at (2a) but not at (1a). In this case, the dentry will be wrongly marked as a ciphertext name even though it was actually treated as plaintext. This will cause the dentry to be wrongly invalidated on the next lookup, potentially causing problems. For example, if the racy ->lookup() was part of sys_mount(), then the new mount will be detached when anything tries to access it. This is despite the mountpoint having a plaintext path, which should remain valid now that the key was added. Of course, this is only possible if there's a userspace race. Still, the additional kernel-side race is confusing and unexpected. Close the kernel-side race by changing fscrypt_prepare_lookup() to also set the on-disk filename (step 2b), consistent with the d_flags update. Fixes: 28b4c263961c ("ext4 crypto: revalidate dentry after adding or removing the key") Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2019-03-20 18:39:13 +00:00
if (err == -ENOENT)
return d_splice_alias(NULL, dentry);
if (err)
return ERR_PTR(err);
if (fname_len(&nm) > UBIFS_MAX_NLEN) {
inode = ERR_PTR(-ENAMETOOLONG);
goto done;
}
dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
if (!dent) {
inode = ERR_PTR(-ENOMEM);
goto done;
}
if (fname_name(&nm) == NULL) {
if (nm.hash & ~UBIFS_S_KEY_HASH_MASK)
goto done; /* ENOENT */
dent_key_init_hash(c, &key, dir->i_ino, nm.hash);
err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash);
} else {
dent_key_init(c, &key, dir->i_ino, &nm);
err = ubifs_tnc_lookup_nm(c, &key, dent, &nm);
}
if (err) {
if (err == -ENOENT)
dbg_gen("not found");
else
inode = ERR_PTR(err);
goto done;
}
if (dbg_check_name(c, dent, &nm)) {
inode = ERR_PTR(-EINVAL);
goto done;
}
inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
if (IS_ERR(inode)) {
/*
* This should not happen. Probably the file-system needs
* checking.
*/
err = PTR_ERR(inode);
UBIFS: extend debug/message capabilities In the case where we have more than one volumes on different UBI devices, it may be not that easy to tell which volume prints the messages. Add ubi number and volume id in ubifs_msg/warn/error to help debug. These two values are passed by struct ubifs_info. For those where ubifs_info is not initialized yet, ubifs_* is replaced by pr_*. For those where ubifs_info is not avaliable, ubifs_info is passed to the calling function as a const parameter. The output looks like, [ 95.444879] UBIFS (ubi0:1): background thread "ubifs_bgt0_1" started, PID 696 [ 95.484688] UBIFS (ubi0:1): UBIFS: mounted UBI device 0, volume 1, name "test1" [ 95.484694] UBIFS (ubi0:1): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.484699] UBIFS (ubi0:1): FS size: 30220288 bytes (28 MiB, 238 LEBs), journal size 1523712 bytes (1 MiB, 12 LEBs) [ 95.484703] UBIFS (ubi0:1): reserved for root: 1427378 bytes (1393 KiB) [ 95.484709] UBIFS (ubi0:1): media format: w4/r0 (latest is w4/r0), UUID 40DFFC0E-70BE-4193-8905-F7D6DFE60B17, small LPT model [ 95.489875] UBIFS (ubi1:0): background thread "ubifs_bgt1_0" started, PID 699 [ 95.529713] UBIFS (ubi1:0): UBIFS: mounted UBI device 1, volume 0, name "test2" [ 95.529718] UBIFS (ubi1:0): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.529724] UBIFS (ubi1:0): FS size: 19808256 bytes (18 MiB, 156 LEBs), journal size 1015809 bytes (0 MiB, 8 LEBs) [ 95.529727] UBIFS (ubi1:0): reserved for root: 935592 bytes (913 KiB) [ 95.529733] UBIFS (ubi1:0): media format: w4/r0 (latest is w4/r0), UUID EEB7779D-F419-4CA9-811B-831CAC7233D4, small LPT model [ 954.264767] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node type (255 but expected 6) [ 954.367030] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node at LEB 0:0, LEB mapping status 1 Signed-off-by: Sheng Yong <shengyong1@huawei.com> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
2015-03-20 10:39:42 +00:00
ubifs_err(c, "dead directory entry '%pd', error %d",
dentry, err);
ubifs_ro_mode(c, err);
goto done;
}
if (IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu",
dir->i_ino, inode->i_ino);
iput(inode);
inode = ERR_PTR(-EPERM);
}
done:
kfree(dent);
fscrypt_free_filename(&nm);
return d_splice_alias(inode, dentry);
}
static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry,
struct fscrypt_name *nm)
{
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm);
}
static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.dirtied_ino = 1 };
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct fscrypt_name nm;
int err, sz_change;
/*
* Budget request settings: new inode, new direntry, changing the
* parent directory inode.
*/
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
dentry, mode, dir->i_ino);
err = ubifs_budget_space(c, &req);
if (err)
return err;
err = ubifs_prepare_create(dir, dentry, &nm);
if (err)
goto out_budg;
sz_change = CALC_DENT_SIZE(fname_len(&nm));
ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2022-07-19 08:00:17 +00:00
inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fname;
}
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
goto out_inode;
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
inode_set_mtime_to_ts(dir,
inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
ubifs_release_budget(c, &req);
fscrypt_free_filename(&nm);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
return 0;
out_cancel:
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
make_bad_inode(inode);
iput(inode);
out_fname:
fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
UBIFS: extend debug/message capabilities In the case where we have more than one volumes on different UBI devices, it may be not that easy to tell which volume prints the messages. Add ubi number and volume id in ubifs_msg/warn/error to help debug. These two values are passed by struct ubifs_info. For those where ubifs_info is not initialized yet, ubifs_* is replaced by pr_*. For those where ubifs_info is not avaliable, ubifs_info is passed to the calling function as a const parameter. The output looks like, [ 95.444879] UBIFS (ubi0:1): background thread "ubifs_bgt0_1" started, PID 696 [ 95.484688] UBIFS (ubi0:1): UBIFS: mounted UBI device 0, volume 1, name "test1" [ 95.484694] UBIFS (ubi0:1): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.484699] UBIFS (ubi0:1): FS size: 30220288 bytes (28 MiB, 238 LEBs), journal size 1523712 bytes (1 MiB, 12 LEBs) [ 95.484703] UBIFS (ubi0:1): reserved for root: 1427378 bytes (1393 KiB) [ 95.484709] UBIFS (ubi0:1): media format: w4/r0 (latest is w4/r0), UUID 40DFFC0E-70BE-4193-8905-F7D6DFE60B17, small LPT model [ 95.489875] UBIFS (ubi1:0): background thread "ubifs_bgt1_0" started, PID 699 [ 95.529713] UBIFS (ubi1:0): UBIFS: mounted UBI device 1, volume 0, name "test2" [ 95.529718] UBIFS (ubi1:0): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.529724] UBIFS (ubi1:0): FS size: 19808256 bytes (18 MiB, 156 LEBs), journal size 1015809 bytes (0 MiB, 8 LEBs) [ 95.529727] UBIFS (ubi1:0): reserved for root: 935592 bytes (913 KiB) [ 95.529733] UBIFS (ubi1:0): media format: w4/r0 (latest is w4/r0), UUID EEB7779D-F419-4CA9-811B-831CAC7233D4, small LPT model [ 954.264767] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node type (255 but expected 6) [ 954.367030] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node at LEB 0:0, LEB mapping status 1 Signed-off-by: Sheng Yong <shengyong1@huawei.com> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
2015-03-20 10:39:42 +00:00
ubifs_err(c, "cannot create regular file, error %d", err);
return err;
}
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
{
int err;
umode_t mode = S_IFCHR | WHITEOUT_MODE;
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
/*
* Create an inode('nlink = 1') for whiteout without updating journal,
* let ubifs_jnl_rename() store it on flash to complete rename whiteout
* atomically.
*/
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
dentry, mode, dir->i_ino);
ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2022-07-19 08:00:17 +00:00
inode = ubifs_new_inode(c, dir, mode, false);
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_free;
}
init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations);
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
goto out_inode;
/* The dir size is updated by do_rename. */
insert_inode_hash(inode);
return inode;
out_inode:
make_bad_inode(inode);
iput(inode);
out_free:
ubifs_err(c, "cannot create whiteout file, error %d", err);
return ERR_PTR(err);
}
ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work 'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile() which may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty', finally dirty space is released twice. open(O_TMPFILE) wb_workfn do_tmpfile ubifs_budget_space(ino_req = { .dirtied_ino = 1}) d_tmpfile // mark inode(tmpfile) dirty ubifs_jnl_update // without holding tmpfile's ui_mutex mark_inode_clean(ui) if (ui->dirty) ubifs_release_dirty_inode_budget(ui) // release first time ubifs_write_inode mutex_lock(&ui->ui_mutex) ubifs_release_dirty_inode_budget(ui) // release second time mutex_unlock(&ui->ui_mutex) ui->dirty = 0 Run generic/476 can reproduce following message easily (See reproducer in [Link]): UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554 UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to read-only mode, error -22 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: ubifs_ro_mode+0x54/0x60 [ubifs] ubifs_assert_failed+0x4b/0x80 [ubifs] ubifs_release_budget+0x468/0x5a0 [ubifs] ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs] ubifs_write_inode+0x121/0x1f0 [ubifs] ... wb_workfn+0x283/0x7b0 Fix it by holding tmpfile ubifs inode lock during ubifs_jnl_update(). Similar problem exists in whiteout renaming, but previous fix("ubifs: Rename whiteout atomically") has solved the problem. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765 Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:37 +00:00
/**
* lock_2_inodes - a wrapper for locking two UBIFS inodes.
* @inode1: first inode
* @inode2: second inode
*
* We do not implement any tricks to guarantee strict lock ordering, because
* VFS has already done it for us on the @i_mutex. So this is just a simple
* wrapper function.
*/
static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
{
mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
}
/**
* unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
* @inode1: first inode
* @inode2: second inode
*/
static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
{
mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
}
static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
struct file *file, umode_t mode)
{
struct dentry *dentry = file->f_path.dentry;
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.dirtied_ino = 1};
struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work 'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile() which may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty', finally dirty space is released twice. open(O_TMPFILE) wb_workfn do_tmpfile ubifs_budget_space(ino_req = { .dirtied_ino = 1}) d_tmpfile // mark inode(tmpfile) dirty ubifs_jnl_update // without holding tmpfile's ui_mutex mark_inode_clean(ui) if (ui->dirty) ubifs_release_dirty_inode_budget(ui) // release first time ubifs_write_inode mutex_lock(&ui->ui_mutex) ubifs_release_dirty_inode_budget(ui) // release second time mutex_unlock(&ui->ui_mutex) ui->dirty = 0 Run generic/476 can reproduce following message easily (See reproducer in [Link]): UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554 UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to read-only mode, error -22 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: ubifs_ro_mode+0x54/0x60 [ubifs] ubifs_assert_failed+0x4b/0x80 [ubifs] ubifs_release_budget+0x468/0x5a0 [ubifs] ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs] ubifs_write_inode+0x121/0x1f0 [ubifs] ... wb_workfn+0x283/0x7b0 Fix it by holding tmpfile ubifs inode lock during ubifs_jnl_update(). Similar problem exists in whiteout renaming, but previous fix("ubifs: Rename whiteout atomically") has solved the problem. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765 Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:37 +00:00
struct ubifs_inode *ui;
int err, instantiated = 0;
struct fscrypt_name nm;
/*
* Budget request settings: new inode, new direntry, changing the
* parent directory inode.
* Allocate budget separately for new dirtied inode, the budget will
* be released via writeback.
*/
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
dentry, mode, dir->i_ino);
err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
if (err)
return err;
err = ubifs_budget_space(c, &req);
if (err) {
fscrypt_free_filename(&nm);
return err;
}
err = ubifs_budget_space(c, &ino_req);
if (err) {
ubifs_release_budget(c, &req);
fscrypt_free_filename(&nm);
return err;
}
ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2022-07-19 08:00:17 +00:00
inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_budg;
}
ui = ubifs_inode(inode);
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
goto out_inode;
mutex_lock(&ui->ui_mutex);
insert_inode_hash(inode);
d_tmpfile(file, inode);
ubifs_assert(c, ui->dirty);
instantiated = 1;
mutex_unlock(&ui->ui_mutex);
ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work 'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile() which may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty', finally dirty space is released twice. open(O_TMPFILE) wb_workfn do_tmpfile ubifs_budget_space(ino_req = { .dirtied_ino = 1}) d_tmpfile // mark inode(tmpfile) dirty ubifs_jnl_update // without holding tmpfile's ui_mutex mark_inode_clean(ui) if (ui->dirty) ubifs_release_dirty_inode_budget(ui) // release first time ubifs_write_inode mutex_lock(&ui->ui_mutex) ubifs_release_dirty_inode_budget(ui) // release second time mutex_unlock(&ui->ui_mutex) ui->dirty = 0 Run generic/476 can reproduce following message easily (See reproducer in [Link]): UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554 UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to read-only mode, error -22 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: ubifs_ro_mode+0x54/0x60 [ubifs] ubifs_assert_failed+0x4b/0x80 [ubifs] ubifs_release_budget+0x468/0x5a0 [ubifs] ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs] ubifs_write_inode+0x121/0x1f0 [ubifs] ... wb_workfn+0x283/0x7b0 Fix it by holding tmpfile ubifs inode lock during ubifs_jnl_update(). Similar problem exists in whiteout renaming, but previous fix("ubifs: Rename whiteout atomically") has solved the problem. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765 Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:37 +00:00
lock_2_inodes(dir, inode);
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work 'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile() which may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty', finally dirty space is released twice. open(O_TMPFILE) wb_workfn do_tmpfile ubifs_budget_space(ino_req = { .dirtied_ino = 1}) d_tmpfile // mark inode(tmpfile) dirty ubifs_jnl_update // without holding tmpfile's ui_mutex mark_inode_clean(ui) if (ui->dirty) ubifs_release_dirty_inode_budget(ui) // release first time ubifs_write_inode mutex_lock(&ui->ui_mutex) ubifs_release_dirty_inode_budget(ui) // release second time mutex_unlock(&ui->ui_mutex) ui->dirty = 0 Run generic/476 can reproduce following message easily (See reproducer in [Link]): UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554 UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to read-only mode, error -22 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: ubifs_ro_mode+0x54/0x60 [ubifs] ubifs_assert_failed+0x4b/0x80 [ubifs] ubifs_release_budget+0x468/0x5a0 [ubifs] ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs] ubifs_write_inode+0x121/0x1f0 [ubifs] ... wb_workfn+0x283/0x7b0 Fix it by holding tmpfile ubifs inode lock during ubifs_jnl_update(). Similar problem exists in whiteout renaming, but previous fix("ubifs: Rename whiteout atomically") has solved the problem. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765 Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:37 +00:00
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
fscrypt_free_filename(&nm);
return finish_open_simple(file, 0);
out_cancel:
ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work 'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile() which may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty', finally dirty space is released twice. open(O_TMPFILE) wb_workfn do_tmpfile ubifs_budget_space(ino_req = { .dirtied_ino = 1}) d_tmpfile // mark inode(tmpfile) dirty ubifs_jnl_update // without holding tmpfile's ui_mutex mark_inode_clean(ui) if (ui->dirty) ubifs_release_dirty_inode_budget(ui) // release first time ubifs_write_inode mutex_lock(&ui->ui_mutex) ubifs_release_dirty_inode_budget(ui) // release second time mutex_unlock(&ui->ui_mutex) ui->dirty = 0 Run generic/476 can reproduce following message easily (See reproducer in [Link]): UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554 UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to read-only mode, error -22 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: ubifs_ro_mode+0x54/0x60 [ubifs] ubifs_assert_failed+0x4b/0x80 [ubifs] ubifs_release_budget+0x468/0x5a0 [ubifs] ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs] ubifs_write_inode+0x121/0x1f0 [ubifs] ... wb_workfn+0x283/0x7b0 Fix it by holding tmpfile ubifs inode lock during ubifs_jnl_update(). Similar problem exists in whiteout renaming, but previous fix("ubifs: Rename whiteout atomically") has solved the problem. Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765 Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:37 +00:00
unlock_2_inodes(dir, inode);
out_inode:
make_bad_inode(inode);
if (!instantiated)
iput(inode);
out_budg:
ubifs_release_budget(c, &req);
if (!instantiated)
ubifs_release_budget(c, &ino_req);
fscrypt_free_filename(&nm);
ubifs_err(c, "cannot create temporary file, error %d", err);
return err;
}
/**
* vfs_dent_type - get VFS directory entry type.
* @type: UBIFS directory entry type
*
* This function converts UBIFS directory entry type into VFS directory entry
* type.
*/
static unsigned int vfs_dent_type(uint8_t type)
{
switch (type) {
case UBIFS_ITYPE_REG:
return DT_REG;
case UBIFS_ITYPE_DIR:
return DT_DIR;
case UBIFS_ITYPE_LNK:
return DT_LNK;
case UBIFS_ITYPE_BLK:
return DT_BLK;
case UBIFS_ITYPE_CHR:
return DT_CHR;
case UBIFS_ITYPE_FIFO:
return DT_FIFO;
case UBIFS_ITYPE_SOCK:
return DT_SOCK;
default:
BUG();
}
return 0;
}
/*
* The classical Unix view for directory is that it is a linear array of
* (name, inode number) entries. Linux/VFS assumes this model as well.
* Particularly, 'readdir()' call wants us to return a directory entry offset
* which later may be used to continue 'readdir()'ing the directory or to
* 'seek()' to that specific direntry. Obviously UBIFS does not really fit this
* model because directory entries are identified by keys, which may collide.
*
* UBIFS uses directory entry hash value for directory offsets, so
* 'seekdir()'/'telldir()' may not always work because of possible key
* collisions. But UBIFS guarantees that consecutive 'readdir()' calls work
* properly by means of saving full directory entry name in the private field
* of the file description object.
*
* This means that UBIFS cannot support NFS which requires full
* 'seekdir()'/'telldir()' support.
*/
static int ubifs_readdir(struct file *file, struct dir_context *ctx)
{
int fstr_real_len = 0, err = 0;
struct fscrypt_name nm;
struct fscrypt_str fstr = {0};
union ubifs_key key;
struct ubifs_dent_node *dent;
struct inode *dir = file_inode(file);
struct ubifs_info *c = dir->i_sb->s_fs_info;
bool encrypted = IS_ENCRYPTED(dir);
dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2)
/*
* The directory was seek'ed to a senseless position or there
* are no more entries.
*/
return 0;
if (encrypted) {
err = fscrypt_prepare_readdir(dir);
if (err)
return err;
err = fscrypt_fname_alloc_buffer(UBIFS_MAX_NLEN, &fstr);
if (err)
return err;
fstr_real_len = fstr.len;
}
if (file->f_version == 0) {
/*
* The file was seek'ed, which means that @file->private_data
* is now invalid. This may also be just the first
* 'ubifs_readdir()' invocation, in which case
* @file->private_data is NULL, and the below code is
* basically a no-op.
*/
kfree(file->private_data);
file->private_data = NULL;
}
/*
* 'generic_file_llseek()' unconditionally sets @file->f_version to
* zero, and we use this for detecting whether the file was seek'ed.
*/
file->f_version = 1;
/* File positions 0 and 1 correspond to "." and ".." */
if (ctx->pos < 2) {
ubifs_assert(c, !file->private_data);
if (!dir_emit_dots(file, ctx)) {
if (encrypted)
fscrypt_fname_free_buffer(&fstr);
return 0;
}
/* Find the first entry in TNC and save it */
lowest_dent_key(c, &key, dir->i_ino);
fname_len(&nm) = 0;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
goto out;
}
ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
}
dent = file->private_data;
if (!dent) {
/*
* The directory was seek'ed to and is now readdir'ed.
* Find the entry corresponding to @ctx->pos or the closest one.
*/
dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
fname_len(&nm) = 0;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
goto out;
}
ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
}
while (1) {
dbg_gen("ino %llu, new f_pos %#x",
(unsigned long long)le64_to_cpu(dent->inum),
key_hash_flash(c, &dent->key));
ubifs_assert(c, le64_to_cpu(dent->ch.sqnum) >
UBIFS: endian handling fixes and annotations Noticed by sparse: fs/ubifs/file.c:75:2: warning: restricted __le64 degrades to integer fs/ubifs/file.c:629:4: warning: restricted __le64 degrades to integer fs/ubifs/dir.c:431:3: warning: restricted __le64 degrades to integer This should be checked to ensure the ubifs_assert is working as intended, I've done the suggested annotation in this patch. fs/ubifs/sb.c:298:6: warning: incorrect type in assignment (different base types) fs/ubifs/sb.c:298:6: expected int [signed] [assigned] tmp fs/ubifs/sb.c:298:6: got restricted __le64 [usertype] <noident> fs/ubifs/sb.c:299:19: warning: incorrect type in assignment (different base types) fs/ubifs/sb.c:299:19: expected restricted __le64 [usertype] atime_sec fs/ubifs/sb.c:299:19: got int [signed] [assigned] tmp fs/ubifs/sb.c:300:19: warning: incorrect type in assignment (different base types) fs/ubifs/sb.c:300:19: expected restricted __le64 [usertype] ctime_sec fs/ubifs/sb.c:300:19: got int [signed] [assigned] tmp fs/ubifs/sb.c:301:19: warning: incorrect type in assignment (different base types) fs/ubifs/sb.c:301:19: expected restricted __le64 [usertype] mtime_sec fs/ubifs/sb.c:301:19: got int [signed] [assigned] tmp This looks like a bugfix as your tmp was a u32 so there was truncation in the atime, mtime, ctime value, probably not intentional, add a tmp_le64 and use it here. fs/ubifs/key.h:348:9: warning: cast to restricted __le32 fs/ubifs/key.h:348:9: warning: cast to restricted __le32 fs/ubifs/key.h:419:9: warning: cast to restricted __le32 Read from the annotated union member instead. fs/ubifs/recovery.c:175:13: warning: incorrect type in assignment (different base types) fs/ubifs/recovery.c:175:13: expected unsigned int [unsigned] [usertype] save_flags fs/ubifs/recovery.c:175:13: got restricted __le32 [usertype] flags fs/ubifs/recovery.c:186:13: warning: incorrect type in assignment (different base types) fs/ubifs/recovery.c:186:13: expected restricted __le32 [usertype] flags fs/ubifs/recovery.c:186:13: got unsigned int [unsigned] [usertype] save_flags Do byteshifting at compile time of the flag value. Annotate the saved_flags as le32. fs/ubifs/debug.c:368:10: warning: cast to restricted __le32 fs/ubifs/debug.c:368:10: warning: cast from restricted __le64 Should be checked if the truncation was intentional, I've changed the printk to print the full width. Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com> Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
2008-10-24 17:52:57 +00:00
ubifs_inode(dir)->creat_sqnum);
fname_len(&nm) = le16_to_cpu(dent->nlen);
fname_name(&nm) = dent->name;
if (encrypted) {
fstr.len = fstr_real_len;
err = fscrypt_fname_disk_to_usr(dir, key_hash_flash(c,
&dent->key),
le32_to_cpu(dent->cookie),
&nm.disk_name, &fstr);
if (err)
goto out;
} else {
fstr.len = fname_len(&nm);
fstr.name = fname_name(&nm);
}
if (!dir_emit(ctx, fstr.name, fstr.len,
le64_to_cpu(dent->inum),
vfs_dent_type(dent->type))) {
if (encrypted)
fscrypt_fname_free_buffer(&fstr);
return 0;
}
/* Switch to the next entry */
key_read(c, &dent->key, &key);
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
goto out;
}
kfree(file->private_data);
ctx->pos = key_hash_flash(c, &dent->key);
file->private_data = dent;
cond_resched();
}
out:
kfree(file->private_data);
file->private_data = NULL;
if (encrypted)
fscrypt_fname_free_buffer(&fstr);
if (err != -ENOENT)
UBIFS: extend debug/message capabilities In the case where we have more than one volumes on different UBI devices, it may be not that easy to tell which volume prints the messages. Add ubi number and volume id in ubifs_msg/warn/error to help debug. These two values are passed by struct ubifs_info. For those where ubifs_info is not initialized yet, ubifs_* is replaced by pr_*. For those where ubifs_info is not avaliable, ubifs_info is passed to the calling function as a const parameter. The output looks like, [ 95.444879] UBIFS (ubi0:1): background thread "ubifs_bgt0_1" started, PID 696 [ 95.484688] UBIFS (ubi0:1): UBIFS: mounted UBI device 0, volume 1, name "test1" [ 95.484694] UBIFS (ubi0:1): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.484699] UBIFS (ubi0:1): FS size: 30220288 bytes (28 MiB, 238 LEBs), journal size 1523712 bytes (1 MiB, 12 LEBs) [ 95.484703] UBIFS (ubi0:1): reserved for root: 1427378 bytes (1393 KiB) [ 95.484709] UBIFS (ubi0:1): media format: w4/r0 (latest is w4/r0), UUID 40DFFC0E-70BE-4193-8905-F7D6DFE60B17, small LPT model [ 95.489875] UBIFS (ubi1:0): background thread "ubifs_bgt1_0" started, PID 699 [ 95.529713] UBIFS (ubi1:0): UBIFS: mounted UBI device 1, volume 0, name "test2" [ 95.529718] UBIFS (ubi1:0): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.529724] UBIFS (ubi1:0): FS size: 19808256 bytes (18 MiB, 156 LEBs), journal size 1015809 bytes (0 MiB, 8 LEBs) [ 95.529727] UBIFS (ubi1:0): reserved for root: 935592 bytes (913 KiB) [ 95.529733] UBIFS (ubi1:0): media format: w4/r0 (latest is w4/r0), UUID EEB7779D-F419-4CA9-811B-831CAC7233D4, small LPT model [ 954.264767] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node type (255 but expected 6) [ 954.367030] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node at LEB 0:0, LEB mapping status 1 Signed-off-by: Sheng Yong <shengyong1@huawei.com> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
2015-03-20 10:39:42 +00:00
ubifs_err(c, "cannot find next direntry, error %d", err);
else
/*
* -ENOENT is a non-fatal error in this context, the TNC uses
* it to indicate that the cursor moved past the current directory
* and readdir() has to stop.
*/
err = 0;
/* 2 is a special value indicating that there are no more direntries */
ctx->pos = 2;
return err;
}
/* Free saved readdir() state when the directory is closed */
static int ubifs_dir_release(struct inode *dir, struct file *file)
{
kfree(file->private_data);
file->private_data = NULL;
return 0;
}
static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct inode *inode = d_inode(old_dentry);
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
int err, sz_change;
struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
struct fscrypt_name nm;
/*
* Budget request settings: new direntry, changing the target inode,
* changing the parent inode.
*/
dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu",
dentry, inode->i_ino,
inode->i_nlink, dir->i_ino);
ubifs_assert(c, inode_is_locked(dir));
ubifs_assert(c, inode_is_locked(inode));
err = fscrypt_prepare_link(old_dentry, dir, dentry);
if (err)
return err;
err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
if (err)
return err;
sz_change = CALC_DENT_SIZE(fname_len(&nm));
err = dbg_check_synced_i_size(c, inode);
if (err)
goto out_fname;
err = ubifs_budget_space(c, &req);
if (err)
goto out_fname;
lock_2_inodes(dir, inode);
/* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. */
if (inode->i_nlink == 0)
ubifs_delete_orphan(c, inode->i_ino);
inc_nlink(inode);
ihold(inode);
inode_set_ctime_current(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
inode_set_mtime_to_ts(dir,
inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
d_instantiate(dentry, inode);
fscrypt_free_filename(&nm);
return 0;
out_cancel:
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
drop_nlink(inode);
if (inode->i_nlink == 0)
ubifs_add_orphan(c, inode->i_ino);
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
iput(inode);
out_fname:
fscrypt_free_filename(&nm);
return err;
}
static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
{
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct inode *inode = d_inode(dentry);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
int err, sz_change, budgeted = 1;
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
unsigned int saved_nlink = inode->i_nlink;
struct fscrypt_name nm;
/*
* Budget request settings: deletion direntry, deletion inode (+1 for
* @dirtied_ino), changing the parent directory inode. If budgeting
* fails, go ahead anyway because we have extra space reserved for
* deletions.
*/
dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu",
dentry, inode->i_ino,
inode->i_nlink, dir->i_ino);
err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
if (err)
return err;
err = ubifs_purge_xattrs(inode);
if (err)
return err;
sz_change = CALC_DENT_SIZE(fname_len(&nm));
ubifs_assert(c, inode_is_locked(dir));
ubifs_assert(c, inode_is_locked(inode));
err = dbg_check_synced_i_size(c, inode);
if (err)
goto out_fname;
err = ubifs_budget_space(c, &req);
if (err) {
if (err != -ENOSPC)
goto out_fname;
budgeted = 0;
}
lock_2_inodes(dir, inode);
inode_set_ctime_current(inode);
drop_nlink(inode);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
inode_set_mtime_to_ts(dir,
inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
else {
/* We've deleted something - clean the "no space" flags */
c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
fscrypt_free_filename(&nm);
return 0;
out_cancel:
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
set_nlink(inode, saved_nlink);
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
out_fname:
fscrypt_free_filename(&nm);
return err;
}
/**
* ubifs_check_dir_empty - check if a directory is empty or not.
* @dir: VFS inode object of the directory to check
*
* This function checks if directory @dir is empty. Returns zero if the
* directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
* in case of errors.
*/
int ubifs_check_dir_empty(struct inode *dir)
{
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct fscrypt_name nm = { 0 };
struct ubifs_dent_node *dent;
union ubifs_key key;
int err;
lowest_dent_key(c, &key, dir->i_ino);
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
if (err == -ENOENT)
err = 0;
} else {
kfree(dent);
err = -ENOTEMPTY;
}
return err;
}
static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct inode *inode = d_inode(dentry);
int err, sz_change, budgeted = 1;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
struct fscrypt_name nm;
/*
* Budget request settings: deletion direntry, deletion inode and
* changing the parent inode. If budgeting fails, go ahead anyway
* because we have extra space reserved for deletions.
*/
dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry,
inode->i_ino, dir->i_ino);
ubifs_assert(c, inode_is_locked(dir));
ubifs_assert(c, inode_is_locked(inode));
err = ubifs_check_dir_empty(d_inode(dentry));
if (err)
return err;
err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
if (err)
return err;
err = ubifs_purge_xattrs(inode);
if (err)
return err;
sz_change = CALC_DENT_SIZE(fname_len(&nm));
err = ubifs_budget_space(c, &req);
if (err) {
if (err != -ENOSPC)
goto out_fname;
budgeted = 0;
}
lock_2_inodes(dir, inode);
inode_set_ctime_current(inode);
clear_nlink(inode);
drop_nlink(dir);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
inode_set_mtime_to_ts(dir,
inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
else {
/* We've deleted something - clean the "no space" flags */
c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
fscrypt_free_filename(&nm);
return 0;
out_cancel:
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
inc_nlink(dir);
set_nlink(inode, 2);
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
out_fname:
fscrypt_free_filename(&nm);
return err;
}
static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
int err, sz_change;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.dirtied_ino = 1};
struct fscrypt_name nm;
/*
* Budget request settings: new inode, new direntry and changing parent
* directory inode.
*/
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
dentry, mode, dir->i_ino);
err = ubifs_budget_space(c, &req);
if (err)
return err;
err = ubifs_prepare_create(dir, dentry, &nm);
if (err)
goto out_budg;
sz_change = CALC_DENT_SIZE(fname_len(&nm));
ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2022-07-19 08:00:17 +00:00
inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fname;
}
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
goto out_inode;
mutex_lock(&dir_ui->ui_mutex);
insert_inode_hash(inode);
inc_nlink(inode);
inc_nlink(dir);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
inode_set_mtime_to_ts(dir,
inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err) {
UBIFS: extend debug/message capabilities In the case where we have more than one volumes on different UBI devices, it may be not that easy to tell which volume prints the messages. Add ubi number and volume id in ubifs_msg/warn/error to help debug. These two values are passed by struct ubifs_info. For those where ubifs_info is not initialized yet, ubifs_* is replaced by pr_*. For those where ubifs_info is not avaliable, ubifs_info is passed to the calling function as a const parameter. The output looks like, [ 95.444879] UBIFS (ubi0:1): background thread "ubifs_bgt0_1" started, PID 696 [ 95.484688] UBIFS (ubi0:1): UBIFS: mounted UBI device 0, volume 1, name "test1" [ 95.484694] UBIFS (ubi0:1): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.484699] UBIFS (ubi0:1): FS size: 30220288 bytes (28 MiB, 238 LEBs), journal size 1523712 bytes (1 MiB, 12 LEBs) [ 95.484703] UBIFS (ubi0:1): reserved for root: 1427378 bytes (1393 KiB) [ 95.484709] UBIFS (ubi0:1): media format: w4/r0 (latest is w4/r0), UUID 40DFFC0E-70BE-4193-8905-F7D6DFE60B17, small LPT model [ 95.489875] UBIFS (ubi1:0): background thread "ubifs_bgt1_0" started, PID 699 [ 95.529713] UBIFS (ubi1:0): UBIFS: mounted UBI device 1, volume 0, name "test2" [ 95.529718] UBIFS (ubi1:0): LEB size: 126976 bytes (124 KiB), min./max. I/O unit sizes: 2048 bytes/2048 bytes [ 95.529724] UBIFS (ubi1:0): FS size: 19808256 bytes (18 MiB, 156 LEBs), journal size 1015809 bytes (0 MiB, 8 LEBs) [ 95.529727] UBIFS (ubi1:0): reserved for root: 935592 bytes (913 KiB) [ 95.529733] UBIFS (ubi1:0): media format: w4/r0 (latest is w4/r0), UUID EEB7779D-F419-4CA9-811B-831CAC7233D4, small LPT model [ 954.264767] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node type (255 but expected 6) [ 954.367030] UBIFS error (ubi1:0 pid 756): ubifs_read_node: bad node at LEB 0:0, LEB mapping status 1 Signed-off-by: Sheng Yong <shengyong1@huawei.com> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
2015-03-20 10:39:42 +00:00
ubifs_err(c, "cannot create directory, error %d", err);
goto out_cancel;
}
mutex_unlock(&dir_ui->ui_mutex);
ubifs_release_budget(c, &req);
d_instantiate(dentry, inode);
fscrypt_free_filename(&nm);
return 0;
out_cancel:
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
drop_nlink(dir);
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
make_bad_inode(inode);
iput(inode);
out_fname:
fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
return err;
}
static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct inode *inode;
struct ubifs_inode *ui;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
union ubifs_dev_desc *dev = NULL;
int sz_change;
int err, devlen = 0;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.dirtied_ino = 1 };
struct fscrypt_name nm;
/*
* Budget request settings: new inode, new direntry and changing parent
* directory inode.
*/
dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino);
if (S_ISBLK(mode) || S_ISCHR(mode)) {
dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
if (!dev)
return -ENOMEM;
devlen = ubifs_encode_dev(dev, rdev);
}
req.new_ino_d = ALIGN(devlen, 8);
err = ubifs_budget_space(c, &req);
if (err) {
kfree(dev);
return err;
}
err = ubifs_prepare_create(dir, dentry, &nm);
if (err) {
kfree(dev);
goto out_budg;
}
sz_change = CALC_DENT_SIZE(fname_len(&nm));
ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2022-07-19 08:00:17 +00:00
inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
kfree(dev);
err = PTR_ERR(inode);
goto out_fname;
}
init_special_inode(inode, inode->i_mode, rdev);
inode->i_size = ubifs_inode(inode)->ui_size = devlen;
ui = ubifs_inode(inode);
ui->data = dev;
ui->data_len = devlen;
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
goto out_inode;
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
inode_set_mtime_to_ts(dir,
inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
ubifs_release_budget(c, &req);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
fscrypt_free_filename(&nm);
return 0;
out_cancel:
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
make_bad_inode(inode);
iput(inode);
out_fname:
fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
return err;
}
static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
struct inode *inode;
struct ubifs_inode *ui;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
int err, sz_change, len = strlen(symname);
struct fscrypt_str disk_link;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.dirtied_ino = 1 };
struct fscrypt_name nm;
dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry,
symname, dir->i_ino);
err = fscrypt_prepare_symlink(dir, symname, len, UBIFS_MAX_INO_DATA,
&disk_link);
if (err)
return err;
/*
* Budget request settings: new inode, new direntry and changing parent
* directory inode.
*/
req.new_ino_d = ALIGN(disk_link.len - 1, 8);
err = ubifs_budget_space(c, &req);
if (err)
return err;
err = ubifs_prepare_create(dir, dentry, &nm);
if (err)
goto out_budg;
sz_change = CALC_DENT_SIZE(fname_len(&nm));
ubifs: Fix AA deadlock when setting xattr for encrypted file Following process: vfs_setxattr(host) ubifs_xattr_set down_write(host_ui->xattr_sem) <- lock first time create_xattr ubifs_new_inode(host) fscrypt_prepare_new_inode(host) fscrypt_policy_to_inherit(host) if (IS_ENCRYPTED(inode)) fscrypt_require_key(host) fscrypt_get_encryption_info(host) ubifs_xattr_get(host) down_read(host_ui->xattr_sem) <- AA deadlock , which may trigger an AA deadlock problem: [ 102.620871] INFO: task setfattr:1599 blocked for more than 10 seconds. [ 102.625298] Not tainted 5.19.0-rc7-00001-gb666b6823ce0-dirty #711 [ 102.628732] task:setfattr state:D stack: 0 pid: 1599 [ 102.628749] Call Trace: [ 102.628753] <TASK> [ 102.628776] __schedule+0x482/0x1060 [ 102.629964] schedule+0x92/0x1a0 [ 102.629976] rwsem_down_read_slowpath+0x287/0x8c0 [ 102.629996] down_read+0x84/0x170 [ 102.630585] ubifs_xattr_get+0xd1/0x370 [ubifs] [ 102.630730] ubifs_crypt_get_context+0x1f/0x30 [ubifs] [ 102.630791] fscrypt_get_encryption_info+0x7d/0x1c0 [ 102.630810] fscrypt_policy_to_inherit+0x56/0xc0 [ 102.630817] fscrypt_prepare_new_inode+0x35/0x160 [ 102.630830] ubifs_new_inode+0xcc/0x4b0 [ubifs] [ 102.630873] ubifs_xattr_set+0x591/0x9f0 [ubifs] [ 102.630961] xattr_set+0x8c/0x3e0 [ubifs] [ 102.631003] __vfs_setxattr+0x71/0xc0 [ 102.631026] vfs_setxattr+0x105/0x270 [ 102.631034] do_setxattr+0x6d/0x110 [ 102.631041] setxattr+0xa0/0xd0 [ 102.631087] __x64_sys_setxattr+0x2f/0x40 Fetch a reproducer in [Link]. Just like ext4 does, which skips encrypting for inode with EXT4_EA_INODE_FL flag. Stop encypting xattr inode for ubifs. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216260 Fixes: f4e3634a3b64222 ("ubifs: Fix races between xattr_{set|get} ...") Fixes: d475a507457b5ca ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2022-07-19 08:00:17 +00:00
inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto out_fname;
}
ui = ubifs_inode(inode);
ui->data = kmalloc(disk_link.len, GFP_NOFS);
if (!ui->data) {
err = -ENOMEM;
goto out_inode;
}
if (IS_ENCRYPTED(inode)) {
disk_link.name = ui->data; /* encrypt directly into ui->data */
err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link);
if (err)
goto out_inode;
} else {
memcpy(ui->data, disk_link.name, disk_link.len);
inode->i_link = ui->data;
}
/*
* The terminating zero byte is not written to the flash media and it
* is put just to make later in-memory string processing simpler. Thus,
* data length is @disk_link.len - 1, not @disk_link.len.
*/
ui->data_len = disk_link.len - 1;
inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1;
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
goto out_inode;
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
inode_set_mtime_to_ts(dir,
inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
err = 0;
goto out_fname;
out_cancel:
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
2023-12-22 08:54:46 +00:00
/* Free inode->i_link before inode is marked as bad. */
fscrypt_free_inode(inode);
make_bad_inode(inode);
iput(inode);
out_fname:
fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
return err;
}
/**
* lock_4_inodes - a wrapper for locking three UBIFS inodes.
* @inode1: first inode
* @inode2: second inode
* @inode3: third inode
* @inode4: fourth inode
*
* This function is used for 'ubifs_rename()' and @inode1 may be the same as
* @inode2 whereas @inode3 and @inode4 may be %NULL.
*
* We do not implement any tricks to guarantee strict lock ordering, because
* VFS has already done it for us on the @i_mutex. So this is just a simple
* wrapper function.
*/
static void lock_4_inodes(struct inode *inode1, struct inode *inode2,
struct inode *inode3, struct inode *inode4)
{
mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
if (inode2 != inode1)
mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
if (inode3)
mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
if (inode4)
mutex_lock_nested(&ubifs_inode(inode4)->ui_mutex, WB_MUTEX_4);
}
/**
* unlock_4_inodes - a wrapper for unlocking three UBIFS inodes for rename.
* @inode1: first inode
* @inode2: second inode
* @inode3: third inode
* @inode4: fourth inode
*/
static void unlock_4_inodes(struct inode *inode1, struct inode *inode2,
struct inode *inode3, struct inode *inode4)
{
if (inode4)
mutex_unlock(&ubifs_inode(inode4)->ui_mutex);
if (inode3)
mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
if (inode1 != inode2)
mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
}
static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
struct ubifs_info *c = old_dir->i_sb->s_fs_info;
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
struct inode *whiteout = NULL;
struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
struct ubifs_inode *whiteout_ui = NULL;
int err, release, sync = 0, move = (new_dir != old_dir);
int is_dir = S_ISDIR(old_inode->i_mode);
int unlink = !!new_inode, new_sz, old_sz;
struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
.dirtied_ino = 3 };
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
struct ubifs_budget_req wht_req;
treewide: Remove uninitialized_var() usage Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. [1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky <leonro@mellanox.com> # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe <jgg@mellanox.com> # IB Acked-by: Kalle Valo <kvalo@codeaurora.org> # wireless drivers Reviewed-by: Chao Yu <yuchao0@huawei.com> # erofs Signed-off-by: Kees Cook <keescook@chromium.org>
2020-06-03 20:09:38 +00:00
unsigned int saved_nlink;
struct fscrypt_name old_nm, new_nm;
/*
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
* Budget request settings:
* req: deletion direntry, new direntry, removing the old inode,
* and changing old and new parent directory inodes.
*
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
* wht_req: new whiteout inode for RENAME_WHITEOUT.
*
* ino_req: marks the target inode as dirty and does not write it.
*/
dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x",
old_dentry, old_inode->i_ino, old_dir->i_ino,
new_dentry, new_dir->i_ino, flags);
if (unlink) {
ubifs_assert(c, inode_is_locked(new_inode));
/* Budget for old inode's data when its nlink > 1. */
req.dirtied_ino_d = ALIGN(ubifs_inode(new_inode)->data_len, 8);
err = ubifs_purge_xattrs(new_inode);
if (err)
return err;
}
if (unlink && is_dir) {
err = ubifs_check_dir_empty(new_inode);
if (err)
return err;
}
err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_nm);
if (err)
return err;
err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_nm);
if (err) {
fscrypt_free_filename(&old_nm);
return err;
}
new_sz = CALC_DENT_SIZE(fname_len(&new_nm));
old_sz = CALC_DENT_SIZE(fname_len(&old_nm));
err = ubifs_budget_space(c, &req);
if (err) {
fscrypt_free_filename(&old_nm);
fscrypt_free_filename(&new_nm);
return err;
}
err = ubifs_budget_space(c, &ino_req);
if (err) {
fscrypt_free_filename(&old_nm);
fscrypt_free_filename(&new_nm);
ubifs_release_budget(c, &req);
return err;
}
if (flags & RENAME_WHITEOUT) {
union ubifs_dev_desc *dev = NULL;
dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
if (!dev) {
err = -ENOMEM;
goto out_release;
}
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
/*
* The whiteout inode without dentry is pinned in memory,
* umount won't happen during rename process because we
* got parent dentry.
*/
whiteout = create_whiteout(old_dir, old_dentry);
if (IS_ERR(whiteout)) {
err = PTR_ERR(whiteout);
kfree(dev);
goto out_release;
}
whiteout_ui = ubifs_inode(whiteout);
whiteout_ui->data = dev;
whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));
ubifs_assert(c, !whiteout_ui->dirty);
ubifs: Fix deadlock in concurrent rename whiteout and inode writeback Following hung tasks: [ 77.028764] task:kworker/u8:4 state:D stack: 0 pid: 132 [ 77.028820] Call Trace: [ 77.029027] schedule+0x8c/0x1b0 [ 77.029067] mutex_lock+0x50/0x60 [ 77.029074] ubifs_write_inode+0x68/0x1f0 [ubifs] [ 77.029117] __writeback_single_inode+0x43c/0x570 [ 77.029128] writeback_sb_inodes+0x259/0x740 [ 77.029148] wb_writeback+0x107/0x4d0 [ 77.029163] wb_workfn+0x162/0x7b0 [ 92.390442] task:aa state:D stack: 0 pid: 1506 [ 92.390448] Call Trace: [ 92.390458] schedule+0x8c/0x1b0 [ 92.390461] wb_wait_for_completion+0x82/0xd0 [ 92.390469] __writeback_inodes_sb_nr+0xb2/0x110 [ 92.390472] writeback_inodes_sb_nr+0x14/0x20 [ 92.390476] ubifs_budget_space+0x705/0xdd0 [ubifs] [ 92.390503] do_rename.cold+0x7f/0x187 [ubifs] [ 92.390549] ubifs_rename+0x8b/0x180 [ubifs] [ 92.390571] vfs_rename+0xdb2/0x1170 [ 92.390580] do_renameat2+0x554/0x770 , are caused by concurrent rename whiteout and inode writeback processes: rename_whiteout(Thread 1) wb_workfn(Thread2) ubifs_rename do_rename lock_4_inodes (Hold ui_mutex) ubifs_budget_space make_free_space shrink_liability __writeback_inodes_sb_nr bdi_split_work_to_wbs (Queue new wb work) wb_do_writeback(wb work) __writeback_single_inode ubifs_write_inode LOCK(ui_mutex) ↑ wb_wait_for_completion (Wait wb work) <-- deadlock! Reproducer (Detail program in [Link]): 1. SYS_renameat2("/mp/dir/file", "/mp/dir/whiteout", RENAME_WHITEOUT) 2. Consume out of space before kernel(mdelay) doing budget for whiteout Fix it by doing whiteout space budget before locking ubifs inodes. BTW, it also fixes wrong goto tag 'out_release' in whiteout budget error handling path(It should at least recover dir i_size and unlock 4 ubifs inodes). Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214733 Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:33 +00:00
memset(&wht_req, 0, sizeof(struct ubifs_budget_req));
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
wht_req.new_ino = 1;
wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8);
ubifs: Fix deadlock in concurrent rename whiteout and inode writeback Following hung tasks: [ 77.028764] task:kworker/u8:4 state:D stack: 0 pid: 132 [ 77.028820] Call Trace: [ 77.029027] schedule+0x8c/0x1b0 [ 77.029067] mutex_lock+0x50/0x60 [ 77.029074] ubifs_write_inode+0x68/0x1f0 [ubifs] [ 77.029117] __writeback_single_inode+0x43c/0x570 [ 77.029128] writeback_sb_inodes+0x259/0x740 [ 77.029148] wb_writeback+0x107/0x4d0 [ 77.029163] wb_workfn+0x162/0x7b0 [ 92.390442] task:aa state:D stack: 0 pid: 1506 [ 92.390448] Call Trace: [ 92.390458] schedule+0x8c/0x1b0 [ 92.390461] wb_wait_for_completion+0x82/0xd0 [ 92.390469] __writeback_inodes_sb_nr+0xb2/0x110 [ 92.390472] writeback_inodes_sb_nr+0x14/0x20 [ 92.390476] ubifs_budget_space+0x705/0xdd0 [ubifs] [ 92.390503] do_rename.cold+0x7f/0x187 [ubifs] [ 92.390549] ubifs_rename+0x8b/0x180 [ubifs] [ 92.390571] vfs_rename+0xdb2/0x1170 [ 92.390580] do_renameat2+0x554/0x770 , are caused by concurrent rename whiteout and inode writeback processes: rename_whiteout(Thread 1) wb_workfn(Thread2) ubifs_rename do_rename lock_4_inodes (Hold ui_mutex) ubifs_budget_space make_free_space shrink_liability __writeback_inodes_sb_nr bdi_split_work_to_wbs (Queue new wb work) wb_do_writeback(wb work) __writeback_single_inode ubifs_write_inode LOCK(ui_mutex) ↑ wb_wait_for_completion (Wait wb work) <-- deadlock! Reproducer (Detail program in [Link]): 1. SYS_renameat2("/mp/dir/file", "/mp/dir/whiteout", RENAME_WHITEOUT) 2. Consume out of space before kernel(mdelay) doing budget for whiteout Fix it by doing whiteout space budget before locking ubifs inodes. BTW, it also fixes wrong goto tag 'out_release' in whiteout budget error handling path(It should at least recover dir i_size and unlock 4 ubifs inodes). Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214733 Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:33 +00:00
/*
* To avoid deadlock between space budget (holds ui_mutex and
* waits wb work) and writeback work(waits ui_mutex), do space
* budget before ubifs inodes locked.
*/
err = ubifs_budget_space(c, &wht_req);
if (err) {
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
/*
* Whiteout inode can not be written on flash by
* ubifs_jnl_write_inode(), because it's neither
* dirty nor zero-nlink.
*/
ubifs: Fix deadlock in concurrent rename whiteout and inode writeback Following hung tasks: [ 77.028764] task:kworker/u8:4 state:D stack: 0 pid: 132 [ 77.028820] Call Trace: [ 77.029027] schedule+0x8c/0x1b0 [ 77.029067] mutex_lock+0x50/0x60 [ 77.029074] ubifs_write_inode+0x68/0x1f0 [ubifs] [ 77.029117] __writeback_single_inode+0x43c/0x570 [ 77.029128] writeback_sb_inodes+0x259/0x740 [ 77.029148] wb_writeback+0x107/0x4d0 [ 77.029163] wb_workfn+0x162/0x7b0 [ 92.390442] task:aa state:D stack: 0 pid: 1506 [ 92.390448] Call Trace: [ 92.390458] schedule+0x8c/0x1b0 [ 92.390461] wb_wait_for_completion+0x82/0xd0 [ 92.390469] __writeback_inodes_sb_nr+0xb2/0x110 [ 92.390472] writeback_inodes_sb_nr+0x14/0x20 [ 92.390476] ubifs_budget_space+0x705/0xdd0 [ubifs] [ 92.390503] do_rename.cold+0x7f/0x187 [ubifs] [ 92.390549] ubifs_rename+0x8b/0x180 [ubifs] [ 92.390571] vfs_rename+0xdb2/0x1170 [ 92.390580] do_renameat2+0x554/0x770 , are caused by concurrent rename whiteout and inode writeback processes: rename_whiteout(Thread 1) wb_workfn(Thread2) ubifs_rename do_rename lock_4_inodes (Hold ui_mutex) ubifs_budget_space make_free_space shrink_liability __writeback_inodes_sb_nr bdi_split_work_to_wbs (Queue new wb work) wb_do_writeback(wb work) __writeback_single_inode ubifs_write_inode LOCK(ui_mutex) ↑ wb_wait_for_completion (Wait wb work) <-- deadlock! Reproducer (Detail program in [Link]): 1. SYS_renameat2("/mp/dir/file", "/mp/dir/whiteout", RENAME_WHITEOUT) 2. Consume out of space before kernel(mdelay) doing budget for whiteout Fix it by doing whiteout space budget before locking ubifs inodes. BTW, it also fixes wrong goto tag 'out_release' in whiteout budget error handling path(It should at least recover dir i_size and unlock 4 ubifs inodes). Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Link: https://bugzilla.kernel.org/show_bug.cgi?id=214733 Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:33 +00:00
iput(whiteout);
goto out_release;
}
/* Add the old_dentry size to the old_dir size. */
old_sz -= CALC_DENT_SIZE(fname_len(&old_nm));
}
lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
/*
* Like most other Unix systems, set the @i_ctime for inodes on a
* rename.
*/
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
/* We must adjust parent link count when renaming directories */
if (is_dir) {
if (move) {
/*
* @old_dir loses a link because we are moving
* @old_inode to a different directory.
*/
drop_nlink(old_dir);
/*
* @new_dir only gains a link if we are not also
* overwriting an existing directory.
*/
if (!unlink)
inc_nlink(new_dir);
} else {
/*
* @old_inode is not moving to a different directory,
* but @old_dir still loses a link if we are
* overwriting an existing directory.
*/
if (unlink)
drop_nlink(old_dir);
}
}
old_dir->i_size -= old_sz;
ubifs_inode(old_dir)->ui_size = old_dir->i_size;
/*
* And finally, if we unlinked a direntry which happened to have the
* same name as the moved direntry, we have to decrement @i_nlink of
* the unlinked inode.
*/
if (unlink) {
/*
* Directories cannot have hard-links, so if this is a
* directory, just clear @i_nlink.
*/
saved_nlink = new_inode->i_nlink;
if (is_dir)
clear_nlink(new_inode);
else
drop_nlink(new_inode);
} else {
new_dir->i_size += new_sz;
ubifs_inode(new_dir)->ui_size = new_dir->i_size;
}
/*
* Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode
* is dirty, because this will be done later on at the end of
* 'ubifs_rename()'.
*/
if (IS_SYNC(old_inode)) {
sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
if (unlink && IS_SYNC(new_inode))
sync = 1;
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
/*
* S_SYNC flag of whiteout inherits from the old_dir, and we
* have already checked the old dir inode. So there is no need
* to check whiteout.
*/
}
err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir,
new_inode, &new_nm, whiteout, sync);
if (err)
goto out_cancel;
unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
ubifs_release_budget(c, &req);
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
if (whiteout) {
ubifs_release_budget(c, &wht_req);
iput(whiteout);
}
mutex_lock(&old_inode_ui->ui_mutex);
release = old_inode_ui->dirty;
mark_inode_dirty_sync(old_inode);
mutex_unlock(&old_inode_ui->ui_mutex);
if (release)
ubifs_release_budget(c, &ino_req);
if (IS_SYNC(old_inode))
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
/*
* Rename finished here. Although old inode cannot be updated
* on flash, old ctime is not a big problem, don't return err
* code to userspace.
*/
old_inode->i_sb->s_op->write_inode(old_inode, NULL);
fscrypt_free_filename(&old_nm);
fscrypt_free_filename(&new_nm);
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
return 0;
out_cancel:
if (unlink) {
set_nlink(new_inode, saved_nlink);
} else {
new_dir->i_size -= new_sz;
ubifs_inode(new_dir)->ui_size = new_dir->i_size;
}
old_dir->i_size += old_sz;
ubifs_inode(old_dir)->ui_size = old_dir->i_size;
if (is_dir) {
if (move) {
inc_nlink(old_dir);
if (!unlink)
drop_nlink(new_dir);
} else {
if (unlink)
inc_nlink(old_dir);
}
}
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
if (whiteout) {
ubifs: Rename whiteout atomically Currently, rename whiteout has 3 steps: 1. create tmpfile(which associates old dentry to tmpfile inode) for whiteout, and store tmpfile to disk 2. link whiteout, associate whiteout inode to old dentry agagin and store old dentry, old inode, new dentry on disk 3. writeback dirty whiteout inode to disk Suddenly power-cut or error occurring(eg. ENOSPC returned by budget, memory allocation failure) during above steps may cause kinds of problems: Problem 1: ENOSPC returned by whiteout space budget (before step 2), old dentry will disappear after rename syscall, whiteout file cannot be found either. ls dir // we get file, whiteout rename(dir/file, dir/whiteout, REANME_WHITEOUT) ENOSPC = ubifs_budget_space(&wht_req) // return ls dir // empty (no file, no whiteout) Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1' is not stored on disk, whiteout dentry(old dentry) is written on disk, whiteout file is lost on next mount (We get "dead directory entry" after executing 'ls -l' on whiteout file). Now, we use following 3 steps to finish rename whiteout: 1. create an in-mem inode with 'nlink = 1' as whiteout 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to whiteout inode, associating new dentry with old inode) 3. iput(whiteout) Rely writing in-mem inode on disk by ubifs_jnl_rename() to finish rename whiteout, which avoids middle disk state caused by suddenly power-cut and error occurring. Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT") Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com> Signed-off-by: Richard Weinberger <richard@nod.at>
2021-12-27 03:22:36 +00:00
ubifs_release_budget(c, &wht_req);
iput(whiteout);
}
out_release:
ubifs_release_budget(c, &ino_req);
ubifs_release_budget(c, &req);
fscrypt_free_filename(&old_nm);
fscrypt_free_filename(&new_nm);
return err;
}
static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
struct ubifs_info *c = old_dir->i_sb->s_fs_info;
struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
.dirtied_ino = 2 };
int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
struct inode *fst_inode = d_inode(old_dentry);
struct inode *snd_inode = d_inode(new_dentry);
int err;
struct fscrypt_name fst_nm, snd_nm;
ubifs_assert(c, fst_inode && snd_inode);
/*
* Budget request settings: changing two direntries, changing the two
* parent directory inodes.
*/
dbg_gen("dent '%pd' ino %lu in dir ino %lu exchange dent '%pd' ino %lu in dir ino %lu",
old_dentry, fst_inode->i_ino, old_dir->i_ino,
new_dentry, snd_inode->i_ino, new_dir->i_ino);
err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm);
if (err)
return err;
err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &snd_nm);
if (err) {
fscrypt_free_filename(&fst_nm);
return err;
}
err = ubifs_budget_space(c, &req);
if (err)
goto out;
lock_4_inodes(old_dir, new_dir, NULL, NULL);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (old_dir != new_dir) {
if (S_ISDIR(fst_inode->i_mode) && !S_ISDIR(snd_inode->i_mode)) {
inc_nlink(new_dir);
drop_nlink(old_dir);
}
else if (!S_ISDIR(fst_inode->i_mode) && S_ISDIR(snd_inode->i_mode)) {
drop_nlink(new_dir);
inc_nlink(old_dir);
}
}
err = ubifs_jnl_xrename(c, old_dir, fst_inode, &fst_nm, new_dir,
snd_inode, &snd_nm, sync);
unlock_4_inodes(old_dir, new_dir, NULL, NULL);
ubifs_release_budget(c, &req);
out:
fscrypt_free_filename(&fst_nm);
fscrypt_free_filename(&snd_nm);
return err;
}
static int ubifs_rename(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
int err;
struct ubifs_info *c = old_dir->i_sb->s_fs_info;
if (flags & ~(RENAME_NOREPLACE | RENAME_WHITEOUT | RENAME_EXCHANGE))
return -EINVAL;
ubifs_assert(c, inode_is_locked(old_dir));
ubifs_assert(c, inode_is_locked(new_dir));
err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
flags);
if (err)
return err;
if (flags & RENAME_EXCHANGE)
return ubifs_xrename(old_dir, old_dentry, new_dir, new_dentry);
return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
loff_t size;
statx: Add a system call to make enhanced file info available Add a system call to make extended file information available, including file creation and some attribute flags where available through the underlying filesystem. The getattr inode operation is altered to take two additional arguments: a u32 request_mask and an unsigned int flags that indicate the synchronisation mode. This change is propagated to the vfs_getattr*() function. Functions like vfs_stat() are now inline wrappers around new functions vfs_statx() and vfs_statx_fd() to reduce stack usage. ======== OVERVIEW ======== The idea was initially proposed as a set of xattrs that could be retrieved with getxattr(), but the general preference proved to be for a new syscall with an extended stat structure. A number of requests were gathered for features to be included. The following have been included: (1) Make the fields a consistent size on all arches and make them large. (2) Spare space, request flags and information flags are provided for future expansion. (3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an __s64). (4) Creation time: The SMB protocol carries the creation time, which could be exported by Samba, which will in turn help CIFS make use of FS-Cache as that can be used for coherency data (stx_btime). This is also specified in NFSv4 as a recommended attribute and could be exported by NFSD [Steve French]. (5) Lightweight stat: Ask for just those details of interest, and allow a netfs (such as NFS) to approximate anything not of interest, possibly without going to the server [Trond Myklebust, Ulrich Drepper, Andreas Dilger] (AT_STATX_DONT_SYNC). (6) Heavyweight stat: Force a netfs to go to the server, even if it thinks its cached attributes are up to date [Trond Myklebust] (AT_STATX_FORCE_SYNC). And the following have been left out for future extension: (7) Data version number: Could be used by userspace NFS servers [Aneesh Kumar]. Can also be used to modify fill_post_wcc() in NFSD which retrieves i_version directly, but has just called vfs_getattr(). It could get it from the kstat struct if it used vfs_xgetattr() instead. (There's disagreement on the exact semantics of a single field, since not all filesystems do this the same way). (8) BSD stat compatibility: Including more fields from the BSD stat such as creation time (st_btime) and inode generation number (st_gen) [Jeremy Allison, Bernd Schubert]. (9) Inode generation number: Useful for FUSE and userspace NFS servers [Bernd Schubert]. (This was asked for but later deemed unnecessary with the open-by-handle capability available and caused disagreement as to whether it's a security hole or not). (10) Extra coherency data may be useful in making backups [Andreas Dilger]. (No particular data were offered, but things like last backup timestamp, the data version number and the DOS archive bit would come into this category). (11) Allow the filesystem to indicate what it can/cannot provide: A filesystem can now say it doesn't support a standard stat feature if that isn't available, so if, for instance, inode numbers or UIDs don't exist or are fabricated locally... (This requires a separate system call - I have an fsinfo() call idea for this). (12) Store a 16-byte volume ID in the superblock that can be returned in struct xstat [Steve French]. (Deferred to fsinfo). (13) Include granularity fields in the time data to indicate the granularity of each of the times (NFSv4 time_delta) [Steve French]. (Deferred to fsinfo). (14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags. Note that the Linux IOC flags are a mess and filesystems such as Ext4 define flags that aren't in linux/fs.h, so translation in the kernel may be a necessity (or, possibly, we provide the filesystem type too). (Some attributes are made available in stx_attributes, but the general feeling was that the IOC flags were to ext[234]-specific and shouldn't be exposed through statx this way). (15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer, Michael Kerrisk]. (Deferred, probably to fsinfo. Finding out if there's an ACL or seclabal might require extra filesystem operations). (16) Femtosecond-resolution timestamps [Dave Chinner]. (A __reserved field has been left in the statx_timestamp struct for this - if there proves to be a need). (17) A set multiple attributes syscall to go with this. =============== NEW SYSTEM CALL =============== The new system call is: int ret = statx(int dfd, const char *filename, unsigned int flags, unsigned int mask, struct statx *buffer); The dfd, filename and flags parameters indicate the file to query, in a similar way to fstatat(). There is no equivalent of lstat() as that can be emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is also no equivalent of fstat() as that can be emulated by passing a NULL filename to statx() with the fd of interest in dfd. Whether or not statx() synchronises the attributes with the backing store can be controlled by OR'ing a value into the flags argument (this typically only affects network filesystems): (1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this respect. (2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise its attributes with the server - which might require data writeback to occur to get the timestamps correct. (3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a network filesystem. The resulting values should be considered approximate. mask is a bitmask indicating the fields in struct statx that are of interest to the caller. The user should set this to STATX_BASIC_STATS to get the basic set returned by stat(). It should be noted that asking for more information may entail extra I/O operations. buffer points to the destination for the data. This must be 256 bytes in size. ====================== MAIN ATTRIBUTES RECORD ====================== The following structures are defined in which to return the main attribute set: struct statx_timestamp { __s64 tv_sec; __s32 tv_nsec; __s32 __reserved; }; struct statx { __u32 stx_mask; __u32 stx_blksize; __u64 stx_attributes; __u32 stx_nlink; __u32 stx_uid; __u32 stx_gid; __u16 stx_mode; __u16 __spare0[1]; __u64 stx_ino; __u64 stx_size; __u64 stx_blocks; __u64 __spare1[1]; struct statx_timestamp stx_atime; struct statx_timestamp stx_btime; struct statx_timestamp stx_ctime; struct statx_timestamp stx_mtime; __u32 stx_rdev_major; __u32 stx_rdev_minor; __u32 stx_dev_major; __u32 stx_dev_minor; __u64 __spare2[14]; }; The defined bits in request_mask and stx_mask are: STATX_TYPE Want/got stx_mode & S_IFMT STATX_MODE Want/got stx_mode & ~S_IFMT STATX_NLINK Want/got stx_nlink STATX_UID Want/got stx_uid STATX_GID Want/got stx_gid STATX_ATIME Want/got stx_atime{,_ns} STATX_MTIME Want/got stx_mtime{,_ns} STATX_CTIME Want/got stx_ctime{,_ns} STATX_INO Want/got stx_ino STATX_SIZE Want/got stx_size STATX_BLOCKS Want/got stx_blocks STATX_BASIC_STATS [The stuff in the normal stat struct] STATX_BTIME Want/got stx_btime{,_ns} STATX_ALL [All currently available stuff] stx_btime is the file creation time, stx_mask is a bitmask indicating the data provided and __spares*[] are where as-yet undefined fields can be placed. Time fields are structures with separate seconds and nanoseconds fields plus a reserved field in case we want to add even finer resolution. Note that times will be negative if before 1970; in such a case, the nanosecond fields will also be negative if not zero. The bits defined in the stx_attributes field convey information about a file, how it is accessed, where it is and what it does. The following attributes map to FS_*_FL flags and are the same numerical value: STATX_ATTR_COMPRESSED File is compressed by the fs STATX_ATTR_IMMUTABLE File is marked immutable STATX_ATTR_APPEND File is append-only STATX_ATTR_NODUMP File is not to be dumped STATX_ATTR_ENCRYPTED File requires key to decrypt in fs Within the kernel, the supported flags are listed by: KSTAT_ATTR_FS_IOC_FLAGS [Are any other IOC flags of sufficient general interest to be exposed through this interface?] New flags include: STATX_ATTR_AUTOMOUNT Object is an automount trigger These are for the use of GUI tools that might want to mark files specially, depending on what they are. Fields in struct statx come in a number of classes: (0) stx_dev_*, stx_blksize. These are local system information and are always available. (1) stx_mode, stx_nlinks, stx_uid, stx_gid, stx_[amc]time, stx_ino, stx_size, stx_blocks. These will be returned whether the caller asks for them or not. The corresponding bits in stx_mask will be set to indicate whether they actually have valid values. If the caller didn't ask for them, then they may be approximated. For example, NFS won't waste any time updating them from the server, unless as a byproduct of updating something requested. If the values don't actually exist for the underlying object (such as UID or GID on a DOS file), then the bit won't be set in the stx_mask, even if the caller asked for the value. In such a case, the returned value will be a fabrication. Note that there are instances where the type might not be valid, for instance Windows reparse points. (2) stx_rdev_*. This will be set only if stx_mode indicates we're looking at a blockdev or a chardev, otherwise will be 0. (3) stx_btime. Similar to (1), except this will be set to 0 if it doesn't exist. ======= TESTING ======= The following test program can be used to test the statx system call: samples/statx/test-statx.c Just compile and run, passing it paths to the files you want to examine. The file is built automatically if CONFIG_SAMPLES is enabled. Here's some example output. Firstly, an NFS directory that crosses to another FSID. Note that the AUTOMOUNT attribute is set because transiting this directory will cause d_automount to be invoked by the VFS. [root@andromeda ~]# /tmp/test-statx -A /warthog/data statx(/warthog/data) = 0 results=7ff Size: 4096 Blocks: 8 IO Block: 1048576 directory Device: 00:26 Inode: 1703937 Links: 125 Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041 Access: 2016-11-24 09:02:12.219699527+0000 Modify: 2016-11-17 10:44:36.225653653+0000 Change: 2016-11-17 10:44:36.225653653+0000 Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------) Secondly, the result of automounting on that directory. [root@andromeda ~]# /tmp/test-statx /warthog/data statx(/warthog/data) = 0 results=7ff Size: 4096 Blocks: 8 IO Block: 1048576 directory Device: 00:27 Inode: 2 Links: 125 Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041 Access: 2016-11-24 09:02:12.219699527+0000 Modify: 2016-11-17 10:44:36.225653653+0000 Change: 2016-11-17 10:44:36.225653653+0000 Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
struct inode *inode = d_inode(path->dentry);
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
if (ui->flags & UBIFS_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
if (ui->flags & UBIFS_COMPR_FL)
stat->attributes |= STATX_ATTR_COMPRESSED;
if (ui->flags & UBIFS_CRYPT_FL)
stat->attributes |= STATX_ATTR_ENCRYPTED;
if (ui->flags & UBIFS_IMMUTABLE_FL)
stat->attributes |= STATX_ATTR_IMMUTABLE;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE);
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->blksize = UBIFS_BLOCK_SIZE;
stat->size = ui->ui_size;
/*
* Unfortunately, the 'stat()' system call was designed for block
* device based file systems, and it is not appropriate for UBIFS,
* because UBIFS does not have notion of "block". For example, it is
* difficult to tell how many block a directory takes - it actually
* takes less than 300 bytes, but we have to round it to block size,
* which introduces large mistake. This makes utilities like 'du' to
* report completely senseless numbers. This is the reason why UBIFS
* goes the same way as JFFS2 - it reports zero blocks for everything
* but regular files, which makes more sense than reporting completely
* wrong sizes.
*/
if (S_ISREG(inode->i_mode)) {
size = ui->xattr_size;
size += stat->size;
size = ALIGN(size, UBIFS_BLOCK_SIZE);
/*
* Note, user-space expects 512-byte blocks count irrespectively
* of what was reported in @stat->size.
*/
stat->blocks = size >> 9;
} else
stat->blocks = 0;
mutex_unlock(&ui->ui_mutex);
return 0;
}
const struct inode_operations ubifs_dir_inode_operations = {
.lookup = ubifs_lookup,
.create = ubifs_create,
.link = ubifs_link,
.symlink = ubifs_symlink,
.unlink = ubifs_unlink,
.mkdir = ubifs_mkdir,
.rmdir = ubifs_rmdir,
.mknod = ubifs_mknod,
.rename = ubifs_rename,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
.listxattr = ubifs_listxattr,
.update_time = ubifs_update_time,
.tmpfile = ubifs_tmpfile,
.fileattr_get = ubifs_fileattr_get,
.fileattr_set = ubifs_fileattr_set,
};
const struct file_operations ubifs_dir_operations = {
.llseek = generic_file_llseek,
.release = ubifs_dir_release,
.read = generic_read_dir,
.iterate_shared = ubifs_readdir,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
};