From b03b99a329a14b7302f37c3ea6da3848db41c8c5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 27 Mar 2017 21:53:38 -0700 Subject: [PATCH 1/6] acpi, nfit, libnvdimm: fix interleave set cookie calculation (64-bit comparison) While reviewing the -stable patch for commit 86ef58a4e35e "nfit, libnvdimm: fix interleave set cookie calculation" Ben noted: "This is returning an int, thus it's effectively doing a 32-bit comparison and not the 64-bit comparison you say is needed." Update the compare operation to be immune to this integer demotion problem. Cc: Cc: Nicholas Moulin Fixes: 86ef58a4e35e ("nfit, libnvdimm: fix interleave set cookie calculation") Reported-by: Ben Hutchings Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 662036bdc65e..c8ea9d698cd0 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1617,7 +1617,11 @@ static int cmp_map(const void *m0, const void *m1) const struct nfit_set_info_map *map0 = m0; const struct nfit_set_info_map *map1 = m1; - return map0->region_offset - map1->region_offset; + if (map0->region_offset < map1->region_offset) + return -1; + else if (map0->region_offset > map1->region_offset) + return 1; + return 0; } /* Retrieve the nth entry referencing this spa */ From fe514739d8538783749d3ce72f78e5a999ea5668 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 4 Apr 2017 15:08:36 -0700 Subject: [PATCH 2/6] libnvdimm: fix blk free space accounting Commit a1f3e4d6a0c3 "libnvdimm, region: update nd_region_available_dpa() for multi-pmem support" reworked blk dpa (DIMM Physical Address) accounting to comprehend multiple pmem namespace allocations aliasing with a given blk-dpa range. The following call trace is a result of failing to account for allocated blk capacity. WARNING: CPU: 1 PID: 2433 at tools/testing/nvdimm/../../../drivers/nvdimm/names 4 size_store+0x6f3/0x930 [libnvdimm] nd_region region5: allocation underrun: 0x0 of 0x1000000 bytes [..] Call Trace: dump_stack+0x86/0xc3 __warn+0xcb/0xf0 warn_slowpath_fmt+0x5f/0x80 size_store+0x6f3/0x930 [libnvdimm] dev_attr_store+0x18/0x30 If a given blk-dpa allocation does not alias with any pmem ranges then the full allocation should be accounted as busy space, not the size of the current pmem contribution to the region. The thinkos that led to this confusion was not realizing that the struct resource management is already guaranteeing no collisions between pmem allocations and blk allocations on the same dimm. Also, we do not try to support blk allocations in aliased pmem holes. This patch also fixes a case where the available blk goes negative. Cc: Fixes: a1f3e4d6a0c3 ("libnvdimm, region: update nd_region_available_dpa() for multi-pmem support"). Reported-by: Dariusz Dokupil Reported-by: Dave Jiang Reported-by: Vishal Verma Tested-by: Dave Jiang Tested-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/dimm_devs.c | 77 ++++++-------------------------------- 1 file changed, 11 insertions(+), 66 deletions(-) diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 0eedc49e0d47..8b721321be5b 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -395,7 +395,7 @@ EXPORT_SYMBOL_GPL(nvdimm_create); int alias_dpa_busy(struct device *dev, void *data) { - resource_size_t map_end, blk_start, new, busy; + resource_size_t map_end, blk_start, new; struct blk_alloc_info *info = data; struct nd_mapping *nd_mapping; struct nd_region *nd_region; @@ -436,29 +436,19 @@ int alias_dpa_busy(struct device *dev, void *data) retry: /* * Find the free dpa from the end of the last pmem allocation to - * the end of the interleave-set mapping that is not already - * covered by a blk allocation. + * the end of the interleave-set mapping. */ - busy = 0; for_each_dpa_resource(ndd, res) { + if (strncmp(res->name, "pmem", 4) != 0) + continue; if ((res->start >= blk_start && res->start < map_end) || (res->end >= blk_start && res->end <= map_end)) { - if (strncmp(res->name, "pmem", 4) == 0) { - new = max(blk_start, min(map_end + 1, - res->end + 1)); - if (new != blk_start) { - blk_start = new; - goto retry; - } - } else - busy += min(map_end, res->end) - - max(nd_mapping->start, res->start) + 1; - } else if (nd_mapping->start > res->start - && map_end < res->end) { - /* total eclipse of the PMEM region mapping */ - busy += nd_mapping->size; - break; + new = max(blk_start, min(map_end + 1, res->end + 1)); + if (new != blk_start) { + blk_start = new; + goto retry; + } } } @@ -470,52 +460,11 @@ int alias_dpa_busy(struct device *dev, void *data) return 1; } - info->available -= blk_start - nd_mapping->start + busy; + info->available -= blk_start - nd_mapping->start; return 0; } -static int blk_dpa_busy(struct device *dev, void *data) -{ - struct blk_alloc_info *info = data; - struct nd_mapping *nd_mapping; - struct nd_region *nd_region; - resource_size_t map_end; - int i; - - if (!is_nd_pmem(dev)) - return 0; - - nd_region = to_nd_region(dev); - for (i = 0; i < nd_region->ndr_mappings; i++) { - nd_mapping = &nd_region->mapping[i]; - if (nd_mapping->nvdimm == info->nd_mapping->nvdimm) - break; - } - - if (i >= nd_region->ndr_mappings) - return 0; - - map_end = nd_mapping->start + nd_mapping->size - 1; - if (info->res->start >= nd_mapping->start - && info->res->start < map_end) { - if (info->res->end <= map_end) { - info->busy = 0; - return 1; - } else { - info->busy -= info->res->end - map_end; - return 0; - } - } else if (info->res->end >= nd_mapping->start - && info->res->end <= map_end) { - info->busy -= nd_mapping->start - info->res->start; - return 0; - } else { - info->busy -= nd_mapping->size; - return 0; - } -} - /** * nd_blk_available_dpa - account the unused dpa of BLK region * @nd_mapping: container of dpa-resource-root + labels @@ -545,11 +494,7 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region) for_each_dpa_resource(ndd, res) { if (strncmp(res->name, "blk", 3) != 0) continue; - - info.res = res; - info.busy = resource_size(res); - device_for_each_child(&nvdimm_bus->dev, &info, blk_dpa_busy); - info.available -= info.busy; + info.available -= resource_size(res); } return info.available; From 0beb2012a1722633515c8aaa263c73449636c893 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 Apr 2017 09:47:24 -0700 Subject: [PATCH 3/6] libnvdimm: fix reconfig_mutex, mmap_sem, and jbd2_handle lockdep splat Holding the reconfig_mutex over a potential userspace fault sets up a lockdep dependency chain between filesystem-DAX and the libnvdimm ioctl path. Move the user access outside of the lock. [ INFO: possible circular locking dependency detected ] 4.11.0-rc3+ #13 Tainted: G W O ------------------------------------------------------- fallocate/16656 is trying to acquire lock: (&nvdimm_bus->reconfig_mutex){+.+.+.}, at: [] nvdimm_bus_lock+0x21/0x30 [libnvdimm] but task is already holding lock: (jbd2_handle){++++..}, at: [] start_this_handle+0x104/0x460 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (jbd2_handle){++++..}: lock_acquire+0xbd/0x200 start_this_handle+0x16a/0x460 jbd2__journal_start+0xe9/0x2d0 __ext4_journal_start_sb+0x89/0x1c0 ext4_dirty_inode+0x32/0x70 __mark_inode_dirty+0x235/0x670 generic_update_time+0x87/0xd0 touch_atime+0xa9/0xd0 ext4_file_mmap+0x90/0xb0 mmap_region+0x370/0x5b0 do_mmap+0x415/0x4f0 vm_mmap_pgoff+0xd7/0x120 SyS_mmap_pgoff+0x1c5/0x290 SyS_mmap+0x22/0x30 entry_SYSCALL_64_fastpath+0x1f/0xc2 -> #1 (&mm->mmap_sem){++++++}: lock_acquire+0xbd/0x200 __might_fault+0x70/0xa0 __nd_ioctl+0x683/0x720 [libnvdimm] nvdimm_ioctl+0x8b/0xe0 [libnvdimm] do_vfs_ioctl+0xa8/0x740 SyS_ioctl+0x79/0x90 do_syscall_64+0x6c/0x200 return_from_SYSCALL_64+0x0/0x7a -> #0 (&nvdimm_bus->reconfig_mutex){+.+.+.}: __lock_acquire+0x16b6/0x1730 lock_acquire+0xbd/0x200 __mutex_lock+0x88/0x9b0 mutex_lock_nested+0x1b/0x20 nvdimm_bus_lock+0x21/0x30 [libnvdimm] nvdimm_forget_poison+0x25/0x50 [libnvdimm] nvdimm_clear_poison+0x106/0x140 [libnvdimm] pmem_do_bvec+0x1c2/0x2b0 [nd_pmem] pmem_make_request+0xf9/0x270 [nd_pmem] generic_make_request+0x118/0x3b0 submit_bio+0x75/0x150 Cc: Fixes: 62232e45f4a2 ("libnvdimm: control (ioctl) messages for nvdimm_bus and nvdimm devices") Cc: Dave Jiang Reported-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 23d4a1728cdf..351bac8f6503 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -934,8 +934,14 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL); if (rc < 0) goto out_unlock; + nvdimm_bus_unlock(&nvdimm_bus->dev); + if (copy_to_user(p, buf, buf_len)) rc = -EFAULT; + + vfree(buf); + return rc; + out_unlock: nvdimm_bus_unlock(&nvdimm_bus->dev); out: From 4aa5615e080a9855e607accc75b07ab79b252dde Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 Apr 2017 12:25:52 -0700 Subject: [PATCH 4/6] libnvdimm: band aid btt vs clear poison locking The following warning results from holding a lane spinlock, preempt_disable(), or the btt map spinlock and then trying to take the reconfig_mutex to walk the poison list and potentially add new entries. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747 in_atomic(): 1, irqs_disabled(): 0, pid: 17159, name: dd [..] Call Trace: dump_stack+0x85/0xc8 ___might_sleep+0x184/0x250 __might_sleep+0x4a/0x90 __mutex_lock+0x58/0x9b0 ? nvdimm_bus_lock+0x21/0x30 [libnvdimm] ? __nvdimm_bus_badblocks_clear+0x2f/0x60 [libnvdimm] ? acpi_nfit_forget_poison+0x79/0x80 [nfit] ? _raw_spin_unlock+0x27/0x40 mutex_lock_nested+0x1b/0x20 nvdimm_bus_lock+0x21/0x30 [libnvdimm] nvdimm_forget_poison+0x25/0x50 [libnvdimm] nvdimm_clear_poison+0x106/0x140 [libnvdimm] nsio_rw_bytes+0x164/0x270 [libnvdimm] btt_write_pg+0x1de/0x3e0 [nd_btt] ? blk_queue_enter+0x30/0x290 btt_make_request+0x11a/0x310 [nd_btt] ? blk_queue_enter+0xb7/0x290 ? blk_queue_enter+0x30/0x290 generic_make_request+0x118/0x3b0 As a minimal fix, disable error clearing when the BTT is enabled for the namespace. For the final fix a larger rework of the poison list locking is needed. Note that this is not a problem in the blk case since that path never calls nvdimm_clear_poison(). Cc: Fixes: 82bf1037f2ca ("libnvdimm: check and clear poison before writing to pmem") Cc: Dave Jiang [jeff: dynamically disable error clearing in the btt case] Suggested-by: Jeff Moyer Reviewed-by: Jeff Moyer Reported-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/claim.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index b3323c0697f6..ca6d572c48fc 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -243,7 +243,15 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, } if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) { - if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) { + /* + * FIXME: nsio_rw_bytes() may be called from atomic + * context in the btt case and nvdimm_clear_poison() + * takes a sleeping lock. Until the locking can be + * reworked this capability requires that the namespace + * is not claimed by btt. + */ + if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) + && (!ndns->claim || !is_nd_btt(ndns->claim))) { long cleared; cleared = nvdimm_clear_poison(&ndns->dev, offset, size); From 956a4cd2c957acf638ff29951aabaa9d8e92bbc2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 Apr 2017 16:42:08 -0700 Subject: [PATCH 5/6] device-dax: switch to srcu, fix rcu_read_lock() vs pte allocation The following warning triggers with a new unit test that stresses the device-dax interface. =============================== [ ERR: suspicious RCU usage. ] 4.11.0-rc4+ #1049 Tainted: G O ------------------------------- ./include/linux/rcupdate.h:521 Illegal context switch in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 0 2 locks held by fio/9070: #0: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x167/0x4f0 #1: (rcu_read_lock){......}, at: [] dax_dev_huge_fault+0x32/0x620 [dax] Call Trace: dump_stack+0x86/0xc3 lockdep_rcu_suspicious+0xd7/0x110 ___might_sleep+0xac/0x250 __might_sleep+0x4a/0x80 __alloc_pages_nodemask+0x23a/0x360 alloc_pages_current+0xa1/0x1f0 pte_alloc_one+0x17/0x80 __pte_alloc+0x1e/0x120 __get_locked_pte+0x1bf/0x1d0 insert_pfn.isra.70+0x3a/0x100 ? lookup_memtype+0xa6/0xd0 vm_insert_mixed+0x64/0x90 dax_dev_huge_fault+0x520/0x620 [dax] ? dax_dev_huge_fault+0x32/0x620 [dax] dax_dev_fault+0x10/0x20 [dax] __do_fault+0x1e/0x140 __handle_mm_fault+0x9af/0x10d0 handle_mm_fault+0x16d/0x370 ? handle_mm_fault+0x47/0x370 __do_page_fault+0x28c/0x4f0 trace_do_page_fault+0x58/0x2a0 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Inserting a page table entry may trigger an allocation while we are holding a read lock to keep the device instance alive for the duration of the fault. Use srcu for this keep-alive protection. Fixes: dee410792419 ("/dev/dax, core: file operations and dax-mmap") Cc: Signed-off-by: Dan Williams --- drivers/dax/Kconfig | 1 + drivers/dax/dax.c | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 3e2ab3b14eea..9e95bf94eb13 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -2,6 +2,7 @@ menuconfig DEV_DAX tristate "DAX: direct access to differentiated memory" default m if NVDIMM_DAX depends on TRANSPARENT_HUGEPAGE + select SRCU help Support raw access to differentiated (persistence, bandwidth, latency...) memory via an mmap(2) capable character diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 80c6db279ae1..806f180c80d8 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -25,6 +25,7 @@ #include "dax.h" static dev_t dax_devt; +DEFINE_STATIC_SRCU(dax_srcu); static struct class *dax_class; static DEFINE_IDA(dax_minor_ida); static int nr_dax = CONFIG_NR_DEV_DAX; @@ -60,7 +61,7 @@ struct dax_region { * @region - parent region * @dev - device backing the character device * @cdev - core chardev data - * @alive - !alive + rcu grace period == no new mappings can be established + * @alive - !alive + srcu grace period == no new mappings can be established * @id - child id in the region * @num_resources - number of physical address extents in this device * @res - array of physical address ranges @@ -569,7 +570,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) static int dax_dev_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { - int rc; + int rc, id; struct file *filp = vmf->vma->vm_file; struct dax_dev *dax_dev = filp->private_data; @@ -578,7 +579,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, ? "write" : "read", vmf->vma->vm_start, vmf->vma->vm_end); - rcu_read_lock(); + id = srcu_read_lock(&dax_srcu); switch (pe_size) { case PE_SIZE_PTE: rc = __dax_dev_pte_fault(dax_dev, vmf); @@ -592,7 +593,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, default: return VM_FAULT_FALLBACK; } - rcu_read_unlock(); + srcu_read_unlock(&dax_srcu, id); return rc; } @@ -713,11 +714,11 @@ static void unregister_dax_dev(void *dev) * Note, rcu is not protecting the liveness of dax_dev, rcu is * ensuring that any fault handlers that might have seen * dax_dev->alive == true, have completed. Any fault handlers - * that start after synchronize_rcu() has started will abort + * that start after synchronize_srcu() has started will abort * upon seeing dax_dev->alive == false. */ dax_dev->alive = false; - synchronize_rcu(); + synchronize_srcu(&dax_srcu); unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); cdev_del(cdev); device_unregister(dev); From 11e63f6d920d6f2dfd3cd421e939a4aec9a58dcd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 6 Apr 2017 09:04:31 -0700 Subject: [PATCH 6/6] x86, pmem: fix broken __copy_user_nocache cache-bypass assumptions Before we rework the "pmem api" to stop abusing __copy_user_nocache() for memcpy_to_pmem() we need to fix cases where we may strand dirty data in the cpu cache. The problem occurs when copy_from_iter_pmem() is used for arbitrary data transfers from userspace. There is no guarantee that these transfers, performed by dax_iomap_actor(), will have aligned destinations or aligned transfer lengths. Backstop the usage __copy_user_nocache() with explicit cache management in these unaligned cases. Yes, copy_from_iter_pmem() is now too big for an inline, but addressing that is saved for a later patch that moves the entirety of the "pmem api" into the pmem driver directly. Fixes: 5de490daec8b ("pmem: add copy_from_iter_pmem() and clear_pmem()") Cc: Cc: Cc: Jan Kara Cc: Jeff Moyer Cc: Ingo Molnar Cc: Christoph Hellwig Cc: "H. Peter Anvin" Cc: Al Viro Cc: Thomas Gleixner Cc: Matthew Wilcox Reviewed-by: Ross Zwisler Signed-off-by: Toshi Kani Signed-off-by: Dan Williams --- arch/x86/include/asm/pmem.h | 42 +++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 2c1ebeb4d737..529bb4a6487a 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -55,7 +55,8 @@ static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) - * instruction. + * instruction. Note that @size is internally rounded up to be cache + * line size aligned. */ static inline void arch_wb_cache_pmem(void *addr, size_t size) { @@ -69,15 +70,6 @@ static inline void arch_wb_cache_pmem(void *addr, size_t size) clwb(p); } -/* - * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec - * iterators, so for other types (bvec & kvec) we must do a cache write-back. - */ -static inline bool __iter_needs_pmem_wb(struct iov_iter *i) -{ - return iter_is_iovec(i) == false; -} - /** * arch_copy_from_iter_pmem - copy data from an iterator to PMEM * @addr: PMEM destination address @@ -94,7 +86,35 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, /* TODO: skip the write-back by always using non-temporal stores */ len = copy_from_iter_nocache(addr, bytes, i); - if (__iter_needs_pmem_wb(i)) + /* + * In the iovec case on x86_64 copy_from_iter_nocache() uses + * non-temporal stores for the bulk of the transfer, but we need + * to manually flush if the transfer is unaligned. A cached + * memory copy is used when destination or size is not naturally + * aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. + * + * In the non-iovec case the entire destination needs to be + * flushed. + */ + if (iter_is_iovec(i)) { + unsigned long flushed, dest = (unsigned long) addr; + + if (bytes < 8) { + if (!IS_ALIGNED(dest, 4) || (bytes != 4)) + arch_wb_cache_pmem(addr, 1); + } else { + if (!IS_ALIGNED(dest, 8)) { + dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); + arch_wb_cache_pmem(addr, 1); + } + + flushed = dest - (unsigned long) addr; + if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) + arch_wb_cache_pmem(addr + bytes - 1, 1); + } + } else arch_wb_cache_pmem(addr, bytes); return len;