2016-03-08 19:14:20 +00:00
|
|
|
/*
|
2017-08-28 18:23:27 +00:00
|
|
|
* Copyright(c) 2016 - 2017 Intel Corporation.
|
2016-03-08 19:14:20 +00:00
|
|
|
*
|
|
|
|
* This file is provided under a dual BSD/GPLv2 license. When using or
|
|
|
|
* redistributing this file, you may do so under either license.
|
|
|
|
*
|
|
|
|
* GPL LICENSE SUMMARY
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of version 2 of the GNU General Public License as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful, but
|
|
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* General Public License for more details.
|
|
|
|
*
|
|
|
|
* BSD LICENSE
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
*
|
|
|
|
* - Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* - Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in
|
|
|
|
* the documentation and/or other materials provided with the
|
|
|
|
* distribution.
|
|
|
|
* - Neither the name of Intel Corporation nor the names of its
|
|
|
|
* contributors may be used to endorse or promote products derived
|
|
|
|
* from this software without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
#include <linux/list.h>
|
2016-05-12 17:23:09 +00:00
|
|
|
#include <linux/rculist.h>
|
2016-03-08 19:14:20 +00:00
|
|
|
#include <linux/mmu_notifier.h>
|
2016-03-08 19:14:53 +00:00
|
|
|
#include <linux/interval_tree_generic.h>
|
2016-03-08 19:14:20 +00:00
|
|
|
|
|
|
|
#include "mmu_rb.h"
|
|
|
|
#include "trace.h"
|
|
|
|
|
|
|
|
struct mmu_rb_handler {
|
|
|
|
struct mmu_notifier mn;
|
2017-09-08 23:15:08 +00:00
|
|
|
struct rb_root_cached root;
|
2016-07-28 19:21:20 +00:00
|
|
|
void *ops_arg;
|
2016-03-08 19:14:20 +00:00
|
|
|
spinlock_t lock; /* protect the RB tree */
|
|
|
|
struct mmu_rb_ops *ops;
|
2016-07-28 19:21:19 +00:00
|
|
|
struct mm_struct *mm;
|
2016-07-28 19:21:27 +00:00
|
|
|
struct list_head lru_list;
|
2016-07-28 19:21:24 +00:00
|
|
|
struct work_struct del_work;
|
|
|
|
struct list_head del_list;
|
|
|
|
struct workqueue_struct *wq;
|
2016-03-08 19:14:20 +00:00
|
|
|
};
|
|
|
|
|
2016-03-08 19:14:53 +00:00
|
|
|
static unsigned long mmu_node_start(struct mmu_rb_node *);
|
|
|
|
static unsigned long mmu_node_last(struct mmu_rb_node *);
|
2018-08-22 04:52:33 +00:00
|
|
|
static int mmu_notifier_range_start(struct mmu_notifier *,
|
2017-11-06 14:38:30 +00:00
|
|
|
struct mm_struct *,
|
2018-08-22 04:52:33 +00:00
|
|
|
unsigned long, unsigned long, bool);
|
2016-03-08 19:14:20 +00:00
|
|
|
static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
|
|
|
|
unsigned long, unsigned long);
|
2016-07-28 19:21:24 +00:00
|
|
|
static void do_remove(struct mmu_rb_handler *handler,
|
|
|
|
struct list_head *del_list);
|
|
|
|
static void handle_remove(struct work_struct *work);
|
2016-03-08 19:14:20 +00:00
|
|
|
|
2016-11-19 09:47:48 +00:00
|
|
|
static const struct mmu_notifier_ops mn_opts = {
|
mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks
Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu
notifiers") prevented the oom reaper from unmapping private anonymous
memory with the oom reaper when the oom victim mm had mmu notifiers
registered.
The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
around the unmap_page_range(), which is needed, can block and the oom
killer will stall forever waiting for the victim to exit, which may not
be possible without reaping.
That concern is real, but only true for mmu notifiers that have
blockable invalidate_range_{start,end}() callbacks. This patch adds a
"flags" field to mmu notifier ops that can set a bit to indicate that
these callbacks do not block.
The implementation is steered toward an expensive slowpath, such as
after the oom reaper has grabbed mm->mmap_sem of a still alive oom
victim.
[rientjes@google.com: mmu_notifier_invalidate_range_end() can also call the invalidate_range() must not block, fix comment]
Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1801091339570.240101@chino.kir.corp.google.com
[akpm@linux-foundation.org: make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()]
Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1712141329500.74052@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Dimitri Sivanich <sivanich@hpe.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Oded Gabbay <oded.gabbay@gmail.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-01 00:18:32 +00:00
|
|
|
.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
|
2016-03-08 19:14:20 +00:00
|
|
|
.invalidate_range_start = mmu_notifier_range_start,
|
|
|
|
};
|
|
|
|
|
2016-03-08 19:14:53 +00:00
|
|
|
INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last,
|
|
|
|
mmu_node_start, mmu_node_last, static, __mmu_int_rb);
|
|
|
|
|
|
|
|
static unsigned long mmu_node_start(struct mmu_rb_node *node)
|
|
|
|
{
|
|
|
|
return node->addr & PAGE_MASK;
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned long mmu_node_last(struct mmu_rb_node *node)
|
|
|
|
{
|
2016-04-12 17:46:41 +00:00
|
|
|
return PAGE_ALIGN(node->addr + node->len) - 1;
|
2016-03-08 19:14:53 +00:00
|
|
|
}
|
|
|
|
|
2016-07-28 19:21:20 +00:00
|
|
|
int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
|
|
|
|
struct mmu_rb_ops *ops,
|
2016-07-28 19:21:24 +00:00
|
|
|
struct workqueue_struct *wq,
|
2016-07-28 19:21:20 +00:00
|
|
|
struct mmu_rb_handler **handler)
|
2016-03-08 19:14:20 +00:00
|
|
|
{
|
|
|
|
struct mmu_rb_handler *handlr;
|
2016-07-28 19:21:19 +00:00
|
|
|
int ret;
|
2016-03-08 19:14:20 +00:00
|
|
|
|
|
|
|
handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
|
|
|
|
if (!handlr)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2017-09-08 23:15:08 +00:00
|
|
|
handlr->root = RB_ROOT_CACHED;
|
2016-03-08 19:14:20 +00:00
|
|
|
handlr->ops = ops;
|
2016-07-28 19:21:20 +00:00
|
|
|
handlr->ops_arg = ops_arg;
|
2016-03-08 19:14:20 +00:00
|
|
|
INIT_HLIST_NODE(&handlr->mn.hlist);
|
|
|
|
spin_lock_init(&handlr->lock);
|
|
|
|
handlr->mn.ops = &mn_opts;
|
2016-07-28 19:21:19 +00:00
|
|
|
handlr->mm = mm;
|
2016-07-28 19:21:24 +00:00
|
|
|
INIT_WORK(&handlr->del_work, handle_remove);
|
|
|
|
INIT_LIST_HEAD(&handlr->del_list);
|
2016-07-28 19:21:27 +00:00
|
|
|
INIT_LIST_HEAD(&handlr->lru_list);
|
2016-07-28 19:21:24 +00:00
|
|
|
handlr->wq = wq;
|
2016-07-28 19:21:19 +00:00
|
|
|
|
|
|
|
ret = mmu_notifier_register(&handlr->mn, handlr->mm);
|
|
|
|
if (ret) {
|
|
|
|
kfree(handlr);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-07-28 19:21:20 +00:00
|
|
|
*handler = handlr;
|
|
|
|
return 0;
|
2016-03-08 19:14:20 +00:00
|
|
|
}
|
|
|
|
|
2016-07-28 19:21:20 +00:00
|
|
|
void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
|
2016-03-08 19:14:20 +00:00
|
|
|
{
|
2016-07-28 16:27:36 +00:00
|
|
|
struct mmu_rb_node *rbnode;
|
|
|
|
struct rb_node *node;
|
2016-03-08 19:14:25 +00:00
|
|
|
unsigned long flags;
|
2016-07-28 19:21:24 +00:00
|
|
|
struct list_head del_list;
|
2016-03-08 19:14:20 +00:00
|
|
|
|
2016-04-12 17:46:35 +00:00
|
|
|
/* Unregister first so we don't get any more notifications. */
|
2016-07-28 19:21:19 +00:00
|
|
|
mmu_notifier_unregister(&handler->mn, handler->mm);
|
2016-04-12 17:46:35 +00:00
|
|
|
|
2016-07-28 19:21:24 +00:00
|
|
|
/*
|
|
|
|
* Make sure the wq delete handler is finished running. It will not
|
|
|
|
* be triggered once the mmu notifiers are unregistered above.
|
|
|
|
*/
|
|
|
|
flush_work(&handler->del_work);
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&del_list);
|
|
|
|
|
2016-04-12 17:46:35 +00:00
|
|
|
spin_lock_irqsave(&handler->lock, flags);
|
2017-09-08 23:15:08 +00:00
|
|
|
while ((node = rb_first_cached(&handler->root))) {
|
2016-07-28 16:27:36 +00:00
|
|
|
rbnode = rb_entry(node, struct mmu_rb_node, node);
|
2017-09-08 23:15:08 +00:00
|
|
|
rb_erase_cached(node, &handler->root);
|
2016-07-28 19:21:27 +00:00
|
|
|
/* move from LRU list to delete list */
|
|
|
|
list_move(&rbnode->list, &del_list);
|
2016-03-08 19:14:20 +00:00
|
|
|
}
|
2016-04-12 17:46:35 +00:00
|
|
|
spin_unlock_irqrestore(&handler->lock, flags);
|
2016-03-08 19:14:20 +00:00
|
|
|
|
2016-07-28 19:21:24 +00:00
|
|
|
do_remove(handler, &del_list);
|
|
|
|
|
2016-03-08 19:14:20 +00:00
|
|
|
kfree(handler);
|
|
|
|
}
|
|
|
|
|
2016-07-28 19:21:20 +00:00
|
|
|
int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
|
|
|
|
struct mmu_rb_node *mnode)
|
2016-03-08 19:14:20 +00:00
|
|
|
{
|
2016-03-08 19:14:53 +00:00
|
|
|
struct mmu_rb_node *node;
|
2016-03-08 19:14:25 +00:00
|
|
|
unsigned long flags;
|
2016-03-08 19:14:53 +00:00
|
|
|
int ret = 0;
|
2016-03-08 19:14:20 +00:00
|
|
|
|
2017-08-28 18:23:27 +00:00
|
|
|
trace_hfi1_mmu_rb_insert(mnode->addr, mnode->len);
|
2016-03-08 19:14:25 +00:00
|
|
|
spin_lock_irqsave(&handler->lock, flags);
|
2016-03-08 19:14:53 +00:00
|
|
|
node = __mmu_rb_search(handler, mnode->addr, mnode->len);
|
|
|
|
if (node) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto unlock;
|
2016-03-08 19:14:20 +00:00
|
|
|
}
|
2016-07-28 19:21:20 +00:00
|
|
|
__mmu_int_rb_insert(mnode, &handler->root);
|
2016-07-28 19:21:27 +00:00
|
|
|
list_add(&mnode->list, &handler->lru_list);
|
2016-03-08 19:14:20 +00:00
|
|
|
|
2016-07-28 19:21:20 +00:00
|
|
|
ret = handler->ops->insert(handler->ops_arg, mnode);
|
2016-07-28 19:21:27 +00:00
|
|
|
if (ret) {
|
2016-07-28 19:21:20 +00:00
|
|
|
__mmu_int_rb_remove(mnode, &handler->root);
|
2016-07-28 19:21:27 +00:00
|
|
|
list_del(&mnode->list); /* remove from LRU list */
|
|
|
|
}
|
2016-03-08 19:14:20 +00:00
|
|
|
unlock:
|
2016-03-08 19:14:25 +00:00
|
|
|
spin_unlock_irqrestore(&handler->lock, flags);
|
2016-03-08 19:14:20 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-04-12 17:46:03 +00:00
|
|
|
/* Caller must hold handler lock */
|
2016-03-08 19:14:20 +00:00
|
|
|
static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
|
|
|
|
unsigned long addr,
|
|
|
|
unsigned long len)
|
|
|
|
{
|
2016-03-08 19:15:10 +00:00
|
|
|
struct mmu_rb_node *node = NULL;
|
2016-03-08 19:14:53 +00:00
|
|
|
|
2017-08-28 18:23:27 +00:00
|
|
|
trace_hfi1_mmu_rb_search(addr, len);
|
2016-03-08 19:15:10 +00:00
|
|
|
if (!handler->ops->filter) {
|
2016-07-28 19:21:20 +00:00
|
|
|
node = __mmu_int_rb_iter_first(&handler->root, addr,
|
2016-03-08 19:15:10 +00:00
|
|
|
(addr + len) - 1);
|
|
|
|
} else {
|
2016-07-28 19:21:20 +00:00
|
|
|
for (node = __mmu_int_rb_iter_first(&handler->root, addr,
|
2016-03-08 19:15:10 +00:00
|
|
|
(addr + len) - 1);
|
|
|
|
node;
|
|
|
|
node = __mmu_int_rb_iter_next(node, addr,
|
|
|
|
(addr + len) - 1)) {
|
|
|
|
if (handler->ops->filter(node, addr, len))
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
}
|
2016-03-08 19:14:53 +00:00
|
|
|
return node;
|
2016-03-08 19:14:20 +00:00
|
|
|
}
|
|
|
|
|
2017-05-26 12:35:12 +00:00
|
|
|
bool hfi1_mmu_rb_remove_unless_exact(struct mmu_rb_handler *handler,
|
|
|
|
unsigned long addr, unsigned long len,
|
|
|
|
struct mmu_rb_node **rb_node)
|
IB/hfi1: Extract and reinsert MMU RB node on lookup
The page pinning function, which also maintains the pin cache,
behaves one of two ways when an exact buffer match is not found:
1. If no node is not found (a buffer with the same starting address
is not found in the cache), a new node is created, the buffer
pages are pinned, and the node is inserted into the RB tree, or
2. If a node is found but the buffer in that node is a subset of
the new user buffer, the node is extended with the new buffer
pages.
Both modes of operation require (re-)insertion into the interval RB
tree.
When the node being inserted is a new node, the operations are pretty
simple. However, when the node is already existing and is being
extended, special care must be taken.
First, we want to guard against an asynchronous attempt to
delete the node by the MMU invalidation notifier. The simplest way to
do this is to remove the node from the RB tree, preventing the search
algorithm from finding it.
Second, the node needs to be re-inserted so it lands in the proper place
in the tree and the tree is correctly re-balanced. This also requires
the node to be removed from the RB tree.
This commit adds the hfi1_mmu_rb_extract() function, which will search
for a node in the interval RB tree matching an address and length and
remove it from the RB tree if found. This allows for both of the above
special cases be handled in a single step.
Reviewed-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-04-12 17:46:47 +00:00
|
|
|
{
|
|
|
|
struct mmu_rb_node *node;
|
|
|
|
unsigned long flags;
|
2017-05-26 12:35:12 +00:00
|
|
|
bool ret = false;
|
IB/hfi1: Extract and reinsert MMU RB node on lookup
The page pinning function, which also maintains the pin cache,
behaves one of two ways when an exact buffer match is not found:
1. If no node is not found (a buffer with the same starting address
is not found in the cache), a new node is created, the buffer
pages are pinned, and the node is inserted into the RB tree, or
2. If a node is found but the buffer in that node is a subset of
the new user buffer, the node is extended with the new buffer
pages.
Both modes of operation require (re-)insertion into the interval RB
tree.
When the node being inserted is a new node, the operations are pretty
simple. However, when the node is already existing and is being
extended, special care must be taken.
First, we want to guard against an asynchronous attempt to
delete the node by the MMU invalidation notifier. The simplest way to
do this is to remove the node from the RB tree, preventing the search
algorithm from finding it.
Second, the node needs to be re-inserted so it lands in the proper place
in the tree and the tree is correctly re-balanced. This also requires
the node to be removed from the RB tree.
This commit adds the hfi1_mmu_rb_extract() function, which will search
for a node in the interval RB tree matching an address and length and
remove it from the RB tree if found. This allows for both of the above
special cases be handled in a single step.
Reviewed-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-04-12 17:46:47 +00:00
|
|
|
|
|
|
|
spin_lock_irqsave(&handler->lock, flags);
|
|
|
|
node = __mmu_rb_search(handler, addr, len);
|
2016-07-28 19:21:27 +00:00
|
|
|
if (node) {
|
2017-05-26 12:35:12 +00:00
|
|
|
if (node->addr == addr && node->len == len)
|
|
|
|
goto unlock;
|
2016-07-28 19:21:20 +00:00
|
|
|
__mmu_int_rb_remove(node, &handler->root);
|
2016-07-28 19:21:27 +00:00
|
|
|
list_del(&node->list); /* remove from LRU list */
|
2017-05-26 12:35:12 +00:00
|
|
|
ret = true;
|
2016-07-28 19:21:27 +00:00
|
|
|
}
|
2017-05-26 12:35:12 +00:00
|
|
|
unlock:
|
IB/hfi1: Extract and reinsert MMU RB node on lookup
The page pinning function, which also maintains the pin cache,
behaves one of two ways when an exact buffer match is not found:
1. If no node is not found (a buffer with the same starting address
is not found in the cache), a new node is created, the buffer
pages are pinned, and the node is inserted into the RB tree, or
2. If a node is found but the buffer in that node is a subset of
the new user buffer, the node is extended with the new buffer
pages.
Both modes of operation require (re-)insertion into the interval RB
tree.
When the node being inserted is a new node, the operations are pretty
simple. However, when the node is already existing and is being
extended, special care must be taken.
First, we want to guard against an asynchronous attempt to
delete the node by the MMU invalidation notifier. The simplest way to
do this is to remove the node from the RB tree, preventing the search
algorithm from finding it.
Second, the node needs to be re-inserted so it lands in the proper place
in the tree and the tree is correctly re-balanced. This also requires
the node to be removed from the RB tree.
This commit adds the hfi1_mmu_rb_extract() function, which will search
for a node in the interval RB tree matching an address and length and
remove it from the RB tree if found. This allows for both of the above
special cases be handled in a single step.
Reviewed-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-04-12 17:46:47 +00:00
|
|
|
spin_unlock_irqrestore(&handler->lock, flags);
|
2017-05-26 12:35:12 +00:00
|
|
|
*rb_node = node;
|
|
|
|
return ret;
|
IB/hfi1: Extract and reinsert MMU RB node on lookup
The page pinning function, which also maintains the pin cache,
behaves one of two ways when an exact buffer match is not found:
1. If no node is not found (a buffer with the same starting address
is not found in the cache), a new node is created, the buffer
pages are pinned, and the node is inserted into the RB tree, or
2. If a node is found but the buffer in that node is a subset of
the new user buffer, the node is extended with the new buffer
pages.
Both modes of operation require (re-)insertion into the interval RB
tree.
When the node being inserted is a new node, the operations are pretty
simple. However, when the node is already existing and is being
extended, special care must be taken.
First, we want to guard against an asynchronous attempt to
delete the node by the MMU invalidation notifier. The simplest way to
do this is to remove the node from the RB tree, preventing the search
algorithm from finding it.
Second, the node needs to be re-inserted so it lands in the proper place
in the tree and the tree is correctly re-balanced. This also requires
the node to be removed from the RB tree.
This commit adds the hfi1_mmu_rb_extract() function, which will search
for a node in the interval RB tree matching an address and length and
remove it from the RB tree if found. This allows for both of the above
special cases be handled in a single step.
Reviewed-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-04-12 17:46:47 +00:00
|
|
|
}
|
|
|
|
|
2016-07-28 19:21:22 +00:00
|
|
|
void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
|
|
|
|
{
|
2016-07-28 19:21:27 +00:00
|
|
|
struct mmu_rb_node *rbnode, *ptr;
|
2016-07-28 19:21:22 +00:00
|
|
|
struct list_head del_list;
|
|
|
|
unsigned long flags;
|
|
|
|
bool stop = false;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&del_list);
|
|
|
|
|
|
|
|
spin_lock_irqsave(&handler->lock, flags);
|
2016-07-28 19:21:27 +00:00
|
|
|
list_for_each_entry_safe_reverse(rbnode, ptr, &handler->lru_list,
|
|
|
|
list) {
|
2016-07-28 19:21:22 +00:00
|
|
|
if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg,
|
|
|
|
&stop)) {
|
|
|
|
__mmu_int_rb_remove(rbnode, &handler->root);
|
2016-07-28 19:21:27 +00:00
|
|
|
/* move from LRU list to delete list */
|
|
|
|
list_move(&rbnode->list, &del_list);
|
2016-07-28 19:21:22 +00:00
|
|
|
}
|
|
|
|
if (stop)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
spin_unlock_irqrestore(&handler->lock, flags);
|
|
|
|
|
|
|
|
while (!list_empty(&del_list)) {
|
|
|
|
rbnode = list_first_entry(&del_list, struct mmu_rb_node, list);
|
|
|
|
list_del(&rbnode->list);
|
2016-07-28 19:21:25 +00:00
|
|
|
handler->ops->remove(handler->ops_arg, rbnode);
|
2016-07-28 19:21:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-28 19:21:24 +00:00
|
|
|
/*
|
|
|
|
* It is up to the caller to ensure that this function does not race with the
|
|
|
|
* mmu invalidate notifier which may be calling the users remove callback on
|
|
|
|
* 'node'.
|
|
|
|
*/
|
2016-07-28 19:21:20 +00:00
|
|
|
void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
|
|
|
|
struct mmu_rb_node *node)
|
2016-03-08 19:14:20 +00:00
|
|
|
{
|
2016-07-28 16:27:31 +00:00
|
|
|
unsigned long flags;
|
2016-03-08 19:14:20 +00:00
|
|
|
|
2016-07-28 16:27:31 +00:00
|
|
|
/* Validity of handler and node pointers has been checked by caller. */
|
2017-08-28 18:23:27 +00:00
|
|
|
trace_hfi1_mmu_rb_remove(node->addr, node->len);
|
2016-07-28 16:27:31 +00:00
|
|
|
spin_lock_irqsave(&handler->lock, flags);
|
2016-07-28 19:21:20 +00:00
|
|
|
__mmu_int_rb_remove(node, &handler->root);
|
2016-07-28 19:21:27 +00:00
|
|
|
list_del(&node->list); /* remove from LRU list */
|
2016-07-28 16:27:31 +00:00
|
|
|
spin_unlock_irqrestore(&handler->lock, flags);
|
|
|
|
|
2016-07-28 19:21:25 +00:00
|
|
|
handler->ops->remove(handler->ops_arg, node);
|
2016-03-08 19:14:20 +00:00
|
|
|
}
|
|
|
|
|
2018-08-22 04:52:33 +00:00
|
|
|
static int mmu_notifier_range_start(struct mmu_notifier *mn,
|
2017-11-06 14:38:30 +00:00
|
|
|
struct mm_struct *mm,
|
|
|
|
unsigned long start,
|
2018-08-22 04:52:33 +00:00
|
|
|
unsigned long end,
|
|
|
|
bool blockable)
|
2016-03-08 19:14:20 +00:00
|
|
|
{
|
|
|
|
struct mmu_rb_handler *handler =
|
|
|
|
container_of(mn, struct mmu_rb_handler, mn);
|
2017-09-08 23:15:08 +00:00
|
|
|
struct rb_root_cached *root = &handler->root;
|
2016-04-12 17:45:57 +00:00
|
|
|
struct mmu_rb_node *node, *ptr = NULL;
|
2016-03-08 19:14:53 +00:00
|
|
|
unsigned long flags;
|
2016-07-28 19:21:24 +00:00
|
|
|
bool added = false;
|
2016-03-08 19:14:20 +00:00
|
|
|
|
2016-03-08 19:14:25 +00:00
|
|
|
spin_lock_irqsave(&handler->lock, flags);
|
2016-04-12 17:45:57 +00:00
|
|
|
for (node = __mmu_int_rb_iter_first(root, start, end - 1);
|
|
|
|
node; node = ptr) {
|
|
|
|
/* Guard against node removal. */
|
|
|
|
ptr = __mmu_int_rb_iter_next(node, start, end - 1);
|
2017-08-28 18:23:27 +00:00
|
|
|
trace_hfi1_mmu_mem_invalidate(node->addr, node->len);
|
2016-07-28 19:21:20 +00:00
|
|
|
if (handler->ops->invalidate(handler->ops_arg, node)) {
|
IB/hfi1: Fix buffer cache races which may cause corruption
There are two possible causes for node/memory corruption both
of which are related to the cache eviction algorithm. One way
to cause corruption is due to the asynchronous nature of the
MMU invalidation and the locking used when invalidating node.
The MMU invalidation routine would temporarily release the
RB tree lock to avoid a deadlock. However, this would allow
the eviction function to take the lock resulting in the removal
of cache nodes.
If the node being removed by the eviction code is the same as
the node being invalidated, the result is use after free.
The same is true in the other direction due to the temporary
release of the eviction list lock in the eviction loop.
Another corner case exists when dealing with the SDMA buffer
cache that could cause memory corruption of kernel memory.
The most common way, in which this corruption exhibits itself
is a linked list node corruption. In that case, the kernel will
complain that a node with poisoned pointers is being removed.
The fact that the pointers are already poisoned means that the
node has already been removed from the list.
To root cause of this corruption was a mishandling of the
eviction list maintained by the driver. In order for this
to happen four conditions need to be satisfied:
1. A node describing a user buffer already exists in the
interval RB tree,
2. The beginning of the current user buffer matches that
node but is bigger. This will cause the node to be
extended.
3. The amount of cached buffers is close or at the limit
of the buffer cache size.
4. The node has dropped close to the end of the eviction
list. This will cause the node to be considered for
eviction.
If all of the above conditions have been satisfied, it is
possible for the eviction algorithm to evict the current node,
which will free the node without the driver knowing.
To solve both issues described above:
- the locking around the MMU invalidation loop and cache
eviction loop has been improved so locks are not released in
the loop body,
- a new RB function is introduced which will "atomically" find
and remove the matching node from the RB tree, preventing the
MMU invalidation loop from touching it, and
- the node being extended by the pin_vector_pages() function is
removed from the eviction list prior to calling the eviction
function.
Reviewed-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
2016-04-12 17:46:53 +00:00
|
|
|
__mmu_int_rb_remove(node, root);
|
2016-07-28 19:21:27 +00:00
|
|
|
/* move from LRU list to delete list */
|
|
|
|
list_move(&node->list, &handler->del_list);
|
2016-07-28 19:21:24 +00:00
|
|
|
added = true;
|
2016-04-12 17:46:03 +00:00
|
|
|
}
|
2016-03-08 19:14:20 +00:00
|
|
|
}
|
2016-03-08 19:14:25 +00:00
|
|
|
spin_unlock_irqrestore(&handler->lock, flags);
|
2016-07-28 19:21:24 +00:00
|
|
|
|
|
|
|
if (added)
|
|
|
|
queue_work(handler->wq, &handler->del_work);
|
2018-08-22 04:52:33 +00:00
|
|
|
|
|
|
|
return 0;
|
2016-07-28 19:21:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Call the remove function for the given handler and the list. This
|
|
|
|
* is expected to be called with a delete list extracted from handler.
|
|
|
|
* The caller should not be holding the handler lock.
|
|
|
|
*/
|
|
|
|
static void do_remove(struct mmu_rb_handler *handler,
|
|
|
|
struct list_head *del_list)
|
|
|
|
{
|
|
|
|
struct mmu_rb_node *node;
|
|
|
|
|
|
|
|
while (!list_empty(del_list)) {
|
|
|
|
node = list_first_entry(del_list, struct mmu_rb_node, list);
|
|
|
|
list_del(&node->list);
|
2016-07-28 19:21:25 +00:00
|
|
|
handler->ops->remove(handler->ops_arg, node);
|
2016-07-28 19:21:24 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Work queue function to remove all nodes that have been queued up to
|
|
|
|
* be removed. The key feature is that mm->mmap_sem is not being held
|
|
|
|
* and the remove callback can sleep while taking it, if needed.
|
|
|
|
*/
|
|
|
|
static void handle_remove(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct mmu_rb_handler *handler = container_of(work,
|
|
|
|
struct mmu_rb_handler,
|
|
|
|
del_work);
|
|
|
|
struct list_head del_list;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
/* remove anything that is queued to get removed */
|
|
|
|
spin_lock_irqsave(&handler->lock, flags);
|
|
|
|
list_replace_init(&handler->del_list, &del_list);
|
|
|
|
spin_unlock_irqrestore(&handler->lock, flags);
|
|
|
|
|
|
|
|
do_remove(handler, &del_list);
|
2016-03-08 19:14:20 +00:00
|
|
|
}
|