4b6fad7097
At the moment the userspace tool is expected to request pinning of the entire guest RAM when the VFIO IOMMU SPAPR v2 driver is present. When the userspace process finishes, all the pinned pages need to be put; this is done as a part of the userspace memory context (MM) destruction which happens on the very last mmdrop(). This approach has a problem: the MM of the userspace process may live longer than the userspace process itself, because a kernel thread uses the MM of the userspace process which was running on the CPU where the kernel thread was scheduled. If this happens, the MM remains referenced until this exact kernel thread wakes up again and releases the very last reference to the MM; on an idle system this can take hours. This moves preregistered regions tracking from MM to VFIO; instead of using mm_iommu_table_group_mem_t::used, tce_container::prereg_list is added so each container releases regions which it has pre-registered. This changes the userspace interface to return EBUSY if a memory region is already registered in a container. However it should not have any practical effect as the only userspace tool available now registers a memory region once per container anyway. As tce_iommu_register_pages/tce_iommu_unregister_pages are called under container->lock, this does not need additional locking. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Acked-by: Alex Williamson <alex.williamson@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
189 lines
4.5 KiB
C
189 lines
4.5 KiB
C
/*
|
|
* MMU context allocation for 64-bit kernels.
|
|
*
|
|
* Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*
|
|
*/
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/string.h>
|
|
#include <linux/types.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/idr.h>
|
|
#include <linux/export.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/slab.h>
|
|
|
|
#include <asm/mmu_context.h>
|
|
#include <asm/pgalloc.h>
|
|
|
|
#include "icswx.h"
|
|
|
|
/*
 * mmu_context_lock is held around every ida_get_new_above()/ida_remove()
 * call on mmu_context_ida in this file.
 */
static DEFINE_SPINLOCK(mmu_context_lock);
/* Allocator from which MMU context ids are handed out and returned. */
static DEFINE_IDA(mmu_context_ida);
|
|
|
|
int __init_new_context(void)
|
|
{
|
|
int index;
|
|
int err;
|
|
|
|
again:
|
|
if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
|
|
return -ENOMEM;
|
|
|
|
spin_lock(&mmu_context_lock);
|
|
err = ida_get_new_above(&mmu_context_ida, 1, &index);
|
|
spin_unlock(&mmu_context_lock);
|
|
|
|
if (err == -EAGAIN)
|
|
goto again;
|
|
else if (err)
|
|
return err;
|
|
|
|
if (index > MAX_USER_CONTEXT) {
|
|
spin_lock(&mmu_context_lock);
|
|
ida_remove(&mmu_context_ida, index);
|
|
spin_unlock(&mmu_context_lock);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
return index;
|
|
}
|
|
EXPORT_SYMBOL_GPL(__init_new_context);
|
|
static int radix__init_new_context(struct mm_struct *mm, int index)
|
|
{
|
|
unsigned long rts_field;
|
|
|
|
/*
|
|
* set the process table entry,
|
|
*/
|
|
rts_field = radix__get_tree_size();
|
|
process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
|
|
return 0;
|
|
}
|
|
|
|
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
|
|
{
|
|
int index;
|
|
|
|
index = __init_new_context();
|
|
if (index < 0)
|
|
return index;
|
|
|
|
if (radix_enabled()) {
|
|
radix__init_new_context(mm, index);
|
|
} else {
|
|
|
|
/* The old code would re-promote on fork, we don't do that
|
|
* when using slices as it could cause problem promoting slices
|
|
* that have been forced down to 4K
|
|
*
|
|
* For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
|
|
* explicitly against context.id == 0. This ensures that we
|
|
* properly initialize context slice details for newly allocated
|
|
* mm's (which will have id == 0) and don't alter context slice
|
|
* inherited via fork (which will have id != 0).
|
|
*
|
|
* We should not be calling init_new_context() on init_mm. Hence a
|
|
* check against 0 is ok.
|
|
*/
|
|
if (mm->context.id == 0)
|
|
slice_set_user_psize(mm, mmu_virtual_psize);
|
|
subpage_prot_init_new_context(mm);
|
|
}
|
|
mm->context.id = index;
|
|
#ifdef CONFIG_PPC_ICSWX
|
|
mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
|
|
if (!mm->context.cop_lockp) {
|
|
__destroy_context(index);
|
|
subpage_prot_free(mm);
|
|
mm->context.id = MMU_NO_CONTEXT;
|
|
return -ENOMEM;
|
|
}
|
|
spin_lock_init(mm->context.cop_lockp);
|
|
#endif /* CONFIG_PPC_ICSWX */
|
|
|
|
#ifdef CONFIG_PPC_64K_PAGES
|
|
mm->context.pte_frag = NULL;
|
|
#endif
|
|
#ifdef CONFIG_SPAPR_TCE_IOMMU
|
|
mm_iommu_init(mm);
|
|
#endif
|
|
return 0;
|
|
}
|
|
|
|
void __destroy_context(int context_id)
|
|
{
|
|
spin_lock(&mmu_context_lock);
|
|
ida_remove(&mmu_context_ida, context_id);
|
|
spin_unlock(&mmu_context_lock);
|
|
}
|
|
EXPORT_SYMBOL_GPL(__destroy_context);
|
|
|
|
#ifdef CONFIG_PPC_64K_PAGES
/*
 * Drop the references still held on the page that mm->context.pte_frag
 * points into, and free that page if they were the last ones.
 *
 * Does nothing when the mm has no PTE fragment pointer set.
 */
static void destroy_pagetable_page(struct mm_struct *mm)
{
	void *frag = mm->context.pte_frag;
	struct page *pte_page;
	int frag_idx;

	if (frag == NULL)
		return;

	pte_page = virt_to_page(frag);

	/* Offset of the fragment pointer within its page, in fragments. */
	frag_idx = ((unsigned long)frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;

	/*
	 * A PTE page provides PTE_FRAG_NR fragments; drop all the pending
	 * references in one go and bail out if others remain.
	 */
	if (!page_ref_sub_and_test(pte_page, PTE_FRAG_NR - frag_idx))
		return;

	pgtable_page_dtor(pte_page);
	free_hot_cold_page(pte_page, 0);
}

#else
/* No PTE fragment page to release in this configuration. */
static inline void destroy_pagetable_page(struct mm_struct *mm)
{
}
#endif
|
|
|
|
void destroy_context(struct mm_struct *mm)
|
|
{
|
|
#ifdef CONFIG_SPAPR_TCE_IOMMU
|
|
WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
|
|
#endif
|
|
#ifdef CONFIG_PPC_ICSWX
|
|
drop_cop(mm->context.acop, mm);
|
|
kfree(mm->context.cop_lockp);
|
|
mm->context.cop_lockp = NULL;
|
|
#endif /* CONFIG_PPC_ICSWX */
|
|
|
|
if (radix_enabled())
|
|
process_tb[mm->context.id].prtb1 = 0;
|
|
else
|
|
subpage_prot_free(mm);
|
|
destroy_pagetable_page(mm);
|
|
__destroy_context(mm->context.id);
|
|
mm->context.id = MMU_NO_CONTEXT;
|
|
}
|
|
|
|
#ifdef CONFIG_PPC_RADIX_MMU
|
|
/*
 * Switch the radix MMU over to the context of @next.
 *
 * Writes next->context.id to the PID SPR, bracketed by isync instructions,
 * and then issues PPC_SLBIA(0x7). @prev is not used.
 *
 * NOTE(review): the slbia presumably discards translations cached for the
 * previous PID; confirm the meaning of the 0x7 operand against the ISA.
 */
void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
{
	/* Synchronise before changing the PID; also a compiler barrier. */
	asm volatile("isync": : :"memory");
	/* Install the new context id. */
	mtspr(SPRN_PID, next->context.id);
	/* Synchronise again after the PID write, then issue the slbia. */
	asm volatile("isync \n"
		     PPC_SLBIA(0x7)
		     : : :"memory");
}
|
|
#endif
|