linux/arch/s390/include/asm/mmu_context.h
Gerald Schaefer 723cacbd9d s390/mm: fix asce_bits handling with dynamic pagetable levels
There is a race with multi-threaded applications between context switch and
pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and
mm->context.asce_bits, w/o holding any locks. A concurrent mmap with a
pagetable upgrade on another thread in crst_table_upgrade() could already
have set new asce_bits, but not yet the new mm->pgd. This would result in a
corrupt user_asce in switch_mm(), and eventually in a kernel panic from a
translation exception.

Fix this by storing the complete asce instead of just the asce_bits, which
can then be read atomically from switch_mm(), so that it either sees the
old value or the new value, but no mixture. Both cases are OK. Having the
old value would result in a page fault on access to the higher level memory,
but the fault handler would see the new mm->pgd, if it was a valid access
after the mmap on the other thread has completed. So as worst-case scenario
we would have a page fault loop for the racing thread until the next time
slice.

Also remove dead code and simplify the upgrade/downgrade path, there are no
upgrades from 2 levels, and only downgrades from 3 levels for compat tasks.
There are also no concurrent upgrades, because the mmap_sem is held with
down_write() in do_mmap, so the flush and table checks during upgrade can
be removed.

Reported-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2016-04-21 09:50:09 +02:00

168 lines
4.3 KiB
C

/*
* S390 version
*
* Derived from "include/asm-i386/mmu_context.h"
*/
#ifndef __S390_MMU_CONTEXT_H
#define __S390_MMU_CONTEXT_H
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/ctl_reg.h>
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
spin_lock_init(&mm->context.list_lock);
INIT_LIST_HEAD(&mm->context.pgtable_list);
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.attach_count, 0);
mm->context.flush_mm = 0;
#ifdef CONFIG_PGSTE
mm->context.alloc_pgste = page_table_allocate_pgste;
mm->context.has_pgste = 0;
mm->context.use_skey = 0;
#endif
switch (mm->context.asce_limit) {
case 1UL << 42:
/*
* forked 3-level task, fall through to set new asce with new
* mm->pgd
*/
case 0:
/* context created by exec, set asce limit to 4TB */
mm->context.asce_limit = STACK_TOP_MAX;
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
break;
case 1UL << 53:
/* forked 4-level task, set new asce with new mm->pgd */
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
break;
case 1UL << 31:
/* forked 2-level compat task, set new asce with new mm->pgd */
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
/* pgd_alloc() did not increase mm->nr_pmds */
mm_inc_nr_pmds(mm);
}
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
return 0;
}
#define destroy_context(mm) do { } while (0)
static inline void set_user_asce(struct mm_struct *mm)
{
S390_lowcore.user_asce = mm->context.asce;
if (current->thread.mm_segment.ar4)
__ctl_load(S390_lowcore.user_asce, 7, 7);
set_cpu_flag(CIF_ASCE);
}
static inline void clear_user_asce(void)
{
S390_lowcore.user_asce = S390_lowcore.kernel_asce;
__ctl_load(S390_lowcore.user_asce, 1, 1);
__ctl_load(S390_lowcore.user_asce, 7, 7);
}
static inline void load_kernel_asce(void)
{
unsigned long asce;
__ctl_store(asce, 1, 1);
if (asce != S390_lowcore.kernel_asce)
__ctl_load(S390_lowcore.kernel_asce, 1, 1);
set_cpu_flag(CIF_ASCE);
}
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
int cpu = smp_processor_id();
S390_lowcore.user_asce = next->context.asce;
if (prev == next)
return;
if (MACHINE_HAS_TLB_LC)
cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
/* Clear old ASCE by loading the kernel ASCE. */
__ctl_load(S390_lowcore.kernel_asce, 1, 1);
__ctl_load(S390_lowcore.kernel_asce, 7, 7);
atomic_inc(&next->context.attach_count);
atomic_dec(&prev->context.attach_count);
if (MACHINE_HAS_TLB_LC)
cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
}
#define finish_arch_post_lock_switch finish_arch_post_lock_switch
static inline void finish_arch_post_lock_switch(void)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
load_kernel_asce();
if (mm) {
preempt_disable();
while (atomic_read(&mm->context.attach_count) >> 16)
cpu_relax();
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
if (mm->context.flush_mm)
__tlb_flush_mm(mm);
preempt_enable();
}
set_fs(current->thread.mm_segment);
}
#define enter_lazy_tlb(mm,tsk) do { } while (0)
#define deactivate_mm(tsk,mm) do { } while (0)
static inline void activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
switch_mm(prev, next, current);
cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
set_user_asce(next);
}
static inline void arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm)
{
}
static inline void arch_exit_mmap(struct mm_struct *mm)
{
}
static inline void arch_unmap(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
}
static inline void arch_bprm_mm_init(struct mm_struct *mm,
struct vm_area_struct *vma)
{
}
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
{
/* by default, allow everything */
return true;
}
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
/* by default, allow everything */
return true;
}
#endif /* __S390_MMU_CONTEXT_H */