2013-01-18 09:42:19 +00:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* vineetg: May 2011
|
|
|
|
* -Folded PAGE_PRESENT (used by VM) and PAGE_VALID (used by MMU) into 1.
|
|
|
|
* They are semantically the same although in different contexts
|
|
|
|
* VALID marks a TLB entry exists and it will only happen if PRESENT
|
|
|
|
* - Utilise some unused free bits to confine PTE flags to 12 bits
|
|
|
|
* This is a must for 4k pg-sz
|
|
|
|
*
|
|
|
|
* vineetg: Mar 2011 - changes to accommodate MMU TLB Page Descriptor mods
|
|
|
|
* -TLB Locking never really existed, except for initial specs
|
|
|
|
* -SILENT_xxx not needed for our port
|
|
|
|
* -Per my request, MMU V3 changes the layout of some of the bits
|
|
|
|
* to avoid a few shifts in TLB Miss handlers.
|
|
|
|
*
|
|
|
|
* vineetg: April 2010
|
|
|
|
* -PGD entry no longer contains any flags. If empty it is 0, otherwise has
|
|
|
|
* Pg-Tbl ptr. Thus pmd_present(), pmd_valid(), pmd_set( ) become simpler
|
|
|
|
*
|
|
|
|
* vineetg: April 2010
|
|
|
|
* -Switched from 8:11:13 split for page table lookup to 11:8:13
|
|
|
|
* -this speeds up page table allocation itself as we now have to memset 1K
|
|
|
|
* instead of 8k per page table.
|
|
|
|
* -TODO: Right now page table alloc is 8K and rest 7K is unused
|
|
|
|
* need to optimise it
|
|
|
|
*
|
|
|
|
* Amit Bhor, Sameer Dhavale: Codito Technologies 2004
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _ASM_ARC_PGTABLE_H
|
|
|
|
#define _ASM_ARC_PGTABLE_H
|
|
|
|
|
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/mmu.h>
|
|
|
|
#include <asm-generic/pgtable-nopmd.h>
|
|
|
|
|
|
|
|
/**************************************************************************
|
|
|
|
* Page Table Flags
|
|
|
|
*
|
|
|
|
* ARC700 MMU only deals with software managed TLB entries.
|
|
|
|
* Page Tables are purely for Linux VM's consumption and the bits below are
|
|
|
|
* suited to that (uniqueness). Hence some are not implemented in the TLB and
|
|
|
|
* some have different value in TLB.
|
|
|
|
* e.g. MMU v2: K_READ bit is 8 and so is GLOBAL (possible because they live in
|
|
|
|
* separate PD0 and PD1, which combined form a translation entry)
|
|
|
|
* while for PTE perspective, they are 8 and 9 respectively
|
|
|
|
* with MMU v3: Most bits (except SHARED) represent the exact hardware pos
|
|
|
|
* (saves some bit shift ops in TLB Miss hdlrs)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#if (CONFIG_ARC_MMU_VER <= 2)
|
|
|
|
|
|
|
|
#define _PAGE_ACCESSED (1<<1) /* Page is accessed (S) */
|
|
|
|
#define _PAGE_CACHEABLE (1<<2) /* Page is cached (H) */
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-06-17 12:42:13 +00:00
|
|
|
#define _PAGE_EXECUTE (1<<3) /* Page has execute perm; single bit for both K and U modes (H) */
|
|
|
|
#define _PAGE_WRITE (1<<4) /* Page has write perm; single bit for both K and U modes (H) */
|
|
|
|
#define _PAGE_READ (1<<5) /* Page has read perm; single bit for both K and U modes (H) */
|
2013-06-17 14:14:06 +00:00
|
|
|
#define _PAGE_MODIFIED (1<<6) /* Page modified (dirty) (S) */
|
|
|
|
#define _PAGE_FILE (1<<7) /* page cache/ swap (S) */
|
|
|
|
#define _PAGE_GLOBAL (1<<8) /* Page is global (H) */
|
|
|
|
#define _PAGE_PRESENT (1<<10) /* TLB entry is valid (H) */
|
2013-01-18 09:42:19 +00:00
|
|
|
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-06-17 12:42:13 +00:00
|
|
|
#else /* MMU v3 onwards */
|
2013-01-18 09:42:19 +00:00
|
|
|
|
|
|
|
#define _PAGE_CACHEABLE (1<<0) /* Page is cached (H) */
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-06-17 12:42:13 +00:00
|
|
|
#define _PAGE_EXECUTE (1<<1) /* Page has execute perm; single bit for both K and U modes (H) */
|
|
|
|
#define _PAGE_WRITE (1<<2) /* Page has write perm; single bit for both K and U modes (H) */
|
|
|
|
#define _PAGE_READ (1<<3) /* Page has read perm; single bit for both K and U modes (H) */
|
2013-06-17 14:14:06 +00:00
|
|
|
#define _PAGE_ACCESSED (1<<4) /* Page is accessed (S) */
|
|
|
|
#define _PAGE_MODIFIED (1<<5) /* Page modified (dirty) (S) */
|
|
|
|
#define _PAGE_FILE (1<<6) /* page cache/ swap (S) */
|
2013-01-18 09:42:19 +00:00
|
|
|
#define _PAGE_GLOBAL (1<<8) /* Page is global (H) */
|
|
|
|
#define _PAGE_PRESENT (1<<9) /* TLB entry is valid (H) */
|
2013-06-17 14:14:06 +00:00
|
|
|
#define _PAGE_SHARED_CODE (1<<11) /* Shared Code page with cmn vaddr
 usable for shared TLB entries (H) */
|
|
|
|
#endif
|
|
|
|
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-06-17 12:42:13 +00:00
|
|
|
/* vmalloc permissions */
|
|
|
|
#define _K_PAGE_PERMS (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ | \
|
ARC: copy_(to|from)_user() to honor usermode-access permissions
This manifested as grep failing psuedo-randomly:
-------------->8---------------------
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$
[ARCLinux]$ ip address show lo | grep inet
inet 127.0.0.1/8 scope host lo
-------------->8---------------------
ARC700 MMU provides fully orthogonal permission bits per page:
Ur, Uw, Ux, Kr, Kw, Kx
The user mode page permission templates used to have all Kernel mode
access bits enabled.
This caused a tricky race condition observed with uClibc buffered file
read and UNIX pipes.
1. Read access to an anon mapped page in libc .bss: write-protected
zero_page mapped: TLB Entry installed with Ur + K[rwx]
2. grep calls libc:getc() -> buffered read layer calls read(2) with the
internal read buffer in same .bss page.
The read() call is on STDIN which has been redirected to a pipe.
read(2) => sys_read() => pipe_read() => copy_to_user()
3. Since page has Kernel-write permission (despite being user-mode
write-protected), copy_to_user() suceeds w/o taking a MMU TLB-Miss
Exception (page-fault for ARC). core-MM is unaware that kernel
erroneously wrote to the reserved read-only zero-page (BUG #1)
4. Control returns to userspace which now does a write to same .bss page
Since Linux MM is not aware that page has been modified by kernel, it
simply reassigns a new writable zero-init page to mapping, loosing the
prior write by kernel - effectively zero'ing out the libc read buffer
under the hood - hence grep doesn't see right data (BUG #2)
The fix is to make all kernel-mode access permissions mirror the
user-mode ones. Note that the kernel still has full access to pages,
when accessed directly (w/o MMU) - this fix ensures that kernel-mode
access in copy_to_from() path uses the same faulting access model as for
pure user accesses to keep MM fully aware of page state.
The issue is peudo-random because it only shows up if the TLB entry
installed in #1 is present at the time of #3. If it is evicted out, due
to TLB pressure or some-such, then copy_to_user() does take a TLB Miss
Exception, with a routine write-to-anon COW processing installing a
fresh page for kernel writes and also usable as it is in userspace.
Further the issue was dormant for so long as it depends on where the
libc internal read buffer (in .bss) is mapped at runtime.
If it happens to reside in file-backed data mapping of libc (in the
page-aligned slack space trailing the file backed data), loader zero
padding the slack space, does the early cow page replacement, setting
things up at the very beginning itself.
With gcc 4.8 based builds, the libc buffer got pushed out to a real
anon mapping which triggers the issue.
Reported-by: Anton Kolesov <akolesov@synopsys.com>
Cc: <stable@vger.kernel.org> # 3.9
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-05-21 09:55:11 +00:00
|
|
|
_PAGE_GLOBAL | _PAGE_PRESENT)
|
2013-01-18 09:42:19 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_ARC_CACHE_PAGES
|
|
|
|
#define _PAGE_DEF_CACHEABLE _PAGE_CACHEABLE
|
|
|
|
#else
|
|
|
|
#define _PAGE_DEF_CACHEABLE (0)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Helper for every "user" page
|
|
|
|
* -kernel-mode access mirrors the user-mode R/W/X permissions
|
|
|
|
* -by default cached, unless config otherwise
|
|
|
|
* -present in memory
|
|
|
|
*/
|
ARC: copy_(to|from)_user() to honor usermode-access permissions
This manifested as grep failing psuedo-randomly:
-------------->8---------------------
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$
[ARCLinux]$ ip address show lo | grep inet
inet 127.0.0.1/8 scope host lo
-------------->8---------------------
ARC700 MMU provides fully orthogonal permission bits per page:
Ur, Uw, Ux, Kr, Kw, Kx
The user mode page permission templates used to have all Kernel mode
access bits enabled.
This caused a tricky race condition observed with uClibc buffered file
read and UNIX pipes.
1. Read access to an anon mapped page in libc .bss: write-protected
zero_page mapped: TLB Entry installed with Ur + K[rwx]
2. grep calls libc:getc() -> buffered read layer calls read(2) with the
internal read buffer in same .bss page.
The read() call is on STDIN which has been redirected to a pipe.
read(2) => sys_read() => pipe_read() => copy_to_user()
3. Since page has Kernel-write permission (despite being user-mode
write-protected), copy_to_user() suceeds w/o taking a MMU TLB-Miss
Exception (page-fault for ARC). core-MM is unaware that kernel
erroneously wrote to the reserved read-only zero-page (BUG #1)
4. Control returns to userspace which now does a write to same .bss page
Since Linux MM is not aware that page has been modified by kernel, it
simply reassigns a new writable zero-init page to mapping, loosing the
prior write by kernel - effectively zero'ing out the libc read buffer
under the hood - hence grep doesn't see right data (BUG #2)
The fix is to make all kernel-mode access permissions mirror the
user-mode ones. Note that the kernel still has full access to pages,
when accessed directly (w/o MMU) - this fix ensures that kernel-mode
access in copy_to_from() path uses the same faulting access model as for
pure user accesses to keep MM fully aware of page state.
The issue is peudo-random because it only shows up if the TLB entry
installed in #1 is present at the time of #3. If it is evicted out, due
to TLB pressure or some-such, then copy_to_user() does take a TLB Miss
Exception, with a routine write-to-anon COW processing installing a
fresh page for kernel writes and also usable as it is in userspace.
Further the issue was dormant for so long as it depends on where the
libc internal read buffer (in .bss) is mapped at runtime.
If it happens to reside in file-backed data mapping of libc (in the
page-aligned slack space trailing the file backed data), loader zero
padding the slack space, does the early cow page replacement, setting
things up at the very beginning itself.
With gcc 4.8 based builds, the libc buffer got pushed out to a real
anon mapping which triggers the issue.
Reported-by: Anton Kolesov <akolesov@synopsys.com>
Cc: <stable@vger.kernel.org> # 3.9
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-05-21 09:55:11 +00:00
|
|
|
#define ___DEF (_PAGE_PRESENT | _PAGE_DEF_CACHEABLE)
|
|
|
|
|
2013-01-18 09:42:19 +00:00
|
|
|
/* Set of bits not changed in pte_modify */
|
|
|
|
#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_MODIFIED)
|
|
|
|
|
|
|
|
/* More Abbreviated helpers */
|
|
|
|
#define PAGE_U_NONE __pgprot(___DEF)
|
|
|
|
#define PAGE_U_R __pgprot(___DEF | _PAGE_READ)
|
|
|
|
#define PAGE_U_W_R __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE)
|
|
|
|
#define PAGE_U_X_R __pgprot(___DEF | _PAGE_READ | _PAGE_EXECUTE)
|
|
|
|
/* User page with read, write and execute permission on top of ___DEF */
#define PAGE_U_X_W_R __pgprot(___DEF | _PAGE_READ | _PAGE_WRITE | \
				_PAGE_EXECUTE)
|
|
|
|
|
|
|
|
#define PAGE_SHARED PAGE_U_W_R
|
|
|
|
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-06-17 12:42:13 +00:00
|
|
|
/* While kernel runs out of untranslated space, vmalloc/modules use a chunk of
|
|
|
|
* user vaddr space - visible in all addr spaces, but kernel mode only
|
2013-01-18 09:42:19 +00:00
|
|
|
* Thus Global, all-kernel-access, no-user-access, cached
|
|
|
|
*/
|
ARC: copy_(to|from)_user() to honor usermode-access permissions
This manifested as grep failing psuedo-randomly:
-------------->8---------------------
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$
[ARCLinux]$ ip address show lo | grep inet
inet 127.0.0.1/8 scope host lo
-------------->8---------------------
ARC700 MMU provides fully orthogonal permission bits per page:
Ur, Uw, Ux, Kr, Kw, Kx
The user mode page permission templates used to have all Kernel mode
access bits enabled.
This caused a tricky race condition observed with uClibc buffered file
read and UNIX pipes.
1. Read access to an anon mapped page in libc .bss: write-protected
zero_page mapped: TLB Entry installed with Ur + K[rwx]
2. grep calls libc:getc() -> buffered read layer calls read(2) with the
internal read buffer in same .bss page.
The read() call is on STDIN which has been redirected to a pipe.
read(2) => sys_read() => pipe_read() => copy_to_user()
3. Since page has Kernel-write permission (despite being user-mode
write-protected), copy_to_user() suceeds w/o taking a MMU TLB-Miss
Exception (page-fault for ARC). core-MM is unaware that kernel
erroneously wrote to the reserved read-only zero-page (BUG #1)
4. Control returns to userspace which now does a write to same .bss page
Since Linux MM is not aware that page has been modified by kernel, it
simply reassigns a new writable zero-init page to mapping, loosing the
prior write by kernel - effectively zero'ing out the libc read buffer
under the hood - hence grep doesn't see right data (BUG #2)
The fix is to make all kernel-mode access permissions mirror the
user-mode ones. Note that the kernel still has full access to pages,
when accessed directly (w/o MMU) - this fix ensures that kernel-mode
access in copy_to_from() path uses the same faulting access model as for
pure user accesses to keep MM fully aware of page state.
The issue is peudo-random because it only shows up if the TLB entry
installed in #1 is present at the time of #3. If it is evicted out, due
to TLB pressure or some-such, then copy_to_user() does take a TLB Miss
Exception, with a routine write-to-anon COW processing installing a
fresh page for kernel writes and also usable as it is in userspace.
Further the issue was dormant for so long as it depends on where the
libc internal read buffer (in .bss) is mapped at runtime.
If it happens to reside in file-backed data mapping of libc (in the
page-aligned slack space trailing the file backed data), loader zero
padding the slack space, does the early cow page replacement, setting
things up at the very beginning itself.
With gcc 4.8 based builds, the libc buffer got pushed out to a real
anon mapping which triggers the issue.
Reported-by: Anton Kolesov <akolesov@synopsys.com>
Cc: <stable@vger.kernel.org> # 3.9
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-05-21 09:55:11 +00:00
|
|
|
#define PAGE_KERNEL __pgprot(_K_PAGE_PERMS | _PAGE_DEF_CACHEABLE)
|
2013-01-18 09:42:19 +00:00
|
|
|
|
|
|
|
/* ioremap */
|
ARC: copy_(to|from)_user() to honor usermode-access permissions
This manifested as grep failing psuedo-randomly:
-------------->8---------------------
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$ ip address show lo | grep inet
[ARCLinux]$
[ARCLinux]$ ip address show lo | grep inet
inet 127.0.0.1/8 scope host lo
-------------->8---------------------
ARC700 MMU provides fully orthogonal permission bits per page:
Ur, Uw, Ux, Kr, Kw, Kx
The user mode page permission templates used to have all Kernel mode
access bits enabled.
This caused a tricky race condition observed with uClibc buffered file
read and UNIX pipes.
1. Read access to an anon mapped page in libc .bss: write-protected
zero_page mapped: TLB Entry installed with Ur + K[rwx]
2. grep calls libc:getc() -> buffered read layer calls read(2) with the
internal read buffer in same .bss page.
The read() call is on STDIN which has been redirected to a pipe.
read(2) => sys_read() => pipe_read() => copy_to_user()
3. Since page has Kernel-write permission (despite being user-mode
write-protected), copy_to_user() suceeds w/o taking a MMU TLB-Miss
Exception (page-fault for ARC). core-MM is unaware that kernel
erroneously wrote to the reserved read-only zero-page (BUG #1)
4. Control returns to userspace which now does a write to same .bss page
Since Linux MM is not aware that page has been modified by kernel, it
simply reassigns a new writable zero-init page to mapping, loosing the
prior write by kernel - effectively zero'ing out the libc read buffer
under the hood - hence grep doesn't see right data (BUG #2)
The fix is to make all kernel-mode access permissions mirror the
user-mode ones. Note that the kernel still has full access to pages,
when accessed directly (w/o MMU) - this fix ensures that kernel-mode
access in copy_to_from() path uses the same faulting access model as for
pure user accesses to keep MM fully aware of page state.
The issue is peudo-random because it only shows up if the TLB entry
installed in #1 is present at the time of #3. If it is evicted out, due
to TLB pressure or some-such, then copy_to_user() does take a TLB Miss
Exception, with a routine write-to-anon COW processing installing a
fresh page for kernel writes and also usable as it is in userspace.
Further the issue was dormant for so long as it depends on where the
libc internal read buffer (in .bss) is mapped at runtime.
If it happens to reside in file-backed data mapping of libc (in the
page-aligned slack space trailing the file backed data), loader zero
padding the slack space, does the early cow page replacement, setting
things up at the very beginning itself.
With gcc 4.8 based builds, the libc buffer got pushed out to a real
anon mapping which triggers the issue.
Reported-by: Anton Kolesov <akolesov@synopsys.com>
Cc: <stable@vger.kernel.org> # 3.9
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-05-21 09:55:11 +00:00
|
|
|
#define PAGE_KERNEL_NO_CACHE __pgprot(_K_PAGE_PERMS)
|
2013-01-18 09:42:19 +00:00
|
|
|
|
2013-05-14 07:58:17 +00:00
|
|
|
/* Masks for actual TLB "PD"s */
|
ARC: MMUv4 preps/1 - Fold PTE K/U access flags
The current ARC VM code has 13 flags in Page Table entry: some software
(accesed/dirty/non-linear-maps) and rest hardware specific. With 8k MMU
page, we need 19 bits for addressing page frame so remaining 13 bits is
just about enough to accomodate the current flags.
In MMUv4 there are 2 additional flags, SZ (normal or super page) and WT
(cache access mode write-thru) - and additionally PFN is 20 bits (vs. 19
before for 8k). Thus these can't be held in current PTE w/o making each
entry 64bit wide.
It seems there is some scope of compressing the current PTE flags (and
freeing up a few bits). Currently PTE contains fully orthogonal distinct
access permissions for kernel and user mode (Kr, Kw, Kx; Ur, Uw, Ux)
which can be folded into one set (R, W, X). The translation of 3 PTE
bits into 6 TLB bits (when programming the MMU) can be done based on
following pre-requites/assumptions:
1. For kernel-mode-only translations (vmalloc: 0x7000_0000 to
0x7FFF_FFFF), PTE additionally has PAGE_GLOBAL flag set (and user
space entries can never be global). Thus such a PTE can translate
to Kr, Kw, Kx (as appropriate) and zero for User mode counterparts.
2. For non global entries, the PTE flags can be used to create mirrored
K and U TLB bits. This is true after commit a950549c675f2c8c504
"ARC: copy_(to|from)_user() to honor usermode-access permissions"
which ensured that user-space translations _MUST_ have same access
permissions for both U/K mode accesses so that copy_{to,from}_user()
play fair with fault based CoW break and such...
There is no such thing as free lunch - the cost is slightly infalted
TLB-Miss Handlers.
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
2013-06-17 12:42:13 +00:00
|
|
|
#define PTE_BITS_IN_PD0 (_PAGE_GLOBAL | _PAGE_PRESENT)
|
|
|
|
#define PTE_BITS_RWX (_PAGE_EXECUTE | _PAGE_WRITE | _PAGE_READ)
|
|
|
|
#define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK | _PAGE_CACHEABLE)
|
2013-05-14 07:58:17 +00:00
|
|
|
|
2013-01-18 09:42:19 +00:00
|
|
|
/**************************************************************************
|
|
|
|
* Mapping of vm_flags (Generic VM) to PTE flags (arch specific)
|
|
|
|
*
|
|
|
|
* Certain cases have 1:1 mapping
|
|
|
|
* e.g. __P101 means VM_READ, VM_EXEC and !VM_SHARED
|
|
|
|
* which directly corresponds to PAGE_U_X_R
|
|
|
|
*
|
|
|
|
* Other rules which cause the divergence from 1:1 mapping
|
|
|
|
*
|
|
|
|
* 1. Although ARC700 can do exclusive execute/write protection (meaning R
|
|
|
|
* can be tracked independent of X/W unlike some other CPUs), still to
|
|
|
|
* keep things consistent with other archs:
|
|
|
|
* -Write implies Read: W => R
|
|
|
|
* -Execute implies Read: X => R
|
|
|
|
*
|
|
|
|
* 2. Pvt Writable doesn't have Write Enabled initially: Pvt-W => !W
|
|
|
|
* This is to enable COW mechanism
|
|
|
|
*/
|
|
|
|
/* xwr */
|
|
|
|
#define __P000 PAGE_U_NONE
|
|
|
|
#define __P001 PAGE_U_R
|
|
|
|
#define __P010 PAGE_U_R /* Pvt-W => !W */
|
|
|
|
#define __P011 PAGE_U_R /* Pvt-W => !W */
|
|
|
|
#define __P100 PAGE_U_X_R /* X => R */
|
|
|
|
#define __P101 PAGE_U_X_R
|
|
|
|
#define __P110 PAGE_U_X_R /* Pvt-W => !W and X => R */
|
|
|
|
#define __P111 PAGE_U_X_R /* Pvt-W => !W */
|
|
|
|
|
|
|
|
#define __S000 PAGE_U_NONE
|
|
|
|
#define __S001 PAGE_U_R
|
|
|
|
#define __S010 PAGE_U_W_R /* W => R */
|
|
|
|
#define __S011 PAGE_U_W_R
|
|
|
|
#define __S100 PAGE_U_X_R /* X => R */
|
|
|
|
#define __S101 PAGE_U_X_R
|
|
|
|
#define __S110 PAGE_U_X_W_R /* X => R */
|
|
|
|
#define __S111 PAGE_U_X_W_R
|
|
|
|
|
|
|
|
/****************************************************************
|
|
|
|
* Page Table Lookup split
|
|
|
|
*
|
|
|
|
* We implement 2 tier paging and since this is all software, we are free
|
|
|
|
* to customize the span of a PGD / PTE entry to suit us
|
|
|
|
*
|
|
|
|
* 32 bit virtual address
|
|
|
|
* -------------------------------------------------------
|
|
|
|
* | BITS_FOR_PGD | BITS_FOR_PTE | BITS_IN_PAGE |
|
|
|
|
* -------------------------------------------------------
|
|
|
|
* | | |
|
|
|
|
* | | --> off in page frame
|
|
|
|
* | |
|
|
|
|
* | ---> index into Page Table
|
|
|
|
* |
|
|
|
|
* ----> index into Page Directory
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define BITS_IN_PAGE PAGE_SHIFT
|
|
|
|
|
|
|
|
/* Optimal Sizing of Pg Tbl - based on MMU page size */
|
|
|
|
#if defined(CONFIG_ARC_PAGE_SIZE_8K)
|
|
|
|
#define BITS_FOR_PTE 8
|
|
|
|
#elif defined(CONFIG_ARC_PAGE_SIZE_16K)
|
|
|
|
#define BITS_FOR_PTE 8
|
|
|
|
#elif defined(CONFIG_ARC_PAGE_SIZE_4K)
|
|
|
|
#define BITS_FOR_PTE 9
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define BITS_FOR_PGD (32 - BITS_FOR_PTE - BITS_IN_PAGE)
|
|
|
|
|
|
|
|
#define PGDIR_SHIFT (BITS_FOR_PTE + BITS_IN_PAGE)
|
|
|
|
#define PGDIR_SIZE (1UL << PGDIR_SHIFT) /* vaddr span, not PGD sz */
|
|
|
|
#define PGDIR_MASK (~(PGDIR_SIZE-1))
|
|
|
|
|
|
|
|
#ifdef __ASSEMBLY__
|
|
|
|
#define PTRS_PER_PTE (1 << BITS_FOR_PTE)
|
|
|
|
#define PTRS_PER_PGD (1 << BITS_FOR_PGD)
|
|
|
|
#else
|
|
|
|
#define PTRS_PER_PTE (1UL << BITS_FOR_PTE)
|
|
|
|
#define PTRS_PER_PGD (1UL << BITS_FOR_PGD)
|
|
|
|
#endif
|
|
|
|
/*
|
|
|
|
* Number of PGD entries a userland program uses.
|
|
|
|
* TASK_SIZE is the maximum vaddr that can be used by a userland program.
|
|
|
|
*/
|
|
|
|
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
|
|
|
|
|
|
|
|
/*
 * Lowest virtual address at which a user space mapping may be placed;
 * there are no special requirements, so it is 0.
 */
#define FIRST_USER_ADDRESS	0
|
|
|
|
|
|
|
|
|
|
|
|
/****************************************************************
|
|
|
|
* Bucket load of VM Helpers
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef __ASSEMBLY__
|
|
|
|
|
|
|
|
/*
 * Log a bad PTE / PGD entry at critical level, tagged with the file and
 * line of the reporting site and the raw entry value in hex.
 */
#define pte_ERROR(e)							\
	pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))

#define pgd_ERROR(e)							\
	pr_crit("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
|
|
|
|
|
|
|
|
/* the zero page used for uninitialized and anonymous pages */
|
|
|
|
extern char empty_zero_page[PAGE_SIZE];
|
|
|
|
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
|
|
|
|
|
|
|
|
/* Unmapping a page table is a no-op here: both expand to an empty statement */
#define pte_unmap(pte)		do { } while (0)
#define pte_unmap_nested(pte)	do { } while (0)
|
|
|
|
|
|
|
|
/* Store an entry value through a pointer to a PTE / PMD slot */
#define set_pte(pteptr, pteval)	(*(pteptr) = (pteval))
#define set_pmd(pmdptr, pmdval)	(*(pmdptr) = (pmdval))
|
|
|
|
|
|
|
|
/*
 * Logical address (phy for ARC) of the Page Table referenced by a PMD
 * entry: the entry value with the in-page offset bits masked off.
 */
#define pmd_page_vaddr(pmd)	(pmd_val(pmd) & PAGE_MASK)

/* Page descriptor of the Page Table referenced by a PMD entry */
#define pmd_page(pmd)		virt_to_page(pmd_page_vaddr(pmd))
|
|
|
|
|
|
|
|
/* In a 2 level sys, setup the PGD entry with PTE value */
|
|
|
|
static inline void pmd_set(pmd_t *pmdp, pte_t *ptep)
|
|
|
|
{
|
|
|
|
pmd_val(*pmdp) = (unsigned long)ptep;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* An all-zero PTE is "none"; presence is the _PAGE_PRESENT bit */
#define pte_none(x)			(pte_val(x) == 0)
#define pte_present(x)			(pte_val(x) & _PAGE_PRESENT)

/* Clearing a PTE writes a zero entry through set_pte_at() */
#define pte_clear(mm, addr, ptep)	set_pte_at(mm, addr, ptep, __pte(0))
|
|
|
|
|
|
|
|
/*
 * PMD entry predicates: an entry is "none" when zero and "present" when
 * non-zero; it is "bad" when any bit below the page boundary is set.
 */
#define pmd_none(x)	(pmd_val(x) == 0)
#define pmd_bad(x)	(pmd_val(x) & ~PAGE_MASK)
#define pmd_present(x)	(pmd_val(x))

/* Zero the PMD entry that @xp points to */
#define pmd_clear(xp)	do { pmd_val(*(xp)) = 0; } while (0)
|
|
|
|
|
|
|
|
/*
 * Page descriptor for the frame a PTE refers to: mem_map is indexed with
 * the PTE value rebased by PAGE_OFFSET and scaled down by PAGE_SHIFT.
 */
#define pte_page(x)							\
	(&mem_map[(unsigned long)((pte_val(x) - PAGE_OFFSET) >> PAGE_SHIFT)])
|
|
|
|
|
|
|
|
/*
 * Build a PTE for @page with protection @pgprot: the physical address of
 * the page plus the protection bits.
 *
 * Written as a plain expression rather than a statement expression with a
 * local named "pte": such a local would shadow any caller argument that
 * itself mentions a variable called "pte" (e.g. mk_pte(pg, pte_pgprot(pte))).
 */
#define mk_pte(page, pgprot)						\
	(__pte(__pa(page_address(page)) + pgprot_val(pgprot)))
|
|
|
|
|
|
|
|
/* TBD: Non linear mapping stuff */
|
|
|
|
static inline int pte_file(pte_t pte)
|
|
|
|
{
|
|
|
|
return pte_val(pte) & _PAGE_FILE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * File-offset <-> PTE conversion for non linear mappings.
 * NOTE(review): pgoff_to_pte() stores the offset unshifted while
 * pte_to_pgoff() shifts right by 2, so the pair does not round-trip;
 * presumably part of the "TBD" non linear mapping work - confirm.
 */
#define PTE_FILE_MAX_BITS	30
#define pgoff_to_pte(x)		__pte(x)
#define pte_to_pgoff(x)		(pte_val(x) >> 2)

/* Page frame number <-> PTE value */
#define pte_pfn(pte)		(pte_val(pte) >> PAGE_SHIFT)
#define pfn_pte(pfn, prot)	(__pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)))

/* Index of @addr's entry within its Page Table */
#define __pte_index(addr)	(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
|
|
|
|
|
|
|
|
/*
 * pte_offset gets a @dir ptr to PMD entry (PGD in our 2-tier paging system)
 * and returns ptr to PTE entry corresponding to @addr.
 *
 * @dir is parenthesized before being dereferenced so that an expression
 * argument such as "pmd + 1" is dereferenced as a whole.
 */
#define pte_offset(dir, addr)						\
	((pte_t *)(pmd_page_vaddr(*(dir))) + __pte_index(addr))

/* No mapping of Page Tables in high mem etc, so following same as above */
#define pte_offset_kernel(dir, addr)	pte_offset(dir, addr)
#define pte_offset_map(dir, addr)	pte_offset(dir, addr)
|
|
|
|
|
|
|
|
/*
 * Zoo of pte_xxx predicates: each one masks a single flag out of the PTE
 * value, except pte_special() which is constant 0.
 */
#define pte_read(pte)		(pte_val(pte) & _PAGE_READ)
#define pte_write(pte)		(pte_val(pte) & _PAGE_WRITE)
#define pte_dirty(pte)		(pte_val(pte) & _PAGE_MODIFIED)
#define pte_young(pte)		(pte_val(pte) & _PAGE_ACCESSED)
#define pte_special(pte)	(0)
|
|
|
|
|
|
|
|
#define PTE_BIT_FUNC(fn, op) \
|
|
|
|
static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; }
|
|
|
|
|
|
|
|
PTE_BIT_FUNC(wrprotect, &= ~(_PAGE_WRITE));
|
|
|
|
PTE_BIT_FUNC(mkwrite, |= (_PAGE_WRITE));
|
|
|
|
PTE_BIT_FUNC(mkclean, &= ~(_PAGE_MODIFIED));
|
|
|
|
PTE_BIT_FUNC(mkdirty, |= (_PAGE_MODIFIED));
|
|
|
|
PTE_BIT_FUNC(mkold, &= ~(_PAGE_ACCESSED));
|
|
|
|
PTE_BIT_FUNC(mkyoung, |= (_PAGE_ACCESSED));
|
|
|
|
PTE_BIT_FUNC(exprotect, &= ~(_PAGE_EXECUTE));
|
|
|
|
PTE_BIT_FUNC(mkexec, |= (_PAGE_EXECUTE));
|
|
|
|
|
|
|
|
/* Returns @pte unchanged (pte_special() is constant 0 in this file) */
static inline pte_t pte_mkspecial(pte_t pte)
{
	return pte;
}
|
|
|
|
|
|
|
|
/*
 * Apply a new protection to @pte: only the bits selected by
 * _PAGE_CHG_MASK are kept from the old value, then @newprot is OR-ed in.
 */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
	unsigned long kept = pte_val(pte) & _PAGE_CHG_MASK;

	return __pte(kept | pgprot_val(newprot));
}
|
|
|
|
|
|
|
|
/* Mark a page protection as uncacheable by clearing _PAGE_CACHEABLE */
#define pgprot_noncached(prot)	\
	(__pgprot(pgprot_val(prot) & ~_PAGE_CACHEABLE))
|
|
|
|
|
|
|
|
/*
 * Install @pteval in the slot @ptep. @mm and @addr are accepted but not
 * used: this is a plain store through the pointer.
 */
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pteval)
{
	*ptep = pteval;
}
|
|
|
|
|
|
|
|
/* Index of @addr's entry in the Page Directory, and a pointer to it */
#define pgd_index(addr)		((addr) >> PGDIR_SHIFT)
#define pgd_offset(mm, addr)	(&(mm)->pgd[pgd_index(addr)])

/*
 * All kernel related VM pages are in init's mm.
 */
#define pgd_offset_k(address)	pgd_offset(&init_mm, address)
|
|
|
|
|
|
|
|
/*
 * Macro to quickly access the PGD entry, utilising the fact that some
 * arch may cache the pointer to Page Directory of "current" task
 * in a MMU register.
 *
 * Thus task->mm->pgd (3 pointer dereferences, cache misses etc) simply
 * becomes a register read.
 *
 * ********CAUTION*******:
 * Kernel code might be dealing with some mm_struct of NON "current".
 * Thus use this macro only when you are certain that "current" is current,
 * e.g. when dealing with signal frame setup code etc.
 *
 * With CONFIG_SMP the register is not used and this is plain pgd_offset().
 */
#ifdef CONFIG_SMP
#define pgd_offset_fast(mm, addr)	pgd_offset(mm, addr)
#else
#define pgd_offset_fast(mm, addr)					\
	((pgd_t *)read_aux_reg(ARC_REG_SCRATCH_DATA0) + pgd_index(addr))
#endif
|
2013-01-18 09:42:19 +00:00
|
|
|
|
|
|
|
extern void paging_init(void);
|
|
|
|
extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE);
|
|
|
|
void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
|
|
|
|
pte_t *ptep);
|
|
|
|
|
|
|
|
/*
 * Encode swap {type,off} tuple into PTE
 * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that
 * both PAGE_FILE and PAGE_PRESENT are zero in a PTE holding swap "identifier"
 */
#define __swp_entry(type, off)	((swp_entry_t) { \
					((type) & 0x1f) | ((off) << 13) })

/*
 * Decode a PTE containing swap "identifier" into constituents.
 * The offset was stored shifted LEFT by 13 in __swp_entry(), so it has to
 * be shifted RIGHT by 13 to recover it.
 */
#define __swp_type(pte_lookalike)	(((pte_lookalike).val) & 0x1f)
#define __swp_offset(pte_lookalike)	((pte_lookalike).val >> 13)
|
|
|
|
|
|
|
|
/*
 * NOPs, to keep generic kernel happy: the raw value is carried over
 * unchanged between the pte_t and swp_entry_t wrappers.
 */
#define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x)	((pte_t) { (x).val })
|
|
|
|
|
|
|
|
/* Every address is reported as valid: constant 1, @addr is not examined */
#define kern_addr_valid(addr)	(1)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* remap a physical page `pfn' of size `size' with page protection `prot'
|
|
|
|
* into virtual address `from'
|
|
|
|
*/
|
|
|
|
#include <asm-generic/pgtable.h>
|
|
|
|
|
2013-05-09 13:50:43 +00:00
|
|
|
/* Defined to cope with the aliasing VIPT cache */
#define HAVE_ARCH_UNMAPPED_AREA
|
|
|
|
|
2013-01-18 09:42:19 +00:00
|
|
|
/*
 * There are no page table caches to initialise, so this expands to an
 * empty statement.
 */
#define pgtable_cache_init()	do { } while (0)
|
|
|
|
|
|
|
|
#endif /* __ASSEMBLY__ */
|
|
|
|
|
|
|
|
#endif
|