s390/mm: better annotate 2KB pagetable fragments handling

Explicitly encode the immediate values of the pending-remove
nibble (bits 31-28) and the tracking nibble (bits 27-24) of
the page refcount whenever these nibbles are tested or
changed, for better readability. Also add some comments
describing how the fragments are handled.

Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
@@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page)
 #endif /* CONFIG_PGSTE */
 
 /*
- * page table entry allocation/free routines.
+ * A 2KB-pgtable is either upper or lower half of a normal page.
+ * The second half of the page may be unused or used as another
+ * 2KB-pgtable.
+ *
+ * Whenever possible the parent page for a new 2KB-pgtable is picked
+ * from the list of partially allocated pages mm_context_t::pgtable_list.
+ * In case the list is empty a new parent page is allocated and added to
+ * the list.
+ *
+ * When a parent page gets fully allocated it contains 2KB-pgtables in both
+ * upper and lower halves and is removed from mm_context_t::pgtable_list.
+ *
+ * When a 2KB-pgtable is freed from the fully allocated parent page that
+ * page turns partially allocated and is added to mm_context_t::pgtable_list.
+ *
+ * If a 2KB-pgtable is freed from the partially allocated parent page that
+ * page turns unused and gets removed from mm_context_t::pgtable_list.
+ * Furthermore, the unused parent page is released.
+ *
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+ * The upper byte (bits 24-31) of the parent page _refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ *   PP  AA
+ * 01234567    upper byte (bits 24-31) of struct page::_refcount
+ *   ||  ||
+ *   ||  |+--- upper 2KB-pgtable is allocated
+ *   ||  +---- lower 2KB-pgtable is allocated
+ *   |+------- upper 2KB-pgtable is pending for removal
+ *   +-------- lower 2KB-pgtable is pending for removal
+ *
+ * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+ * using _refcount is possible).
+ *
+ * When a 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still unallocated;
+ *   - removed from mm_context_t::pgtable_list in case both halves of the
+ *     parent page are allocated;
+ * These operations are protected by mm_context_t::lock.
+ *
+ * When a 2KB-pgtable is deallocated the corresponding AA bit is set to 0
+ * and the corresponding PP bit is set to 1 in a single atomic operation.
+ * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
+ * exclusive and may never be both set to 1!
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still allocated;
+ *   - removed from mm_context_t::pgtable_list in case the second half of
+ *     the parent page is unallocated;
+ * These operations are protected by mm_context_t::lock.
+ *
+ * It is important to understand that mm_context_t::lock only protects
+ * mm_context_t::pgtable_list and AA bits, but not the parent page itself
+ * and PP bits.
+ *
+ * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
+ * while both AA bits and the second PP bit are already unset. Then the
+ * parent page does not contain any 2KB-pgtable fragment anymore, and it has
+ * also been removed from mm_context_t::pgtable_list. It is therefore safe
+ * to release the page.
+ *
+ * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
+ * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
+ * while the PP bits are never used, nor is such a page added to or removed
+ * from mm_context_t::pgtable_list.
+ */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
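
The comment above describes a small state machine in the top byte of _refcount. The following stand-alone, user-space sketch (hypothetical xor_bits() helper and scaffolding; only the mask values 0x01U/0x10U/0x11U shifted by bit + 24 mirror the kernel code) walks one parent page through allocation of both halves and their two-step release:

    #include <assert.h>
    #include <stdio.h>

    /* Stands in for the upper byte handling of struct page::_refcount. */
    static unsigned int refcount;

    /* Hypothetical stand-in for the kernel's atomic_xor_bits(). */
    static unsigned int xor_bits(unsigned int *v, unsigned int bits)
    {
            *v ^= bits;
            return *v;
    }

    int main(void)
    {
            unsigned int mask;

            /* page_table_alloc(): take half 0, then half 1 (AA bits). */
            xor_bits(&refcount, 0x01U << (0 + 24));
            xor_bits(&refcount, 0x01U << (1 + 24));
            assert((refcount >> 24) == 0x03U);      /* fully allocated */

            /* page_table_free() of half 0: clear AA, set PP in one xor. */
            mask = xor_bits(&refcount, 0x11U << (0 + 24)) >> 24;
            assert(mask == 0x12U);  /* AA and PP never both set per half */

            /* Second step: drop PP; byte is non-zero, page survives. */
            mask = xor_bits(&refcount, 0x10U << (0 + 24)) >> 24;
            assert(mask == 0x02U);

            /* The same two steps for half 1 empty the byte entirely. */
            xor_bits(&refcount, 0x11U << (1 + 24));
            mask = xor_bits(&refcount, 0x10U << (1 + 24)) >> 24;
            assert(mask == 0x00U);  /* page could be released now */

            printf("upper byte ends at %#x\n", refcount >> 24);
            return 0;
    }

Note how no intermediate value in this trace ever has the AA and PP bits of the same half set at once, which is exactly the invariant the comment calls out.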
@@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
                         page = list_first_entry(&mm->context.pgtable_list,
                                                 struct page, lru);
                         mask = atomic_read(&page->_refcount) >> 24;
-                        mask = (mask | (mask >> 4)) & 3;
-                        if (mask != 3) {
+                        /*
+                         * The pending removal bits must also be checked.
+                         * Failure to do so might lead to an impossible
+                         * value (i.e. 0x13 or 0x23) being written to
+                         * _refcount. Such values violate the assumption
+                         * that pending and allocation bits are mutually
+                         * exclusive, and the rest of the code goes off the
+                         * rails as a result, risking races and corruptions.
+                         */
+                        mask = (mask | (mask >> 4)) & 0x03U;
+                        if (mask != 0x03U) {
                                 table = (unsigned long *) page_to_virt(page);
                                 bit = mask & 1;        /* =1 -> second 2K */
                                 if (bit)
                                         table += PTRS_PER_PTE;
                                 atomic_xor_bits(&page->_refcount,
                                                 0x01U << (bit + 24));
                                 list_del(&page->lru);
                         }
                 }
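
To see why the new comment insists on folding the PP nibble into the check, consider an upper byte of 0x12, that is one half allocated and the other pending removal. A stand-alone illustration, not kernel code:

    #include <stdio.h>

    int main(void)
    {
            /* Upper byte 0x12: one half allocated (AA nibble 0x2),
             * the other pending removal (PP nibble 0x1). */
            unsigned int byte = 0x12U;

            unsigned int naive  = byte & 0x03U;                  /* AA only */
            unsigned int folded = (byte | (byte >> 4)) & 0x03U;  /* AA | PP */

            /* naive == 0x02: the pending half looks free; allocating it
             * would xor 0x01 into the byte and yield the impossible 0x13. */
            printf("naive %#x -> byte would become %#x\n", naive, byte ^ 0x01U);

            /* folded == 0x03: the page is treated as full and skipped. */
            printf("folded %#x -> page left alone\n", folded);
            return 0;
    }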
@@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
         table = (unsigned long *) page_to_virt(page);
         if (mm_alloc_pgste(mm)) {
                 /* Return 4K page table with PGSTEs */
-                atomic_xor_bits(&page->_refcount, 3 << 24);
+                atomic_xor_bits(&page->_refcount, 0x03U << 24);
                 memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
                 memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
         } else {
                 /* Return the first 2K fragment of the page */
-                atomic_xor_bits(&page->_refcount, 1 << 24);
+                atomic_xor_bits(&page->_refcount, 0x01U << 24);
                 memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
                 spin_lock_bh(&mm->context.lock);
                 list_add(&page->lru, &mm->context.pgtable_list);
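
A side note on this fresh-page branch: the PGSTE case invalidates the first PTRS_PER_PTE entries and zeroes the PGSTE area behind them, while the 2K case invalidates the whole page up front so the second fragment is ready to hand out later. A stand-alone sketch, assuming s390 geometry (PTRS_PER_PTE = 256, 8-byte entries) and an illustrative _PAGE_INVALID value:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #define PTRS_PER_PTE    256             /* s390: 2K pgtable, 8-byte ptes */
    #define _PAGE_INVALID   0x400ULL        /* illustrative value only */

    /* Minimal stand-in for the kernel's memset64(). */
    static void memset64(uint64_t *s, uint64_t v, size_t count)
    {
            while (count--)
                    *s++ = v;
    }

    int main(void)
    {
            static uint64_t page[2 * PTRS_PER_PTE]; /* one 4KB parent page */

            /* PGSTE case: 256 invalid ptes, then 256 zeroed pgste slots. */
            memset64(page, _PAGE_INVALID, PTRS_PER_PTE);
            memset64(page + PTRS_PER_PTE, 0, PTRS_PER_PTE);
            assert(page[0] == _PAGE_INVALID && page[PTRS_PER_PTE] == 0);

            /* 2K-fragment case: invalidate the whole page up front, so the
             * second half is ready when page_table_alloc() hands it out. */
            memset64(page, _PAGE_INVALID, 2 * PTRS_PER_PTE);
            assert(page[2 * PTRS_PER_PTE - 1] == _PAGE_INVALID);
            return 0;
    }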
@@ -244,19 +321,24 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
                 /* Free 2K page table fragment of a 4K page */
                 bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
                 spin_lock_bh(&mm->context.lock);
+                /*
+                 * Mark the page for delayed release. The actual release
+                 * will happen outside of the critical section from this
+                 * function or from __tlb_remove_table()
+                 */
                 mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
                 mask >>= 24;
-                if (mask & 3)
+                if (mask & 0x03U)
                         list_add(&page->lru, &mm->context.pgtable_list);
                 else
                         list_del(&page->lru);
                 spin_unlock_bh(&mm->context.lock);
                 mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
                 mask >>= 24;
-                if (mask != 0)
+                if (mask != 0x00U)
                         return;
         } else {
-                atomic_xor_bits(&page->_refcount, 3U << 24);
+                atomic_xor_bits(&page->_refcount, 0x03U << 24);
         }
 
         pgtable_pte_page_dtor(page);
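
The two xors above are the delayed release in action: the first clears AA and sets PP while the list is updated under the lock, the second clears PP outside of it, and only a caller that sees the byte reach zero frees the page. A user-space sketch of the same sequence (hypothetical names; s390 geometry assumed: 4KB pages, 2KB fragments of 256 8-byte entries), including how the half index is derived from the table address:

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_MASK       (~0xfffUL)      /* assumed 4KB pages */
    #define FRAGMENT_SIZE   (256 * 8)       /* assumed 2KB fragments */

    static unsigned int refcount = 0x03U << 24;     /* both halves allocated */

    /* Hypothetical stand-in for the kernel's atomic_xor_bits(). */
    static unsigned int xor_bits(unsigned int *v, unsigned int bits)
    {
            *v ^= bits;
            return *v;
    }

    static void free_fragment(unsigned long table)
    {
            /* Half index from the offset within the page: 0 or 1. */
            unsigned int bit = (table & ~PAGE_MASK) / FRAGMENT_SIZE;
            unsigned int mask;

            /* Step 1 (under mm_context_t::lock in the kernel): one xor
             * clears the AA bit and sets the PP bit - delayed release. */
            mask = xor_bits(&refcount, 0x11U << (bit + 24)) >> 24;
            printf("half %u pending, upper byte %#x\n", bit, mask);

            /* Step 2 (outside the lock): drop PP; zero means last user. */
            mask = xor_bits(&refcount, 0x10U << (bit + 24)) >> 24;
            if (mask == 0x00U)
                    printf("upper byte empty - page would be freed\n");
    }

    int main(void)
    {
            unsigned long page = 0x10000UL; /* fake page-aligned address */

            free_fragment(page);                    /* half 0 */
            free_fragment(page + FRAGMENT_SIZE);    /* half 1 */
            assert(refcount == 0);
            return 0;
    }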
@@ -274,43 +356,48 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
         page = virt_to_page(table);
         if (mm_alloc_pgste(mm)) {
                 gmap_unlink(mm, table, vmaddr);
-                table = (unsigned long *) ((unsigned long)table | 3);
+                table = (unsigned long *) ((unsigned long)table | 0x03U);
                 tlb_remove_table(tlb, table);
                 return;
         }
         bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
         spin_lock_bh(&mm->context.lock);
+        /*
+         * Mark the page for delayed release. The actual release will happen
+         * outside of the critical section from __tlb_remove_table() or from
+         * page_table_free()
+         */
         mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
         mask >>= 24;
-        if (mask & 3)
+        if (mask & 0x03U)
                 list_add_tail(&page->lru, &mm->context.pgtable_list);
         else
                 list_del(&page->lru);
         spin_unlock_bh(&mm->context.lock);
-        table = (unsigned long *) ((unsigned long) table | (1U << bit));
+        table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
         tlb_remove_table(tlb, table);
 }
 
 void __tlb_remove_table(void *_table)
 {
-        unsigned int mask = (unsigned long) _table & 3;
+        unsigned int mask = (unsigned long) _table & 0x03U;
         void *table = (void *)((unsigned long) _table ^ mask);
         struct page *page = virt_to_page(table);
 
         switch (mask) {
-        case 0:         /* pmd, pud, or p4d */
+        case 0x00U:     /* pmd, pud, or p4d */
                 free_pages((unsigned long) table, 2);
                 break;
-        case 1:         /* lower 2K of a 4K page table */
-        case 2:         /* higher 2K of a 4K page table */
+        case 0x01U:     /* lower 2K of a 4K page table */
+        case 0x02U:     /* higher 2K of a 4K page table */
                 mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
                 mask >>= 24;
-                if (mask != 0)
+                if (mask != 0x00U)
                         break;
                 fallthrough;
-        case 3:         /* 4K page table with pgstes */
-                if (mask & 3)
-                        atomic_xor_bits(&page->_refcount, 3 << 24);
+        case 0x03U:     /* 4K page table with pgstes */
+                if (mask & 0x03U)
+                        atomic_xor_bits(&page->_refcount, 0x03U << 24);
                 pgtable_pte_page_dtor(page);
                 __free_page(page);
                 break;
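
page_table_free_rcu() and __tlb_remove_table() pass the fragment kind through bits 0-1 of the table pointer, which are free because every fragment is at least 2KB aligned. A stand-alone sketch of that encode/decode round trip (addresses and names assumed):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            /* 2KB alignment leaves bits 0-1 of the pointer free to carry
             * the kind: 0x00 pmd/pud/p4d, 0x01/0x02 lower/higher 2K half,
             * 0x03 full 4K page table with pgstes. */
            unsigned long table = 0x10800UL;        /* assumed: second 2K half */
            unsigned int bit = 1;

            /* Encode, as page_table_free_rcu() does before queueing. */
            unsigned long tagged = table | (0x01U << bit);

            /* Decode, as __tlb_remove_table() does after the grace period. */
            unsigned int mask = tagged & 0x03U;
            unsigned long orig = tagged ^ mask;

            assert(mask == 0x02U && orig == table);
            printf("tag %#x, table %#lx recovered\n", mask, orig);
            return 0;
    }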