Commit 1194372d authored by Alexander Gordeev, committed by Heiko Carstens
Browse files

s390/mm: better annotate 2KB pagetable fragments handling



Explicitly encode immediate value of pending remove nibble
(bits 31-28) and tracking nibble (bits 27-24) of the page
refcount whenever these nibbles are tested or changed, for
better readability. Also, add some comments describing how
the fragments are handled.

Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
parent c2c22493
Loading
Loading
Loading
Loading
+107 −20
Original line number Diff line number Diff line
@@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page)
#endif /* CONFIG_PGSTE */

/*
 * page table entry allocation/free routines.
 * A 2KB-pgtable is either upper or lower half of a normal page.
 * The second half of the page may be unused or used as another
 * 2KB-pgtable.
 *
 * Whenever possible the parent page for a new 2KB-pgtable is picked
 * from the list of partially allocated pages mm_context_t::pgtable_list.
 * In case the list is empty a new parent page is allocated and added to
 * the list.
 *
 * When a parent page gets fully allocated it contains 2KB-pgtables in both
 * upper and lower halves and is removed from mm_context_t::pgtable_list.
 *
 * When a 2KB-pgtable is freed from a fully allocated parent page, that
 * page turns partially allocated and is added to mm_context_t::pgtable_list.
 *
 * If 2KB-pgtable is freed from the partially allocated parent page that
 * page turns unused and gets removed from mm_context_t::pgtable_list.
 * Furthermore, the unused parent page is released.
 *
 * As follows from the above, no unallocated or fully allocated parent
 * pages are contained in mm_context_t::pgtable_list.
 *
 * The upper byte (bits 24-31) of the parent page _refcount is used
 * for tracking contained 2KB-pgtables and has the following format:
 *
 *   PP  AA
 * 01234567    upper byte (bits 24-31) of struct page::_refcount
 *   ||  ||
 *   ||  |+--- upper 2KB-pgtable is allocated
 *   ||  +---- lower 2KB-pgtable is allocated
 *   |+------- upper 2KB-pgtable is pending for removal
 *   +-------- lower 2KB-pgtable is pending for removal
 *
 * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
 * using _refcount is possible).
 *
 * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
 * The parent page is either:
 *   - added to mm_context_t::pgtable_list in case the second half of the
 *     parent page is still unallocated;
 *   - removed from mm_context_t::pgtable_list in case both halves of the
 *     parent page are allocated;
 * These operations are protected with mm_context_t::lock.
 *
 * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
 * and the corresponding PP bit is set to 1 in a single atomic operation.
 * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
 * exclusive and may never be both set to 1!
 * The parent page is either:
 *   - added to mm_context_t::pgtable_list in case the second half of the
 *     parent page is still allocated;
 *   - removed from mm_context_t::pgtable_list in case the second half of
 *     the parent page is unallocated;
 * These operations are protected with mm_context_t::lock.
 *
 * It is important to understand that mm_context_t::lock only protects
 * mm_context_t::pgtable_list and AA bits, but not the parent page itself
 * and PP bits.
 *
 * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
 * while both AA bits and the second PP bit are already unset. Then the
 * parent page does not contain any 2KB-pgtable fragment anymore, and it has
 * also been removed from mm_context_t::pgtable_list. It is safe to release
 * the page therefore.
 *
 * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
 * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
 * while the PP bits are never used, nor is such a page added to or removed
 * from mm_context_t::pgtable_list.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
@@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
			page = list_first_entry(&mm->context.pgtable_list,
						struct page, lru);
			mask = atomic_read(&page->_refcount) >> 24;
			mask = (mask | (mask >> 4)) & 3;
			if (mask != 3) {
			/*
			 * The pending removal bits must also be checked.
			 * Failure to do so might lead to an impossible
			 * value (e.g. 0x13 or 0x23) written to _refcount.
			 * Such values violate the assumption that pending and
			 * allocation bits are mutually exclusive, and the rest
			 * of the code derails as a result. That could lead to
			 * a whole bunch of races and corruptions.
			 */
			mask = (mask | (mask >> 4)) & 0x03U;
			if (mask != 0x03U) {
				table = (unsigned long *) page_to_virt(page);
				bit = mask & 1;		/* =1 -> second 2K */
				if (bit)
					table += PTRS_PER_PTE;
				atomic_xor_bits(&page->_refcount,
							1U << (bit + 24));
							0x01U << (bit + 24));
				list_del(&page->lru);
			}
		}
@@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
	table = (unsigned long *) page_to_virt(page);
	if (mm_alloc_pgste(mm)) {
		/* Return 4K page table with PGSTEs */
		atomic_xor_bits(&page->_refcount, 3 << 24);
		atomic_xor_bits(&page->_refcount, 0x03U << 24);
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_xor_bits(&page->_refcount, 1 << 24);
		atomic_xor_bits(&page->_refcount, 0x01U << 24);
		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
		spin_lock_bh(&mm->context.lock);
		list_add(&page->lru, &mm->context.pgtable_list);
@@ -244,19 +321,24 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
		/* Free 2K page table fragment of a 4K page */
		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
		spin_lock_bh(&mm->context.lock);
		/*
		 * Mark the page for delayed release. The actual release
		 * will happen outside of the critical section from this
		 * function or from __tlb_remove_table()
		 */
		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
		mask >>= 24;
		if (mask & 3)
		if (mask & 0x03U)
			list_add(&page->lru, &mm->context.pgtable_list);
		else
			list_del(&page->lru);
		spin_unlock_bh(&mm->context.lock);
		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
		mask >>= 24;
		if (mask != 0)
		if (mask != 0x00U)
			return;
	} else {
		atomic_xor_bits(&page->_refcount, 3U << 24);
		atomic_xor_bits(&page->_refcount, 0x03U << 24);
	}

	pgtable_pte_page_dtor(page);
@@ -274,43 +356,48 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
	page = virt_to_page(table);
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) ((unsigned long)table | 3);
		table = (unsigned long *) ((unsigned long)table | 0x03U);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
	spin_lock_bh(&mm->context.lock);
	/*
	 * Mark the page for delayed release. The actual release will happen
	 * outside of the critical section from __tlb_remove_table() or from
	 * page_table_free()
	 */
	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
	mask >>= 24;
	if (mask & 3)
	if (mask & 0x03U)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	else
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.lock);
	table = (unsigned long *) ((unsigned long) table | (1U << bit));
	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
	tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 3;
	unsigned int mask = (unsigned long) _table & 0x03U;
	void *table = (void *)((unsigned long) _table ^ mask);
	struct page *page = virt_to_page(table);

	switch (mask) {
	case 0:		/* pmd, pud, or p4d */
	case 0x00U:	/* pmd, pud, or p4d */
		free_pages((unsigned long) table, 2);
		break;
	case 1:		/* lower 2K of a 4K page table */
	case 2:		/* higher 2K of a 4K page table */
	case 0x01U:	/* lower 2K of a 4K page table */
	case 0x02U:	/* higher 2K of a 4K page table */
		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
		mask >>= 24;
		if (mask != 0)
		if (mask != 0x00U)
			break;
		fallthrough;
	case 3:		/* 4K page table with pgstes */
		if (mask & 3)
			atomic_xor_bits(&page->_refcount, 3 << 24);
	case 0x03U:	/* 4K page table with pgstes */
		if (mask & 0x03U)
			atomic_xor_bits(&page->_refcount, 0x03U << 24);
		pgtable_pte_page_dtor(page);
		__free_page(page);
		break;