Commit 51d3d5eb authored by David Hildenbrand, committed by Andrew Morton
Browse files

mm/userfaultfd: enable writenotify while userfaultfd-wp is enabled for a VMA

Currently, we don't enable writenotify when enabling userfaultfd-wp on a
shared writable mapping (for now only shmem and hugetlb).  The consequence
is that vma->vm_page_prot will still include write permissions, to be set
as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting,
page migration, ...).

So far, vma->vm_page_prot is assumed to be a safe default, meaning that we
only add permissions (e.g., mkwrite) but not remove permissions (e.g.,
wrprotect).  For example, when enabling softdirty tracking, we enable
writenotify.  With uffd-wp on shared mappings, that changed.  More details
on vma->vm_page_prot semantics were summarized in [1].

This is problematic for uffd-wp: we'd have to manually check for uffd-wp
PTEs/PMDs and manually write-protect PTEs/PMDs, which is error-prone.
Prone to such issues is any code that uses vma->vm_page_prot to set PTE
permissions: primarily pte_modify() and mk_pte().

Instead, let's enable writenotify such that PTEs/PMDs/...  will be mapped
write-protected as default and we will only allow selected PTEs that are
definitely safe to be mapped without write-protection (see
can_change_pte_writable()) to be writable.  In the future, we might want
to enable write-bit recovery -- e.g., can_change_pte_writable() -- at more
locations, for example, also when removing uffd-wp protection.

This fixes two known cases:

(a) remove_migration_pte() mapping uffd-wp'ed PTEs writable, resulting
    in uffd-wp not triggering on write access.
(b) do_numa_page() / do_huge_pmd_numa_page() mapping uffd-wp'ed PTEs/PMDs
    writable, resulting in uffd-wp not triggering on write access.

Note that do_numa_page() / do_huge_pmd_numa_page() can be reached even
without NUMA hinting (which currently doesn't seem to be applicable to
shmem), for example, by using uffd-wp with a PROT_WRITE shmem VMA.  On
such a VMA, userfaultfd-wp is currently non-functional.

Note that when enabling userfaultfd-wp, there is no need to walk page
tables to enforce the new default protection for the PTEs: we know that
they cannot be uffd-wp'ed yet, because that can only happen after enabling
uffd-wp for the VMA in general.

Also note that this makes mprotect() on ranges with uffd-wp'ed PTEs not
accidentally set the write bit -- which would result in uffd-wp not
triggering on later write access.  This commit makes uffd-wp on shmem
behave just like uffd-wp on anonymous memory in that regard, even though
mixing mprotect with uffd-wp is controversial.

[1] https://lkml.kernel.org/r/92173bad-caa3-6b43-9d1e-9a471fdbc184@redhat.com

Link: https://lkml.kernel.org/r/20221209080912.7968-1-david@redhat.com


Fixes: b1f9e876 ("mm/uffd: enable write protection for shmem & hugetlbfs")
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: Ives van Hoorne <ives@codesandbox.io>
Debugged-by: Peter Xu <peterx@redhat.com>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent ab0c3f12
Loading
Loading
Loading
Loading
+22 −6
Original line number Original line Diff line number Diff line
@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
	return ctx->features & UFFD_FEATURE_INITIALIZED;
	return ctx->features & UFFD_FEATURE_INITIALIZED;
}
}


/*
 * Install @flags as the new vma->vm_flags and, for shared mappings, refresh
 * vma->vm_page_prot whenever the VM_UFFD_WP bit is toggled by the update.
 *
 * vma_wants_writenotify() requests writenotify while userfaultfd-wp is
 * enabled, so the default page protection has to be recalculated each time
 * userfaultfd-wp is switched on or off for the VMA.
 */
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t flags)
{
	/* Sample the difference before vm_flags is overwritten below. */
	const bool wp_toggled = ((vma->vm_flags ^ flags) & VM_UFFD_WP) != 0;

	vma->vm_flags = flags;

	/* Nothing to recompute unless userfaultfd-wp itself changed. */
	if (!wp_toggled)
		return;

	if (vma->vm_flags & VM_SHARED)
		vma_set_page_prot(vma);
}

static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
				     int wake_flags, void *key)
{
{
@@ -618,7 +633,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
		for_each_vma(vmi, vma) {
		for_each_vma(vmi, vma) {
			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				vma->vm_flags &= ~__VM_UFFD_FLAGS;
				userfaultfd_set_vm_flags(vma,
							 vma->vm_flags & ~__VM_UFFD_FLAGS);
			}
			}
		}
		}
		mmap_write_unlock(mm);
		mmap_write_unlock(mm);
@@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
	octx = vma->vm_userfaultfd_ctx.ctx;
	octx = vma->vm_userfaultfd_ctx.ctx;
	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~__VM_UFFD_FLAGS;
		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
		return 0;
		return 0;
	}
	}


@@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
	} else {
	} else {
		/* Drop uffd context if remap feature not enabled */
		/* Drop uffd context if remap feature not enabled */
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~__VM_UFFD_FLAGS;
		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
	}
	}
}
}


@@ -895,7 +911,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
			prev = vma;
			prev = vma;
		}
		}


		vma->vm_flags = new_flags;
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
	}
	}
	mmap_write_unlock(mm);
	mmap_write_unlock(mm);
@@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
		 * the next vma was merged into the current one and
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 * the current one has not been updated yet.
		 */
		 */
		vma->vm_flags = new_flags;
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx.ctx = ctx;
		vma->vm_userfaultfd_ctx.ctx = ctx;


		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
@@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
		 * the next vma was merged into the current one and
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 * the current one has not been updated yet.
		 */
		 */
		vma->vm_flags = new_flags;
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;


	skip:
	skip:
+4 −0
Original line number Original line Diff line number Diff line
@@ -1524,6 +1524,10 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
	if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
	if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
		return 1;
		return 1;


	/* Do we need write faults for uffd-wp tracking? */
	if (userfaultfd_wp(vma))
		return 1;

	/* Specialty mapping? */
	/* Specialty mapping? */
	if (vm_flags & VM_PFNMAP)
	if (vm_flags & VM_PFNMAP)
		return 0;
		return 0;