Commit 92598ae2 authored by Linus Torvalds
Browse files

Merge tag 'x86_mm_for_v6.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Borislav Petkov:

 - Rename a PKRU macro to make more sense when reading the code

 - Update pkeys documentation

 - Avoid reading the contended mm's TLB generation variable unless
   absolutely necessary, and fix a case where arch_tlbbatch_flush()
   doesn't adhere to the generation scheme and thus violates the
   conditions that make skipping that read safe.

* tag 'x86_mm_for_v6.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/tlb: Ignore f->new_tlb_gen when zero
  x86/pkeys: Clarify PKRU_AD_KEY macro
  Documentation/protection-keys: Clean up documentation for User Space pkeys
  x86/mm/tlb: Avoid reading mm_tlb_gen when possible
parents 94e37e84 8f1d56f6
Loading
Loading
Loading
Loading
+21 −23
Original line number Original line Diff line number Diff line
@@ -4,31 +4,29 @@
Memory Protection Keys
Memory Protection Keys
======================
======================


Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature
Memory Protection Keys provide a mechanism for enforcing page-based
which is found on Intel's Skylake (and later) "Scalable Processor"
protections, but without requiring modification of the page tables when an
Server CPUs. It will be available in future non-server Intel parts
application changes protection domains.
and future AMD processors.


Pkeys Userspace (PKU) is a feature which can be found on:
For anyone wishing to test or use this feature, it is available in
        * Intel server CPUs, Skylake and later
Amazon's EC2 C5 instances and is known to work there using an Ubuntu
        * Intel client CPUs, Tiger Lake (11th Gen Core) and later
17.04 image.
        * Future AMD CPUs


Memory Protection Keys provides a mechanism for enforcing page-based
Pkeys work by dedicating 4 previously Reserved bits in each page table entry to
protections, but without requiring modification of the page tables
a "protection key", giving 16 possible keys.
when an application changes protection domains.  It works by

dedicating 4 previously ignored bits in each page table entry to a
Protections for each key are defined with a per-CPU user-accessible register
"protection key", giving 16 possible keys.
(PKRU).  Each of these is a 32-bit register storing two bits (Access Disable

and Write Disable) for each of 16 keys.
There is also a new user-accessible register (PKRU) with two separate

bits (Access Disable and Write Disable) for each key.  Being a CPU
Being a CPU register, PKRU is inherently thread-local, potentially giving each
register, PKRU is inherently thread-local, potentially giving each
thread a different set of protections from every other thread.
thread a different set of protections from every other thread.


There are two new instructions (RDPKRU/WRPKRU) for reading and writing
There are two instructions (RDPKRU/WRPKRU) for reading and writing to the
to the new register.  The feature is only available in 64-bit mode,
register.  The feature is only available in 64-bit mode, even though there is
even though there is theoretically space in the PAE PTEs.  These
theoretically space in the PAE PTEs.  These permissions are enforced on data
permissions are enforced on data access only and have no effect on
access only and have no effect on instruction fetches.
instruction fetches.


Syscalls
Syscalls
========
========
+1 −0
Original line number Original line Diff line number Diff line
@@ -16,6 +16,7 @@
void __flush_tlb_all(void);
void __flush_tlb_all(void);


#define TLB_FLUSH_ALL	-1UL
#define TLB_FLUSH_ALL	-1UL
#define TLB_GENERATION_INVALID	0


void cr4_update_irqsoff(unsigned long set, unsigned long clear);
void cr4_update_irqsoff(unsigned long set, unsigned long clear);
unsigned long cr4_read_shadow(void);
unsigned long cr4_read_shadow(void);
+9 −6
Original line number Original line Diff line number Diff line
@@ -110,7 +110,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
	return vma_pkey(vma);
	return vma_pkey(vma);
}
}


#define PKRU_AD_KEY(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
#define PKRU_AD_MASK(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))


/*
/*
 * Make the default PKRU value (at execve() time) as restrictive
 * Make the default PKRU value (at execve() time) as restrictive
@@ -118,11 +118,14 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
 * in the process's lifetime will not accidentally get access
 * in the process's lifetime will not accidentally get access
 * to data which is pkey-protected later on.
 * to data which is pkey-protected later on.
 */
 */
u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
u32 init_pkru_value = PKRU_AD_MASK( 1) | PKRU_AD_MASK( 2) |
		      PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
		      PKRU_AD_MASK( 3) | PKRU_AD_MASK( 4) |
		      PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
		      PKRU_AD_MASK( 5) | PKRU_AD_MASK( 6) |
		      PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
		      PKRU_AD_MASK( 7) | PKRU_AD_MASK( 8) |
		      PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
		      PKRU_AD_MASK( 9) | PKRU_AD_MASK(10) |
		      PKRU_AD_MASK(11) | PKRU_AD_MASK(12) |
		      PKRU_AD_MASK(13) | PKRU_AD_MASK(14) |
		      PKRU_AD_MASK(15);


static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
			     size_t count, loff_t *ppos)
			     size_t count, loff_t *ppos)
+28 −3
Original line number Original line Diff line number Diff line
@@ -734,10 +734,10 @@ static void flush_tlb_func(void *info)
	const struct flush_tlb_info *f = info;
	const struct flush_tlb_info *f = info;
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
	bool local = smp_processor_id() == f->initiating_cpu;
	bool local = smp_processor_id() == f->initiating_cpu;
	unsigned long nr_invalidate = 0;
	unsigned long nr_invalidate = 0;
	u64 mm_tlb_gen;


	/* This code cannot presently handle being reentered. */
	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());
	VM_WARN_ON(!irqs_disabled());
@@ -771,6 +771,23 @@ static void flush_tlb_func(void *info)
		return;
		return;
	}
	}


	if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
		     f->new_tlb_gen <= local_tlb_gen)) {
		/*
		 * The TLB is already up to date with respect to f->new_tlb_gen.
		 * While the core might still be behind mm_tlb_gen, checking
		 * mm_tlb_gen unnecessarily would have negative caching effects
		 * so avoid it.
		 */
		return;
	}

	/*
	 * Defer mm_tlb_gen reading as long as possible to avoid cache
	 * contention.
	 */
	mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);

	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		/*
		 * There's nothing to do: we're already up to date.  This can
		 * There's nothing to do: we're already up to date.  This can
@@ -827,6 +844,12 @@ static void flush_tlb_func(void *info)
		/* Partial flush */
		/* Partial flush */
		unsigned long addr = f->start;
		unsigned long addr = f->start;


		/* Partial flush cannot have invalid generations */
		VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);

		/* Partial flush must have valid mm */
		VM_WARN_ON(f->mm == NULL);

		nr_invalidate = (f->end - f->start) >> f->stride_shift;
		nr_invalidate = (f->end - f->start) >> f->stride_shift;


		while (addr < f->end) {
		while (addr < f->end) {
@@ -1029,7 +1052,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
		struct flush_tlb_info *info;
		struct flush_tlb_info *info;


		preempt_disable();
		preempt_disable();
		info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
		info = get_flush_tlb_info(NULL, start, end, 0, false,
					  TLB_GENERATION_INVALID);


		on_each_cpu(do_kernel_range_flush, info, 1);
		on_each_cpu(do_kernel_range_flush, info, 1);


@@ -1198,7 +1222,8 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)


	int cpu = get_cpu();
	int cpu = get_cpu();


	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
				  TLB_GENERATION_INVALID);
	/*
	/*
	 * flush_tlb_multi() is not optimized for the common case in which only
	 * flush_tlb_multi() is not optimized for the common case in which only
	 * a local TLB flush is needed. Optimize this use-case by calling
	 * a local TLB flush is needed. Optimize this use-case by calling