Merge tag 'x86_mm_for_v6.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (92598ae2) · Commits · jan.koester / Linux

Documentation/core-api/protection-keys.rst

+21 −23

Original line number	Diff line number	Diff line
		@@ -4,31 +4,29 @@
		Memory Protection Keys
		======================

		Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature
		which is found on Intel's Skylake (and later) "Scalable Processor"
		Server CPUs. It will be available in future non-server Intel parts
		and future AMD processors.

		For anyone wishing to test or use this feature, it is available in
		Amazon's EC2 C5 instances and is known to work there using an Ubuntu
		17.04 image.

		Memory Protection Keys provides a mechanism for enforcing page-based
		protections, but without requiring modification of the page tables
		when an application changes protection domains. It works by
		dedicating 4 previously ignored bits in each page table entry to a
		"protection key", giving 16 possible keys.

		There is also a new user-accessible register (PKRU) with two separate
		bits (Access Disable and Write Disable) for each key. Being a CPU
		register, PKRU is inherently thread-local, potentially giving each
		Memory Protection Keys provide a mechanism for enforcing page-based
		protections, but without requiring modification of the page tables when an
		application changes protection domains.

		Pkeys Userspace (PKU) is a feature which can be found on:
		* Intel server CPUs, Skylake and later
		* Intel client CPUs, Tiger Lake (11th Gen Core) and later
		* Future AMD CPUs

		Pkeys work by dedicating 4 previously Reserved bits in each page table entry to
		a "protection key", giving 16 possible keys.

		Protections for each key are defined with a per-CPU user-accessible register
		(PKRU). Each of these is a 32-bit register storing two bits (Access Disable
		and Write Disable) for each of 16 keys.

		Being a CPU register, PKRU is inherently thread-local, potentially giving each
		thread a different set of protections from every other thread.

		There are two new instructions (RDPKRU/WRPKRU) for reading and writing
		to the new register. The feature is only available in 64-bit mode,
		even though there is theoretically space in the PAE PTEs. These
		permissions are enforced on data access only and have no effect on
		instruction fetches.
		There are two instructions (RDPKRU/WRPKRU) for reading and writing to the
		register. The feature is only available in 64-bit mode, even though there is
		theoretically space in the PAE PTEs. These permissions are enforced on data
		access only and have no effect on instruction fetches.

		Syscalls
		========

arch/x86/include/asm/tlbflush.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -16,6 +16,7 @@
		void __flush_tlb_all(void);

		#define TLB_FLUSH_ALL -1UL
		#define TLB_GENERATION_INVALID 0

		void cr4_update_irqsoff(unsigned long set, unsigned long clear);
		unsigned long cr4_read_shadow(void);

arch/x86/mm/pkeys.c

+9 −6

Original line number	Diff line number	Diff line
		@@ -110,7 +110,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
		return vma_pkey(vma);
		}

		#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
		#define PKRU_AD_MASK(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))

		/*
		* Make the default PKRU value (at execve() time) as restrictive
		@@ -118,11 +118,14 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
		* in the process's lifetime will not accidentally get access
		* to data which is pkey-protected later on.
		*/
		u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
		PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
		PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
		PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
		PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
		u32 init_pkru_value = PKRU_AD_MASK( 1) | PKRU_AD_MASK( 2) |
		PKRU_AD_MASK( 3) | PKRU_AD_MASK( 4) |
		PKRU_AD_MASK( 5) | PKRU_AD_MASK( 6) |
		PKRU_AD_MASK( 7) | PKRU_AD_MASK( 8) |
		PKRU_AD_MASK( 9) | PKRU_AD_MASK(10) |
		PKRU_AD_MASK(11) | PKRU_AD_MASK(12) |
		PKRU_AD_MASK(13) | PKRU_AD_MASK(14) |
		PKRU_AD_MASK(15);

		static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
		size_t count, loff_t *ppos)

arch/x86/mm/tlb.c

+28 −3

Original line number	Diff line number	Diff line
		@@ -734,10 +734,10 @@ static void flush_tlb_func(void *info)
		const struct flush_tlb_info *f = info;
		struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
		u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
		u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
		u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
		bool local = smp_processor_id() == f->initiating_cpu;
		unsigned long nr_invalidate = 0;
		u64 mm_tlb_gen;

		/* This code cannot presently handle being reentered. */
		VM_WARN_ON(!irqs_disabled());
		@@ -771,6 +771,23 @@ static void flush_tlb_func(void *info)
		return;
		}

		if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
		f->new_tlb_gen <= local_tlb_gen)) {
		/*
		* The TLB is already up to date in respect to f->new_tlb_gen.
		* While the core might be still behind mm_tlb_gen, checking
		* mm_tlb_gen unnecessarily would have negative caching effects
		* so avoid it.
		*/
		return;
		}

		/*
		* Defer mm_tlb_gen reading as long as possible to avoid cache
		* contention.
		*/
		mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);

		if (unlikely(local_tlb_gen == mm_tlb_gen)) {
		/*
		* There's nothing to do: we're already up to date. This can
		@@ -827,6 +844,12 @@ static void flush_tlb_func(void *info)
		/* Partial flush */
		unsigned long addr = f->start;

		/* Partial flush cannot have invalid generations */
		VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);

		/* Partial flush must have valid mm */
		VM_WARN_ON(f->mm == NULL);

		nr_invalidate = (f->end - f->start) >> f->stride_shift;

		while (addr < f->end) {
		@@ -1029,7 +1052,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
		struct flush_tlb_info *info;

		preempt_disable();
		info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
		info = get_flush_tlb_info(NULL, start, end, 0, false,
		TLB_GENERATION_INVALID);

		on_each_cpu(do_kernel_range_flush, info, 1);

		@@ -1198,7 +1222,8 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)

		int cpu = get_cpu();

		info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
		info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
		TLB_GENERATION_INVALID);
		/*
		* flush_tlb_multi() is not optimized for the common case in which only
		* a local TLB flush is needed. Optimize this use-case by calling