iommu/vt-d: Add basic SVM PASID support (2f26e0a9) · Commits · jan.koester / Linux

drivers/iommu/Kconfig

+1 −0

Original line number	Diff line number	Diff line
		@@ -139,6 +139,7 @@ config INTEL_IOMMU_SVM
		bool "Support for Shared Virtual Memory with Intel IOMMU"
		depends on INTEL_IOMMU && X86
		select PCI_PASID
		select MMU_NOTIFIER
		help
		Shared Virtual Memory (SVM) provides a facility for devices
		to access DMA resources through process address space by

drivers/iommu/intel-iommu.c

+104 −0

Original line number	Diff line number	Diff line
		@@ -4929,6 +4929,110 @@ static void intel_iommu_remove_device(struct device *dev)
		iommu_device_unlink(iommu->iommu_dev, dev);
		}

		#ifdef CONFIG_INTEL_IOMMU_SVM
		int intel_iommu_enable_pasid(struct intel_iommu iommu, struct intel_svm_dev sdev)
		{
		struct device_domain_info *info;
		struct context_entry *context;
		struct dmar_domain *domain;
		unsigned long flags;
		u64 ctx_lo;
		int ret;

		domain = get_valid_domain_for_dev(sdev->dev);
		if (!domain)
		return -EINVAL;

		spin_lock_irqsave(&device_domain_lock, flags);
		spin_lock(&iommu->lock);

		ret = -EINVAL;
		info = sdev->dev->archdata.iommu;
		if (!info \|\| !info->pasid_supported)
		goto out;

		context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
		if (WARN_ON(!context))
		goto out;

		ctx_lo = context[0].lo;

		sdev->did = domain->iommu_did[iommu->seq_id];
		sdev->sid = PCI_DEVID(info->bus, info->devfn);

		if (!(ctx_lo & CONTEXT_PASIDE)) {
		context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) \| ecap_pss(iommu->ecap);
		wmb();
		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		* extended to permit requests-with-PASID if the PASIDE bit
		* is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
		* however, the PASIDE bit is ignored and requests-with-PASID
		* are unconditionally blocked. Which makes less sense.
		* So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		* "guest mode" translation types depending on whether ATS
		* is available or not. Annoyingly, we can't use the new
		* modes unless PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
		ctx_lo &= ~CONTEXT_TT_MASK;
		if (info->ats_supported)
		ctx_lo \|= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
		else
		ctx_lo \|= CONTEXT_TT_PT_PASID << 2;
		}
		ctx_lo \|= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
		DMA_CCMD_MASK_NOBIT,
		DMA_CCMD_DEVICE_INVL);
		}

		/* Enable PASID support in the device, if it wasn't already */
		if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

		if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
		sdev->qdep = 0;
		}
		ret = 0;

		out:
		spin_unlock(&iommu->lock);
		spin_unlock_irqrestore(&device_domain_lock, flags);

		return ret;
		}

		/*
		 * Look up the IOMMU that serves @dev for SVM purposes.
		 *
		 * Returns the IOMMU, or NULL (after logging why) when the device is not
		 * translated, has no IOMMU, or its IOMMU has no PASID table allocated.
		 */
		struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
		{
			struct intel_iommu *iommu;
			u8 bus, devfn;

			if (iommu_dummy(dev)) {
				dev_warn(dev,
					 "No IOMMU translation for device; cannot enable SVM\n");
				return NULL;
			}

			/* bus/devfn are required out-parameters; they are not used here. */
			iommu = device_to_iommu(dev, &bus, &devfn);
			if (!iommu) {
				dev_dbg(dev, "No IOMMU for device; cannot enable SVM\n");
				return NULL;
			}

			if (!iommu->pasid_table) {
				dev_dbg(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
				return NULL;
			}

			return iommu;
		}
		#endif /* CONFIG_INTEL_IOMMU_SVM */

		static const struct iommu_ops intel_iommu_ops = {
		.capable = intel_iommu_capable,
		.domain_alloc = intel_iommu_domain_alloc,

drivers/iommu/intel-svm.c

+291 −0

Original line number	Diff line number	Diff line
		@@ -14,6 +14,17 @@
		*/

		#include <linux/intel-iommu.h>
		#include <linux/mmu_notifier.h>
		#include <linux/sched.h>
		#include <linux/slab.h>
		#include <linux/intel-svm.h>
		#include <linux/rculist.h>
		#include <linux/pci.h>
		#include <linux/pci-ats.h>

		/*
		 * One slot of an IOMMU's PASID table, indexed by PASID. The bind path in
		 * this file stores the physical address of the bound mm's pgd OR'd with 1
		 * (presumably a present/valid bit -- confirm against the VT-d spec), and
		 * the mm release callback clears the slot back to 0.
		 */
		struct pasid_entry {
		u64 val;
		};

		int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu)
		{
		@@ -42,6 +53,8 @@ int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu)
		iommu->name);
		}

		idr_init(&iommu->pasid_idr);

		return 0;
		}

		@@ -61,5 +74,283 @@ int intel_svm_free_pasid_tables(struct intel_iommu *iommu)
		free_pages((unsigned long)iommu->pasid_state_table, order);
		iommu->pasid_state_table = NULL;
		}
		idr_destroy(&iommu->pasid_idr);
		return 0;
		}

		static void intel_flush_svm_range_dev (struct intel_svm svm, struct intel_svm_dev sdev,
		unsigned long address, int pages, int ih)
		{
		struct qi_desc desc;
		int mask = ilog2(__roundup_pow_of_two(pages));

		if (pages == -1 \|\| !cap_pgsel_inv(svm->iommu->cap) \|\|
		mask > cap_max_amask_val(svm->iommu->cap)) {
		desc.low = QI_EIOTLB_PASID(svm->pasid) \| QI_EIOTLB_DID(sdev->did) \|
		QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) \| QI_EIOTLB_TYPE;
		desc.high = 0;
		} else {
		desc.low = QI_EIOTLB_PASID(svm->pasid) \| QI_EIOTLB_DID(sdev->did) \|
		QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) \| QI_EIOTLB_TYPE;
		desc.high = QI_EIOTLB_ADDR(address) \| QI_EIOTLB_GL(1) \|
		QI_EIOTLB_IH(ih) \| QI_EIOTLB_AM(mask);
		}

		qi_submit_sync(&desc, svm->iommu);

		if (sdev->dev_iotlb) {
		desc.low = QI_DEV_EIOTLB_PASID(svm->pasid) \| QI_DEV_EIOTLB_SID(sdev->sid) \|
		QI_DEV_EIOTLB_QDEP(sdev->qdep) \| QI_DEIOTLB_TYPE;
		if (mask) {
		unsigned long adr, delta;

		/* Least significant zero bits in the address indicate the
		* range of the request. So mask them out according to the
		* size. */
		adr = address & ((1<<(VTD_PAGE_SHIFT + mask)) - 1);

		/* Now ensure that we round down further if the original
		* request was not aligned w.r.t. its size */
		delta = address - adr;
		if (delta + (pages << VTD_PAGE_SHIFT) >= (1 << (VTD_PAGE_SHIFT + mask)))
		adr &= ~(1 << (VTD_PAGE_SHIFT + mask));
		desc.high = QI_DEV_EIOTLB_ADDR(adr) \| QI_DEV_EIOTLB_SIZE;
		} else {
		desc.high = QI_DEV_EIOTLB_ADDR(address);
		}
		qi_submit_sync(&desc, svm->iommu);
		}
		}

		static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
		int pages, int ih)
		{
		struct intel_svm_dev *sdev;

		rcu_read_lock();
		list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
		rcu_read_unlock();
		}

		/*
		 * mmu_notifier .change_pte callback: flush the single page at @address
		 * for every device bound to the notifier's intel_svm. @mm and @pte are
		 * not used.
		 */
		static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
					     unsigned long address, pte_t pte)
		{
			struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

			intel_flush_svm_range(svm, address, 1, 1);
		}

		/*
		 * mmu_notifier .invalidate_page callback: flush the single page at
		 * @address for every device bound to the notifier's intel_svm. @mm is
		 * not used.
		 */
		static void intel_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
						  unsigned long address)
		{
			struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

			intel_flush_svm_range(svm, address, 1, 1);
		}

		/*
		 * mmu_notifier .invalidate_range callback. Pages have been freed at this
		 * point. Flushes the pages spanning [start, end), rounding the length up
		 * to a whole page, with the invalidation hint cleared.
		 */
		static void intel_invalidate_range(struct mmu_notifier *mn,
						   struct mm_struct *mm,
						   unsigned long start, unsigned long end)
		{
			struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
			unsigned long nr_pages;

			nr_pages = (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT;
			intel_flush_svm_range(svm, start, nr_pages, 0);
		}


		static void intel_flush_pasid_dev(struct intel_svm svm, struct intel_svm_dev sdev)
		{
		struct qi_desc desc;

		desc.high = 0;
		desc.low = QI_PC_TYPE \| QI_PC_DID(sdev->did) \| QI_PC_PASID_SEL \| QI_PC_PASID(svm->pasid);

		qi_submit_sync(&desc, svm->iommu);
		}

		/*
		 * mmu_notifier .release callback: clear the PASID table entry for the
		 * notifier's intel_svm so the PASID no longer points at this mm's page
		 * tables. @mm is not used.
		 */
		static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
		{
			struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

			svm->iommu->pasid_table[svm->pasid].val = 0;

			/* There's no need to do any flush because we can't get here if there
			 * are any devices left anyway. */
			WARN_ON(!list_empty(&svm->devs));
		}

		/*
		 * MMU notifier callbacks installed on every intel_svm by the bind path.
		 * The three invalidation hooks flush the affected range on all bound
		 * devices; .release clears the PASID table entry.
		 */
		static const struct mmu_notifier_ops intel_mmuops = {
		.release = intel_mm_release,
		.change_pte = intel_change_pte,
		.invalidate_page = intel_invalidate_page,
		.invalidate_range = intel_invalidate_range,
		};

		/* Held by intel_svm_bind_mm() and intel_svm_unbind_mm() around their
		 * PASID IDR lookups/updates and changes to an intel_svm's device list. */
		static DEFINE_MUTEX(pasid_mutex);

		int intel_svm_bind_mm(struct device dev, int pasid)
		{
		struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
		struct intel_svm_dev *sdev;
		struct intel_svm *svm = NULL;
		int pasid_max;
		int ret;

		BUG_ON(pasid && !current->mm);

		if (WARN_ON(!iommu))
		return -EINVAL;

		if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
		return -EINVAL;
		} else
		pasid_max = 1 << 20;

		mutex_lock(&pasid_mutex);
		if (pasid) {
		int i;

		idr_for_each_entry(&iommu->pasid_idr, svm, i) {
		if (svm->mm != current->mm)
		continue;

		if (svm->pasid >= pasid_max) {
		dev_warn(dev,
		"Limited PASID width. Cannot use existing PASID %d\n",
		svm->pasid);
		ret = -ENOSPC;
		goto out;
		}

		list_for_each_entry(sdev, &svm->devs, list) {
		if (dev == sdev->dev) {
		sdev->users++;
		goto success;
		}
		}

		break;
		}
		}

		sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
		if (!sdev) {
		ret = -ENOMEM;
		goto out;
		}
		sdev->dev = dev;

		ret = intel_iommu_enable_pasid(iommu, sdev);
		if (ret \|\| !pasid) {
		/* If they don't actually want to assign a PASID, this is
		* just an enabling check/preparation. */
		kfree(sdev);
		goto out;
		}
		/* Finish the setup now we know we're keeping it */
		sdev->users = 1;
		init_rcu_head(&sdev->rcu);

		if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
		ret = -ENOMEM;
		kfree(sdev);
		goto out;
		}
		svm->iommu = iommu;

		if (pasid_max > 2 << ecap_pss(iommu->ecap))
		pasid_max = 2 << ecap_pss(iommu->ecap);

		ret = idr_alloc(&iommu->pasid_idr, svm, 0, pasid_max - 1,
		GFP_KERNEL);
		if (ret < 0) {
		kfree(svm);
		goto out;
		}
		svm->pasid = ret;
		svm->notifier.ops = &intel_mmuops;
		svm->mm = get_task_mm(current);
		INIT_LIST_HEAD_RCU(&svm->devs);
		ret = -ENOMEM;
		if (!svm->mm \|\| (ret = mmu_notifier_register(&svm->notifier, svm->mm))) {
		idr_remove(&svm->iommu->pasid_idr, svm->pasid);
		kfree(svm);
		kfree(sdev);
		goto out;
		}
		iommu->pasid_table[svm->pasid].val = (u64)__pa(svm->mm->pgd) \| 1;
		wmb();
		}
		list_add_rcu(&sdev->list, &svm->devs);

		success:
		*pasid = svm->pasid;
		ret = 0;
		out:
		mutex_unlock(&pasid_mutex);
		return ret;
		}
		EXPORT_SYMBOL_GPL(intel_svm_bind_mm);

		int intel_svm_unbind_mm(struct device *dev, int pasid)
		{
		struct intel_svm_dev *sdev;
		struct intel_iommu *iommu;
		struct intel_svm *svm;
		int ret = -EINVAL;

		mutex_lock(&pasid_mutex);
		iommu = intel_svm_device_to_iommu(dev);
		if (!iommu \|\| !iommu->pasid_table)
		goto out;

		svm = idr_find(&iommu->pasid_idr, pasid);
		if (!svm)
		goto out;

		list_for_each_entry(sdev, &svm->devs, list) {
		if (dev == sdev->dev) {
		ret = 0;
		sdev->users--;
		if (!sdev->users) {
		list_del_rcu(&sdev->list);
		/* Flush the PASID cache and IOTLB for this device.
		* Note that we do depend on the hardware not using
		* the PASID any more. Just as we depend on other
		* devices never using PASIDs that they have no right
		* to use. We have a shared PASID table, because it's
		* large and has to be physically contiguous. So it's
		* hard to be as defensive as we might like. */
		intel_flush_pasid_dev(svm, sdev);
		intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
		kfree_rcu(sdev, rcu);

		if (list_empty(&svm->devs)) {
		mmu_notifier_unregister(&svm->notifier, svm->mm);

		idr_remove(&svm->iommu->pasid_idr, svm->pasid);
		mmput(svm->mm);
		/* We mandate that no page faults may be outstanding
		* for the PASID when intel_svm_unbind_mm() is called.
		* If that is not obeyed, subtle errors will happen.
		* Let's make them less subtle... */
		memset(svm, 0x6b, sizeof(*svm));
		kfree(svm);
		}
		}
		break;
		}
		}
		out:
		mutex_unlock(&pasid_mutex);

		return ret;
		}
		EXPORT_SYMBOL_GPL(intel_svm_unbind_mm);

include/linux/dma_remapping.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -20,6 +20,13 @@
		#define CONTEXT_TT_MULTI_LEVEL 0
		#define CONTEXT_TT_DEV_IOTLB 1
		#define CONTEXT_TT_PASS_THROUGH 2
		/* Extended context entry types */
		/* Translation-type values; users shift them left by 2 before placing
		 * them in the low word of a context entry, matching CONTEXT_TT_MASK. */
		#define CONTEXT_TT_PT_PASID 4
		#define CONTEXT_TT_PT_PASID_DEV_IOTLB 5
		#define CONTEXT_TT_MASK (7ULL << 2)

		/* Flag bits in the low word of a context entry. CONTEXT_PASIDE is the
		 * bit tested and set when a device is enabled for requests-with-PASID. */
		#define CONTEXT_PRS (1ULL << 9)
		#define CONTEXT_PASIDE (1ULL << 11)

		struct intel_iommu;
		struct dmar_domain;

include/linux/intel-iommu.h

+63 −5

Original line number	Diff line number	Diff line
		/*
		* Copyright (c) 2006, Intel Corporation.
		* Copyright © 2006-2015, Intel Corporation.
		*
		* Authors: Ashok Raj <ashok.raj@intel.com>
		* Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
		* David Woodhouse <David.Woodhouse@intel.com>
		*
		* This program is free software; you can redistribute it and/or modify it
		* under the terms and conditions of the GNU General Public License,
		@@ -13,10 +17,6 @@
		* You should have received a copy of the GNU General Public License along with
		* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
		* Place - Suite 330, Boston, MA 02111-1307 USA.
		*
		* Copyright (C) 2006-2008 Intel Corporation
		* Author: Ashok Raj <ashok.raj@intel.com>
		* Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
		*/

		#ifndef _INTEL_IOMMU_H_
		@@ -25,7 +25,10 @@
		#include <linux/types.h>
		#include <linux/iova.h>
		#include <linux/io.h>
		#include <linux/idr.h>
		#include <linux/dma_remapping.h>
		#include <linux/mmu_notifier.h>
		#include <linux/list.h>
		#include <asm/cacheflush.h>
		#include <asm/iommu.h>

		@@ -251,6 +254,9 @@ enum {
		#define QI_DIOTLB_TYPE 0x3
		#define QI_IEC_TYPE 0x4
		#define QI_IWD_TYPE 0x5
		#define QI_EIOTLB_TYPE 0x6
		#define QI_PC_TYPE 0x7
		#define QI_DEIOTLB_TYPE 0x8

		#define QI_IEC_SELECTIVE (((u64)1) << 4)
		#define QI_IEC_IIDEX(idx) (((u64)(idx & 0xffff) << 32))
		@@ -278,6 +284,34 @@ enum {
		#define QI_DEV_IOTLB_SIZE 1
		#define QI_DEV_IOTLB_MAX_INVS 32

		/* Field builders for the PASID-cache invalidation descriptor (low word). */
		#define QI_PC_PASID(pasid) (((u64)(pasid)) << 32)
		#define QI_PC_DID(did) (((u64)(did)) << 16)
		#define QI_PC_GRAN(gran) (((u64)(gran)) << 4)

		/* Complete type + granularity selectors for PASID-cache invalidation. */
		#define QI_PC_ALL_PASIDS (QI_PC_TYPE | QI_PC_GRAN(0))
		#define QI_PC_PASID_SEL (QI_PC_TYPE | QI_PC_GRAN(1))

		/* Field builders for the extended IOTLB invalidation descriptor:
		 * ADDR/GL/IH/AM go in the high word, PASID/DID/GRAN in the low word. */
		#define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK)
		#define QI_EIOTLB_GL(gl) (((u64)(gl)) << 7)
		#define QI_EIOTLB_IH(ih) (((u64)(ih)) << 6)
		#define QI_EIOTLB_AM(am) (((u64)(am)))
		#define QI_EIOTLB_PASID(pasid) (((u64)(pasid)) << 32)
		#define QI_EIOTLB_DID(did) (((u64)(did)) << 16)
		#define QI_EIOTLB_GRAN(gran) (((u64)(gran)) << 4)

		/* Field builders for the extended device-TLB invalidation descriptor.
		 * ADDR/SIZE/GLOB go in the high word; PASID, SID and QDEP share the low
		 * word and must not overlap: PASID starts at bit 32, the source-id at
		 * bit 16 and the queue depth at bit 4, above the 4-bit descriptor type. */
		#define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK)
		#define QI_DEV_EIOTLB_SIZE (((u64)1) << 11)
		#define QI_DEV_EIOTLB_GLOB(g) ((u64)(g))
		#define QI_DEV_EIOTLB_PASID(p) (((u64)(p)) << 32)
		#define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16)
		#define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4)
		#define QI_DEV_EIOTLB_MAX_INVS 32

		/* Granularity codes passed to QI_EIOTLB_GRAN(). The flush code in
		 * intel-svm.c uses NONG_PASID for a whole-PASID flush and PSI_PASID for
		 * a page-selective one; presumably "NONG" means non-global mappings and
		 * "ALL" means all PASIDs -- confirm against the VT-d specification. */
		#define QI_GRAN_ALL_ALL 0
		#define QI_GRAN_NONG_ALL 1
		#define QI_GRAN_NONG_PASID 2
		#define QI_GRAN_PSI_PASID 3

		struct qi_desc {
		u64 low, high;
		};
		@@ -359,6 +393,7 @@ struct intel_iommu {
		* told to. But while it's all driver-arbitrated, we're fine. */
		struct pasid_entry *pasid_table;
		struct pasid_state_entry *pasid_state_table;
		struct idr pasid_idr;
		#endif
		struct q_inval qi; /* Queued invalidation info */
		u32 iommu_state; /* Store iommu states between suspend and resume.*/
		@@ -399,9 +434,32 @@ extern int qi_submit_sync(struct qi_desc desc, struct intel_iommu iommu);

		extern int dmar_ir_support(void);

		#ifdef CONFIG_INTEL_IOMMU_SVM
		extern int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu);
		extern int intel_svm_free_pasid_tables(struct intel_iommu *iommu);

		/*
		 * One device bound to an intel_svm (i.e. to one PASID / address space).
		 */
		struct intel_svm_dev {
		/* Link in the owning intel_svm's devs list (RCU-protected list ops). */
		struct list_head list;
		/* Used to free this structure with kfree_rcu() on final unbind. */
		struct rcu_head rcu;
		/* The bound device. */
		struct device *dev;
		/* Number of outstanding binds of this device to the PASID. */
		int users;
		/* Domain id of the device's domain on this IOMMU. */
		u16 did;
		/* Set when ATS is enabled on the device, so flushes must also
		 * invalidate the device TLB. */
		u16 dev_iotlb:1;
		/* PCI source-id (bus/devfn) and ATS invalidation queue depth; qdep is
		 * stored as 0 when the device reports a depth at or above
		 * QI_DEV_EIOTLB_MAX_INVS. */
		u16 sid, qdep;
		};

		/*
		 * State for one PASID: the address space it maps and the devices bound
		 * to it. Stored in the owning IOMMU's pasid_idr under its PASID.
		 */
		struct intel_svm {
		/* Registered on mm; callbacks recover this structure via container_of(). */
		struct mmu_notifier notifier;
		/* Address space bound to the PASID; a reference is held from bind
		 * until the last unbind. */
		struct mm_struct *mm;
		/* IOMMU whose PASID table and IDR hold this PASID. */
		struct intel_iommu *iommu;
		/* PASID value allocated from iommu->pasid_idr. */
		int pasid;
		/* List of struct intel_svm_dev bound to this PASID. */
		struct list_head devs;
		};

		/* Prepare sdev->dev for requests-with-PASID on iommu; 0 or -EINVAL. */
		extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev);
		/* IOMMU serving dev for SVM, or NULL if SVM cannot be used with it. */
		extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev);
		#endif

		extern const struct attribute_group *intel_iommu_groups[];

		#endif