Newer
Older
Greg Kroah-Hartman
committed
// SPDX-License-Identifier: GPL-2.0
/*
* Core of Xen paravirt_ops implementation.
*
* This file contains the xen_paravirt_ops structure itself, and the
* implementations for:
* - privileged instructions
* - interrupt flags
* - segment operations
* - booting and setup
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/preempt.h>
#include <linux/hardirq.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/edd.h>
#include <linux/frame.h>
#include <xen/xen.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
#include <xen/interface/nmi.h>
#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvc-console.h>
#include <xen/acpi.h>
#include <asm/paravirt.h>
#include <asm/apic.h>
#include <asm/page.h>
#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/cpuid.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/msr-index.h>
#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mach_traps.h>
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>
#ifdef CONFIG_X86_IOPL_IOPERM
#include <asm/io_bitmap.h>
#endif
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
#include <asm/acpi.h>
#include <acpi/pdc_intel.h>
#include <acpi/processor.h>
#include <xen/interface/platform.h>
#endif
#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"
#include "pmu.h"
#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */
void *xen_initial_gdt;
static int xen_cpu_up_prepare_pv(unsigned int cpu);
static int xen_cpu_dead_pv(unsigned int cpu);
struct tls_descs {
struct desc_struct desc[3];
};
/*
* Updating the 3 TLS descriptors in the GDT on every task switch is
* surprisingly expensive so we avoid updating them if they haven't
* changed. Since Xen writes different descriptors than the one
* passed in the update_descriptor hypercall we keep shadow copies to
* compare against.
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
static void __init xen_banner(void)
{
unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
struct xen_extraversion extra;
HYPERVISOR_xen_version(XENVER_extraversion, &extra);
pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
printk(KERN_INFO "Xen version: %d.%d%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
#ifdef CONFIG_X86_32
pr_warn("WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n"
"Support for running as 32-bit PV-guest under Xen will soon be removed\n"
"from the Linux kernel!\n"
"Please use either a 64-bit kernel or switch to HVM or PVH mode!\n"
"WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n");
#endif
static void __init xen_pv_init_platform(void)
{
populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
/* xen clock uses per-cpu vcpu_info, need to init it for boot cpu */
xen_vcpu_info_reset(0);
/* pvclock is in shared info area */
xen_init_time_ops();
}
static void __init xen_pv_guest_late_init(void)
{
#ifndef CONFIG_SMP
/* Setup shared vcpu info for non-smp configurations */
xen_setup_vcpu_info_placement();
#endif
}
/* Check if running on Xen version (major, minor) or later */
bool
xen_running_on_version_or_later(unsigned int major, unsigned int minor)
{
unsigned int version;
if (!xen_domain())
return false;
version = HYPERVISOR_xen_version(XENVER_version, NULL);
if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
((version >> 16) > major))
return true;
return false;
}
static __read_mostly unsigned int cpuid_leaf5_ecx_val;
static __read_mostly unsigned int cpuid_leaf5_edx_val;
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
unsigned maskebx = ~0;
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
/*
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
*/
switch (*ax) {
case CPUID_MWAIT_LEAF:
/* Synthesize the values.. */
*ax = 0;
*bx = 0;
*cx = cpuid_leaf5_ecx_val;
*dx = cpuid_leaf5_edx_val;
return;
case 0xb:
/* Suppress extended topology stuff */
maskebx = 0;
break;
}
asm(XEN_EMULATE_PREFIX "cpuid"
: "=a" (*ax),
"=b" (*bx),
"=c" (*cx),
"=d" (*dx)
: "0" (*ax), "2" (*cx));
*bx &= maskebx;
}
STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
static bool __init xen_check_mwait(void)
{
#ifdef CONFIG_ACPI
struct xen_platform_op op = {
.cmd = XENPF_set_processor_pminfo,
.u.set_pminfo.id = -1,
.u.set_pminfo.type = XEN_PM_PDC,
};
uint32_t buf[3];
unsigned int ax, bx, cx, dx;
unsigned int mwait_mask;
/* We need to determine whether it is OK to expose the MWAIT
* capability to the kernel to harvest deeper than C3 states from ACPI
* _CST using the processor_harvest_xen.c module. For this to work, we
* need to gather the MWAIT_LEAF values (which the cstate.c code
* checks against). The hypervisor won't expose the MWAIT flag because
* it would break backwards compatibility; so we will find out directly
* from the hardware and hypercall.
*/
if (!xen_initial_domain())
return false;
/*
* When running under platform earlier than Xen4.2, do not expose
* mwait, to avoid the risk of loading native acpi pad driver
*/
if (!xen_running_on_version_or_later(4, 2))
return false;
ax = 1;
cx = 0;
native_cpuid(&ax, &bx, &cx, &dx);
mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
(1 << (X86_FEATURE_MWAIT % 32));
if ((cx & mwait_mask) != mwait_mask)
return false;
/* We need to emulate the MWAIT_LEAF and for that we need both
* ecx and edx. The hypercall provides only partial information.
*/
ax = CPUID_MWAIT_LEAF;
bx = 0;
cx = 0;
dx = 0;
native_cpuid(&ax, &bx, &cx, &dx);
/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
* don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
*/
buf[0] = ACPI_PDC_REVISION_ID;
buf[1] = 1;
buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
if ((HYPERVISOR_platform_op(&op) == 0) &&
(buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
cpuid_leaf5_ecx_val = cx;
cpuid_leaf5_edx_val = dx;
}
return true;
#else
return false;
#endif
}
static bool __init xen_check_xsave(void)
{
cx = cpuid_ecx(1);
xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) |
(1 << (X86_FEATURE_OSXSAVE % 32));
/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
return (cx & xsave_mask) == xsave_mask;
static void __init xen_init_capabilities(void)
{
setup_force_cpu_cap(X86_FEATURE_XENPV);
setup_clear_cpu_cap(X86_FEATURE_DCA);
setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
setup_clear_cpu_cap(X86_FEATURE_MTRR);
setup_clear_cpu_cap(X86_FEATURE_ACC);
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
setup_clear_cpu_cap(X86_FEATURE_SME);
/*
* Xen PV would need some work to support PCID: CR3 handling as well
* as xen_flush_tlb_others() would need updating.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
if (!xen_initial_domain())
setup_clear_cpu_cap(X86_FEATURE_ACPI);
if (xen_check_mwait())
setup_force_cpu_cap(X86_FEATURE_MWAIT);
else
setup_clear_cpu_cap(X86_FEATURE_MWAIT);
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
}
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
static void xen_set_debugreg(int reg, unsigned long val)
{
HYPERVISOR_set_debugreg(reg, val);
}
static unsigned long xen_get_debugreg(int reg)
{
return HYPERVISOR_get_debugreg(reg);
}
static void xen_end_context_switch(struct task_struct *next)
{
xen_mc_flush();
paravirt_end_context_switch(next);
}
static unsigned long xen_store_tr(void)
{
return 0;
}
/*
* Set the page permissions for a particular virtual address. If the
* address is a vmalloc mapping (or other non-linear mapping), then
* find the linear mapping of the page and also set its protections to
* match.
*/
static void set_aliased_prot(void *v, pgprot_t prot)
{
int level;
pte_t *ptep;
pte_t pte;
unsigned long pfn;
struct page *page;
unsigned char dummy;
ptep = lookup_address((unsigned long)v, &level);
BUG_ON(ptep == NULL);
pfn = pte_pfn(*ptep);
page = pfn_to_page(pfn);
pte = pfn_pte(pfn, prot);
/*
* Careful: update_va_mapping() will fail if the virtual address
* we're poking isn't populated in the page tables. We don't
* need to worry about the direct map (that's always in the page
* tables), but we need to be careful about vmap space. In
* particular, the top level page table can lazily propagate
* entries between processes, so if we've switched mms since we
* vmapped the target in the first place, we might not have the
* top-level page table entry populated.
*
* We disable preemption because we want the same mm active when
* we probe the target and when we issue the hypercall. We'll
* have the same nominal mm, but if we're a kernel thread, lazy
* mm dropping could change our pgd.
*
* Out of an abundance of caution, this uses __get_user() to fault
* in the target address just in case there's some obscure case
* in which the target address isn't readable.
*/
preempt_disable();
Christoph Hellwig
committed
copy_from_kernel_nofault(&dummy, v, 1);
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
BUG();
if (!PageHighMem(page)) {
void *av = __va(PFN_PHYS(pfn));
if (av != v)
if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
BUG();
} else
kmap_flush_unused();
preempt_enable();
}
static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
int i;
/*
* We need to mark the all aliases of the LDT pages RO. We
* don't need to call vm_flush_aliases(), though, since that's
* only responsible for flushing aliases out the TLBs, not the
* page tables, and Xen will flush the TLB for us if needed.
*
* To avoid confusing future readers: none of this is necessary
* to load the LDT. The hypervisor only checks this when the
* LDT is faulted in due to subsequent descriptor access.
*/
for (i = 0; i < entries; i += entries_per_page)
set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}
static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
int i;
for (i = 0; i < entries; i += entries_per_page)
set_aliased_prot(ldt + i, PAGE_KERNEL);
}
static void xen_set_ldt(const void *addr, unsigned entries)
{
struct mmuext_op *op;
struct multicall_space mcs = xen_mc_entry(sizeof(*op));
trace_xen_cpu_set_ldt(addr, entries);
op = mcs.args;
op->cmd = MMUEXT_SET_LDT;
op->arg1.linear_addr = (unsigned long)addr;
op->arg2.nr_ents = entries;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
static void xen_load_gdt(const struct desc_ptr *dtr)
{
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
unsigned long pfn, mfn;
int level;
pte_t *ptep;
void *virt;
/* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
BUG_ON(size > PAGE_SIZE);
BUG_ON(va & ~PAGE_MASK);
* The GDT is per-cpu and is in the percpu data area.
* That can be virtually mapped, so we need to do a
* page-walk to get the underlying MFN for the
* hypercall. The page can also be in the kernel's
* linear range, so we need to RO that mapping too.
ptep = lookup_address(va, &level);
BUG_ON(ptep == NULL);
pfn = pte_pfn(*ptep);
mfn = pfn_to_mfn(pfn);
virt = __va(PFN_PHYS(pfn));
make_lowmem_page_readonly((void *)va);
make_lowmem_page_readonly(virt);
if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
BUG();
}
/*
* load_gdt for early boot, when the gdt is only mapped once
*/
static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
{
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
/* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
BUG_ON(size > PAGE_SIZE);
pfn = virt_to_pfn(va);
mfn = pfn_to_mfn(pfn);
if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
BUG();
if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
BUG();
}
static inline bool desc_equal(const struct desc_struct *d1,
const struct desc_struct *d2)
{
return !memcmp(d1, d2, sizeof(*d1));
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
}
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
struct desc_struct *gdt;
xmaddr_t maddr;
struct multicall_space mc;
if (desc_equal(shadow, &t->tls_array[i]))
return;
*shadow = t->tls_array[i];
gdt = get_cpu_gdt_rw(cpu);
maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
mc = __xen_mc_entry(0);
MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
/*
* XXX sleazy hack: If we're being called in a lazy-cpu zone
* and lazy gs handling is enabled, it means we're in a
* context switch, and %gs has just been saved. This means we
* can zero it out to prevent faults on exit from the
* hypervisor if the next process has no %gs. Either way, it
* has been saved, and the new value will get loaded properly.
* This will go away as soon as Xen has been modified to not
* save/restore %gs for normal hypercalls.
*
* On x86_64, this hack is not used for %gs, because gs points
* to KERNEL_GS_BASE (and uses it for PDA references), so we
* must not zero %gs on x86_64
*
* For x86_64, we need to zero %fs, otherwise we may get an
* exception between the new %fs descriptor being loaded and
* %fs being effectively cleared at __switch_to().
*/
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
lazy_load_gs(0);
#else
loadsegment(fs, 0);
#endif
}
xen_mc_batch();
load_TLS_descriptor(t, cpu, 0);
load_TLS_descriptor(t, cpu, 1);
load_TLS_descriptor(t, cpu, 2);
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
#ifdef CONFIG_X86_64
static void xen_load_gs_index(unsigned int idx)
{
if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
BUG();
}
#endif
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
const void *ptr)
{
xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
u64 entry = *(u64 *)ptr;
trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
preempt_disable();
xen_mc_flush();
if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
BUG();
preempt_enable();
}
#ifdef CONFIG_X86_64
void noist_exc_debug(struct pt_regs *regs);
DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
{
/* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */
exc_nmi(regs);
}
DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
{
/*
* There's no IST on Xen PV, but we still need to dispatch
* to the correct handler.
*/
if (user_mode(regs))
noist_exc_debug(regs);
else
exc_debug(regs);
}
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
bool ist_okay;
};
#define TRAP_ENTRY(func, ist_ok) { \
.orig = asm_##func, \
.xen = xen_asm_##func, \
.ist_okay = ist_ok }
#define TRAP_ENTRY_REDIR(func, ist_ok) { \
.xen = xen_asm_xenpv_##func, \
static struct trap_array_entry trap_array[] = {
TRAP_ENTRY_REDIR(exc_debug, true ),
TRAP_ENTRY(exc_double_fault, true ),
#ifdef CONFIG_X86_MCE
TRAP_ENTRY(exc_machine_check, true ),
TRAP_ENTRY_REDIR(exc_nmi, true ),
TRAP_ENTRY(exc_int3, false ),
TRAP_ENTRY(exc_overflow, false ),
#ifdef CONFIG_IA32_EMULATION
{ entry_INT80_compat, xen_entry_INT80_compat, false },
#endif
TRAP_ENTRY(exc_page_fault, false ),
TRAP_ENTRY(exc_divide_error, false ),
TRAP_ENTRY(exc_bounds, false ),
TRAP_ENTRY(exc_invalid_op, false ),
TRAP_ENTRY(exc_device_not_available, false ),
TRAP_ENTRY(exc_coproc_segment_overrun, false ),
TRAP_ENTRY(exc_invalid_tss, false ),
TRAP_ENTRY(exc_segment_not_present, false ),
TRAP_ENTRY(exc_stack_segment, false ),
TRAP_ENTRY(exc_general_protection, false ),
TRAP_ENTRY(exc_spurious_interrupt_bug, false ),
TRAP_ENTRY(exc_coprocessor_error, false ),
TRAP_ENTRY(exc_alignment_check, false ),
TRAP_ENTRY(exc_simd_coprocessor_error, false ),
static bool __ref get_trap_addr(void **addr, unsigned int ist)
{
unsigned int nr;
bool ist_okay = false;
/*
* Replace trap handler addresses by Xen specific ones.
* Check for known traps using IST and whitelist them.
* The debugger ones are the only ones we care about.
* Xen will handle faults like double_fault, so we should never see
* them. Warn if there's an unexpected IST-using fault handler.
*/
for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
struct trap_array_entry *entry = trap_array + nr;
if (*addr == entry->orig) {
*addr = entry->xen;
ist_okay = entry->ist_okay;
break;
}
}
if (nr == ARRAY_SIZE(trap_array) &&
*addr >= (void *)early_idt_handler_array[0] &&
*addr < (void *)early_idt_handler_array[NUM_EXCEPTION_VECTORS]) {
nr = (*addr - (void *)early_idt_handler_array[0]) /
EARLY_IDT_HANDLER_SIZE;
*addr = (void *)xen_early_idt_handler_array[nr];
}
if (WARN_ON(ist != 0 && !ist_okay))
return false;
return true;
}
#endif
static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
unsigned long addr;
if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT)
return 0;
info->vector = vector;
addr = gate_offset(val);
if (!get_trap_addr((void **)&addr, val->bits.ist))
return 0;
#endif /* CONFIG_X86_64 */
info->address = addr;
info->cs = gate_segment(val);
info->flags = val->bits.dpl;
if (val->bits.type == GATE_INTERRUPT)
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
info->flags |= 1 << 2;
return 1;
}
/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
/* Set an IDT entry. If the entry is part of the current IDT, then
also update Xen. */
static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{
unsigned long p = (unsigned long)&dt[entrynum];
unsigned long start, end;
trace_xen_cpu_write_idt_entry(dt, entrynum, g);
preempt_disable();
start = __this_cpu_read(idt_desc.address);
end = start + __this_cpu_read(idt_desc.size) + 1;
xen_mc_flush();
native_write_idt_entry(dt, entrynum, g);
if (p >= start && (p + 8) <= end) {
struct trap_info info[2];
info[1].address = 0;
if (cvt_gate_to_trap(entrynum, g, &info[0]))
if (HYPERVISOR_set_trap_table(info))
BUG();
}
preempt_enable();
}
static void xen_convert_trap_info(const struct desc_ptr *desc,
struct trap_info *traps)
{
unsigned in, out, count;
count = (desc->size+1) / sizeof(gate_desc);
BUG_ON(count > 256);
for (in = out = 0; in < count; in++) {
gate_desc *entry = (gate_desc *)(desc->address) + in;
if (cvt_gate_to_trap(in, entry, &traps[out]))
out++;
}
traps[out].address = 0;
}
void xen_copy_trap_info(struct trap_info *traps)
{
const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
xen_convert_trap_info(desc, traps);
}
/* Load a new IDT into Xen. In principle this can be per-CPU, so we
hold a spinlock to protect the static traps[] array (static because
it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct desc_ptr *desc)
{
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
trace_xen_cpu_load_idt(desc);
spin_lock(&lock);
memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
xen_convert_trap_info(desc, traps);
xen_mc_flush();
if (HYPERVISOR_set_trap_table(traps))
BUG();
spin_unlock(&lock);
}
/* Write a GDT descriptor entry. Ignore LDT descriptors, since
they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
const void *desc, int type)
{
trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
preempt_disable();
switch (type) {
case DESC_LDT:
case DESC_TSS:
/* ignore */
break;
default: {
xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
xen_mc_flush();
if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
BUG();
}
}
preempt_enable();
}
/*
* Version of write_gdt_entry for use at early boot-time needed to
* update an entry as simply as possible.
*/
static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
const void *desc, int type)
{
trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
switch (type) {
case DESC_LDT:
case DESC_TSS:
/* ignore */
break;
default: {
xmaddr_t maddr = virt_to_machine(&dt[entry]);
if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
dt[entry] = *(struct desc_struct *)desc;
}
}
}
static void xen_load_sp0(unsigned long sp0)
{
struct multicall_space mcs;
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
#ifdef CONFIG_X86_IOPL_IOPERM
static void xen_invalidate_io_bitmap(void)
{
struct physdev_set_iobitmap iobitmap = {
.bitmap = 0,
.nr_ports = 0,
};
native_tss_invalidate_io_bitmap();
HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobitmap);
}
static void xen_update_io_bitmap(void)
{
struct physdev_set_iobitmap iobitmap;
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
native_tss_update_io_bitmap();
iobitmap.bitmap = (uint8_t *)(&tss->x86_tss) +
tss->x86_tss.io_bitmap_base;
if (tss->x86_tss.io_bitmap_base == IO_BITMAP_OFFSET_INVALID)
iobitmap.nr_ports = 0;
else
iobitmap.nr_ports = IO_BITMAP_BITS;
HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobitmap);
}
#endif
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
static void xen_io_delay(void)
{
}
static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
static unsigned long xen_read_cr0(void)
{
unsigned long cr0 = this_cpu_read(xen_cr0_value);
if (unlikely(cr0 == 0)) {
cr0 = native_read_cr0();
this_cpu_write(xen_cr0_value, cr0);
}
return cr0;
}
static void xen_write_cr0(unsigned long cr0)
{
struct multicall_space mcs;
this_cpu_write(xen_cr0_value, cr0);
/* Only pay attention to cr0.TS; everything else is
ignored. */
mcs = xen_mc_entry(0);
MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
static void xen_write_cr4(unsigned long cr4)
{
cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
native_write_cr4(cr4);
}
static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
u64 val;
if (pmu_msr_read(msr, &val, err))
return val;
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
break;
}
return val;
}
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
int ret;
#ifdef CONFIG_X86_64
unsigned int which;
u64 base;
#endif
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
ret = 0;
switch (msr) {
#ifdef CONFIG_X86_64
case MSR_FS_BASE: which = SEGBASE_FS; goto set;
case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
set:
base = ((u64)high << 32) | low;
if (HYPERVISOR_set_segment_base(which, base) != 0)
ret = -EIO;
break;
#endif
case MSR_STAR:
case MSR_CSTAR:
case MSR_LSTAR:
case MSR_SYSCALL_MASK:
case MSR_IA32_SYSENTER_CS:
case MSR_IA32_SYSENTER_ESP:
case MSR_IA32_SYSENTER_EIP:
/* Fast syscall setup is all done in hypercalls, so
these are all ignored. Stub them out here to stop
Xen console noise. */
break;
default:
if (!pmu_msr_write(msr, low, high, &ret))
ret = native_write_msr_safe(msr, low, high);
}
return ret;
}