/*
 * Copyright (c) 2007, 2008 University of Tsukuba
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of the University of Tsukuba nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 2010-2012 Yuichi Watanabe
 */

#include <core/assert.h>
#include <core/initfunc.h>
#include <core/linkage.h>
#include <core/printf.h>
#include <core/string.h>
#include "apic_pass.h"
#include "apic.h"
#include "asm.h"
#include "bitmap.h"
#include "constants.h"
#include "cpu.h"
#include "cpu_emul.h"
#include "cpu_mmu.h"
#include "current.h"
#include "extint.h"
#include "guest_boot.h"
#include "int.h"
#include "panic.h"
#include "pcpu.h"
#include "reboot.h"
#include "thread.h"
#include "vcpu.h"
#include "vmmcall.h"
#include "vmmcall_status.h"
#include "vt.h"
#include "vt_internal.h"

/* Result of an attempt to enter the guest, as reported by
   call_vt_vmlaunch () and call_vt_vmresume (). */
enum vt_status {
	VT_VMENTRY_SUCCESS,	/* not produced by any code visible in this file */
	VT_VMENTRY_FAILED,	/* the asm VM-entry wrapper returned nonzero */
	VT_VMEXIT,		/* the asm VM-entry wrapper returned zero */
};

/* Event counters reported by vt_status ().  Each one is bumped through
   STATUS_UPDATE (asm_lock_incl (...)) at the place noted below. */
static u32 stat_intcnt = 0;	/* external-interrupt VM exits */
static u32 stat_hwexcnt = 0;	/* hardware-exception VM exits */
static u32 stat_swexcnt = 0;	/* software-exception VM exits */
static u32 stat_pfcnt = 0;	/* page faults passed to cpu_mmu_spt_pagefault () */
static u32 stat_iocnt = 0;	/* I/O-instruction VM exits */
static u32 stat_hltcnt = 0;	/* HLT VM exits */

static void
do_mov_cr (void)
{
	ulong val;
	union {
		struct exit_qual_cr s;
		ulong v;
	} eqc;

	asm_vmread (VMCS_EXIT_QUALIFICATION, &eqc.v);
	switch (eqc.s.type) {
	case EXIT_QUAL_CR_TYPE_MOV_TO_CR:
		vt_read_general_reg (eqc.s.reg, &val);
		vt_write_control_reg (eqc.s.num, val);
		break;
	case EXIT_QUAL_CR_TYPE_MOV_FROM_CR:
		vt_read_control_reg (eqc.s.num, &val);
		vt_write_general_reg (eqc.s.reg, val);
		break;
	case EXIT_QUAL_CR_TYPE_CLTS:
		vt_read_control_reg (CONTROL_REG_CR0, &val);
		val &= ~CR0_TS_BIT;
		vt_write_control_reg (CONTROL_REG_CR0, val);
		break;
	case EXIT_QUAL_CR_TYPE_LMSW:
		vt_read_control_reg (CONTROL_REG_CR0, &val);
		val &= ~0xFFFF;
		val |= eqc.s.lmsw_src;
		vt_write_control_reg (CONTROL_REG_CR0, val);
		break;
	default:
		panic ("Not implemented.");
	}
	add_ip ();
}

/*
 * Handle a CPUID VM exit: cpu_emul_cpuid () emulates the instruction,
 * then add_ip () is called (presumably stepping the guest instruction
 * pointer past it; add_ip is defined elsewhere).
 */
static void
do_cpuid (void)
{
	cpu_emul_cpuid ();
	add_ip ();
}

/*
 * Queue a general protection fault for the guest after a failed MSR
 * emulation.
 *
 * The fault is recorded in current->u.vt.intr as a hardware exception
 * with a valid error code of 0 and an instruction length of 0.
 * Panics if another event is already pending there.
 */
static void
do_msr_fault (void)
{
	struct vt_intr_data *pending = &current->u.vt.intr;

	if (pending->intr_info.s.valid == INTR_INFO_VALID_VALID)
		panic ("Can't inject general protection fault (MSR) because the other event is already pending. "
		       "vector 0x%x type 0x%x, err_valid 0x%x",
		       pending->intr_info.s.vector,
		       pending->intr_info.s.type,
		       pending->intr_info.s.err);

	pending->exception_errcode = 0;
	pending->instruction_len = 0;

	/* Start from an all-zero word, then fill in the fields we need. */
	pending->intr_info.v = 0;
	pending->intr_info.s.type = INTR_INFO_TYPE_HARD_EXCEPTION;
	pending->intr_info.s.vector = EXCEPTION_GP;
	pending->intr_info.s.err = INTR_INFO_ERR_VALID;
	pending->intr_info.s.valid = INTR_INFO_VALID_VALID;
}

/*
 * Handle a RDMSR VM exit.  When cpu_emul_rdmsr () returns zero the
 * instruction is considered done and add_ip () is called; a nonzero
 * return queues a general protection fault instead.
 */
static void
do_rdmsr (void)
{
	if (!cpu_emul_rdmsr ()) {
		add_ip ();
		return;
	}
	do_msr_fault ();
}

/*
 * Handle a WRMSR VM exit.  When cpu_emul_wrmsr () returns zero the
 * instruction is considered done and add_ip () is called; a nonzero
 * return queues a general protection fault instead.
 */
static void
do_wrmsr (void)
{
	if (!cpu_emul_wrmsr ()) {
		add_ip ();
		return;
	}
	do_msr_fault ();
}

/*
 * Handle a VM exit caused by an exception or NMI.
 *
 * The VM-exit interruption information is read and, depending on its
 * type and vector, the event is either handled here (shadow page table
 * fault, real-mode emulation) or recorded in current->u.vt.intr so that
 * vt_event_delivery_setup () writes it into the VM-entry fields before
 * the next VM entry.  Nothing is done when the information is not valid.
 * NMIs, external interrupts and unknown types panic.
 */
static void
do_exception (void)
{
	union {
		struct intr_info s;
		ulong v;
	} vii;
	ulong len;
	vmmerr_t err;
	ulong errc;

	asm_vmread (VMCS_VMEXIT_INTR_INFO, &vii.v);
	if (vii.s.valid != INTR_INFO_VALID_VALID) {
		return;
	}
	switch (vii.s.type) {
	case INTR_INFO_TYPE_HARD_EXCEPTION:
		STATUS_UPDATE (asm_lock_incl (&stat_hwexcnt));
		if (vii.s.vector == EXCEPTION_DB &&
		    current->u.vt.vr.sw.enable) {
			/*
			 * A #DB while sw.enable is set is dropped.
			 * vt_vm_run_with_tf () sets TF in that state, so
			 * this is presumably the single-step trap it
			 * provokes.
			 */
			break;
		}
		if (vii.s.vector == EXCEPTION_PF) {
			ulong cr2;
			asm_vmread (VMCS_VMEXIT_INTR_ERRCODE, &errc);
			/* For a page fault the exit qualification is
			   used as the faulting address (CR2). */
			asm_vmread (VMCS_EXIT_QUALIFICATION, &cr2);

			if (current->u.vt.ept_enabled && current->u.vt.vr.pg) {
				/*
				 * page fault causes VM-exit
				 * even though EPT is enabled and page
				 * fault is masked.
				 */
				if (current->u.vt.intr.intr_info.s.valid
				    == INTR_INFO_VALID_VALID) {
					panic ("Can't inject a page fault because the other event is already pending. "
					       "vector 0x%x type 0x%x, err_valid 0x%x",
					       current->u.vt.intr.intr_info.s.vector,
					       current->u.vt.intr.intr_info.s.type,
					       current->u.vt.intr.intr_info.s.err);
				}
				/* Hand the fault back to the guest. */
				current->u.vt.intr.intr_info.v = vii.v;
				current->u.vt.vr.cr2 = cr2;
				current->u.vt.intr.exception_errcode = errc;
				break;
			}

			cpu_mmu_spt_pagefault (errc, cr2);
			STATUS_UPDATE (asm_lock_incl (&stat_pfcnt));
		} else if (!current->u.vt.vr.pe) {
			/* Real mode */
			switch (vii.s.vector) {
			case EXCEPTION_GP:
				err = cpu_interpreter ();
				if (err == VMMERR_SUCCESS)
					break;
				panic ("General protection fault in real mode and can't emulate the instruction (err: %d)",
				       err);
			case EXCEPTION_DB:
				if (cpu_emul_realmode_int (1)) {
					panic ("cpu_emul_realmode_int (1) error");
				}
				break;
			default:
				panic ("Unimplemented exception in real mode: "
				       "0x%02X %s\n", vii.s.vector,
				       exception_name (vii.s.vector));
			}
		} else {
			if (current->u.vt.intr.intr_info.s.valid == INTR_INFO_VALID_VALID) {
				panic ("Can't inject a exception because the other event is already pending. "
				       "old: vector 0x%x type 0x%x, err_valid 0x%x "
				       "new: vector 0x%x type 0x%x, err_valid 0x%x",
				       current->u.vt.intr.intr_info.s.vector,
				       current->u.vt.intr.intr_info.s.type,
				       current->u.vt.intr.intr_info.s.err,
				       vii.s.vector, vii.s.type, vii.s.err);
			}
			/*
			 * The stored event is not valid here (checked
			 * above), so set the valid bit (bit 31) before
			 * comparing it with the new one to detect the
			 * same exception being injected twice in a row.
			 */
			if ((current->u.vt.intr.intr_info.v | 0x80000000)
			    == vii.v) {
				printf("DEBUG: Injecting the same exception twice.\n");
			}
			current->u.vt.intr.intr_info.v = vii.v;
			if (vii.s.err == INTR_INFO_ERR_VALID) {
				asm_vmread (VMCS_VMEXIT_INTR_ERRCODE,
					    &errc);
				current->u.vt.intr.exception_errcode = errc;
				if (vii.s.vector == EXCEPTION_GP) {
					panic ("DEBUG: General protection fault on CPU%d. error code 0x%lx\n", get_cpu_id(), errc);
				}
			}
			current->u.vt.intr.instruction_len = 0;
		}
		break;
	case INTR_INFO_TYPE_SOFT_EXCEPTION:
		if (current->u.vt.intr.intr_info.s.valid == INTR_INFO_VALID_VALID) {
			panic ("Can't inject a soft exception because the other event is already pending. "
			       "vector 0x%x type 0x%x, err_valid 0x%x",
			       current->u.vt.intr.intr_info.s.vector,
			       current->u.vt.intr.intr_info.s.type,
			       current->u.vt.intr.intr_info.s.err);
		}
		STATUS_UPDATE (asm_lock_incl (&stat_swexcnt));
		/* Re-inject with the length of the instruction that
		   raised the software exception. */
		current->u.vt.intr.intr_info.v = vii.v;
		asm_vmread (VMCS_VMEXIT_INSTRUCTION_LEN, &len);
		current->u.vt.intr.instruction_len = len;
		break;
	case INTR_INFO_TYPE_NMI:
		panic("NMI recieved while executing guest software.");
		break;
	case INTR_INFO_TYPE_EXTERNAL:
	default:
		panic ("intr_info_type %d not implemented",
		       vii.s.type);
	}
}

/*
 * Handle an INVLPG VM exit: the exit qualification is read as the
 * linear address and passed to cpu_mmu_spt_invalidate (), then
 * add_ip () is called.
 */
static void
do_invlpg (void)
{
	ulong linear;

	asm_vmread (VMCS_EXIT_QUALIFICATION, &linear);
	cpu_mmu_spt_invalidate (linear);
	add_ip ();
}

/*
 * Exported entry point that forwards `addr' to
 * cpu_mmu_spt_invalidate ().  Unlike do_invlpg () it touches neither
 * the VMCS nor the guest instruction pointer.
 */
void
vt_invlpg (ulong addr)
{
	cpu_mmu_spt_invalidate (addr);
}

/* VMCALL: guest calls VMM.
   add_ip () is called before vmmcall () is dispatched. */
static void
do_vmcall (void)
{
	add_ip ();
	vmmcall ();
}


/*
 * Program the VM-entry event-injection fields from the event recorded
 * in current->u.vt.intr, if that event is marked valid.  The error code
 * is written only when the event carries one; the instruction length is
 * always written.
 */
static void
vt_event_delivery_setup (void)
{
	struct vt_intr_data *pending = &current->u.vt.intr;

	if (pending->intr_info.s.valid != INTR_INFO_VALID_VALID)
		return;

	asm_vmwrite (VMCS_VMENTRY_INTR_INFO_FIELD, pending->intr_info.v);
	if (pending->intr_info.s.err == INTR_INFO_ERR_VALID) {
		asm_vmwrite (VMCS_VMENTRY_EXCEPTION_ERRCODE,
			     pending->exception_errcode);
	}
	asm_vmwrite (VMCS_VMENTRY_INSTRUCTION_LEN, pending->instruction_len);
}

/*
 * Enter the guest through asm_vmlaunch_regs () using the current
 * vcpu's register block.  A nonzero return from the wrapper is mapped
 * to VT_VMENTRY_FAILED, zero to VT_VMEXIT.
 */
static enum vt_status
call_vt_vmlaunch (void)
{
	return asm_vmlaunch_regs (&current->u.vt.vr)
		? VT_VMENTRY_FAILED : VT_VMEXIT;
}

/*
 * Enter the guest through asm_vmresume_regs () using the current
 * vcpu's register block.  A nonzero return from the wrapper is mapped
 * to VT_VMENTRY_FAILED, zero to VT_VMEXIT.
 */
static enum vt_status
call_vt_vmresume (void)
{
	return asm_vmresume_regs (&current->u.vt.vr)
		? VT_VMENTRY_FAILED : VT_VMEXIT;
}

static void
vt_vm_run (void)
{
	enum vt_status status;

	if (!current->u.vt.launched) {
		current->u.vt.launched = true;
		status = call_vt_vmlaunch ();
	} else {		
		status = call_vt_vmresume ();
	}
	if (status != VT_VMEXIT) {
		if (status == VT_VMENTRY_FAILED)
			panic ("VM entry failed.");
		else
			panic ("Strange status.");
	}
}

/* FIXME: bad handling of TF bit */
/*
 * Run the guest once with RFLAGS.TF forced on, then force it off again
 * once vt_vm_run () has returned.  Whatever TF value the guest had
 * before the call is not preserved.
 */
static void
vt_vm_run_with_tf (void)
{
	ulong flags;

	vt_read_flags (&flags);
	vt_write_flags (flags | RFLAGS_TF_BIT);

	vt_vm_run ();

	vt_read_flags (&flags);
	vt_write_flags (flags & ~RFLAGS_TF_BIT);
}

/*
 * After a VM exit, reconcile the event recorded in current->u.vt.intr
 * with the IDT-vectoring information field.
 *
 * Nothing is done when no event is recorded.  Otherwise:
 *  - vectoring information not valid: the recorded event is marked
 *    invalid;
 *  - vectoring information equal to the recorded event: the event is
 *    left marked valid;
 *  - anything else: panic.
 */
static void
vt_event_delivery_check (void)
{
	struct vt_intr_data *pending = &current->u.vt.intr;
	union {
		struct intr_info s;
		ulong v;
	} vectoring;

	if (pending->intr_info.s.valid != INTR_INFO_VALID_VALID)
		return;

	/* Check whether the VM exit occurred during event delivery. */
	asm_vmread (VMCS_IDT_VECTORING_INFO_FIELD, &vectoring.v);
	if (vectoring.s.valid == INTR_INFO_VALID_INVALID) {
		pending->intr_info.s.valid = INTR_INFO_VALID_INVALID;
		return;
	}
	if (vectoring.v != pending->intr_info.v) {
		panic (
		       "VMCS_IDT_VECTORING_INFO 0x%lX "
		       "is different from "
		       "VMCS_VMENTRY_INTR_INFO before VM-enter 0x%X\n",
		       vectoring.v, pending->intr_info.v);
	}
}

/*
 * Handle a HLT VM exit: cpu_emul_hlt () emulates the instruction, then
 * add_ip () is called.
 * NOTE(review): cpu_emul_hlt () presumably sets current->halt, which
 * vt_mainloop () tests -- confirm in cpu_emul.
 */
static void
do_hlt (void)
{
	cpu_emul_hlt ();
	add_ip ();
}

/*
 * Load one segment register's hidden state (access rights, base, limit)
 * in the VMCS from the descriptor that selector `sel' names.
 *
 * sel         selector to load; a null selector (0) leaves the VMCS
 *             fields untouched
 * gdtr_base   linear base of the guest GDT
 * gdtr_limit  limit of the guest GDT
 * base        VMCS field encoding that receives the segment base
 * limit       VMCS field encoding that receives the segment limit
 * acr         VMCS field encoding that receives the access rights
 *
 * If the selector has the LDT bit set, the descriptor is read from the
 * table described by the guest LDTR fields currently in the VMCS.
 * Panics when that LDT is unusable, when the descriptor lies beyond the
 * table limit, or when the descriptor cannot be read.
 */
static void
task_switch_load_segdesc (u16 sel, ulong gdtr_base, ulong gdtr_limit,
			  ulong base, ulong limit, ulong acr)
{
	ulong addr, ldt_acr, desc_base, desc_limit;
	u32 seg_limit;
	union {
		struct segdesc s;
		u64 v;
	} desc;
	vmmerr_t r;

	/* FIXME: set busy bit */
	if (sel == 0)
		return;
	if (sel & SEL_LDT_BIT) {
		asm_vmread (VMCS_GUEST_LDTR_ACCESS_RIGHTS, &ldt_acr);
		asm_vmread (VMCS_GUEST_LDTR_BASE, &desc_base);
		asm_vmread (VMCS_GUEST_LDTR_LIMIT, &desc_limit);
		if (ldt_acr & ACCESS_RIGHTS_UNUSABLE_BIT)
			panic ("loadseg: LDT unusable. sel=0x%X, idx=0x%lX\n",
			       sel, base);
	} else {
		desc_base = gdtr_base;
		desc_limit = gdtr_limit;
	}
	/* Byte offset of the descriptor inside its table: the selector
	   with the table-indicator and privilege bits removed. */
	addr = sel & ~(SEL_LDT_BIT | SEL_PRIV_MASK);
	/* The whole 8-byte descriptor must lie within the table. */
	if ((addr | 7) > desc_limit)
		panic ("loadseg: limit check failed");
	addr += desc_base;
	r = read_linearaddr_q (addr, &desc.v);
	if (r != VMMERR_SUCCESS)
		panic ("loadseg: cannot read descriptor");
	if (desc.s.s == SEGDESC_S_CODE_OR_DATA_SEGMENT)
		desc.s.type |= 1; /* accessed bit */
	asm_vmwrite (acr, (desc.v >> 40) & ACCESS_RIGHTS_MASK);
	asm_vmwrite (base, SEGDESC_BASE (desc.s));
	/*
	 * Build the 20-bit limit in unsigned 32-bit arithmetic.  With the
	 * granularity bit set it is scaled to 4KiB units, which can reach
	 * 0xFFFFFFFF and would overflow a promoted (signed) int.
	 */
	seg_limit = (u32)desc.s.limit_15_0 | ((u32)desc.s.limit_19_16 << 16);
	if (desc.s.g)
		seg_limit = (seg_limit << 12) | 0xFFF;
	asm_vmwrite (limit, seg_limit);
}

static void
do_task_switch (void)
{
	vmmerr_t r;
	union {
		struct exit_qual_ts s;
		ulong v;
	} eqt;
	ulong tr_sel;
	ulong gdtr_base, gdtr_limit;
	union {
		struct segdesc s;
		u64 v;
	} tss1_desc, tss2_desc;
	struct tss32 tss32_1, tss32_2;
	ulong rflags, tmp;
	u16 tmp16;

	/* FIXME: 16bit TSS */
	/* FIXME: generate an exception if errors */
	/* FIXME: virtual 8086 mode */
	asm_vmread (VMCS_EXIT_QUALIFICATION, &eqt.v);
	asm_vmread (VMCS_GUEST_TR_SEL, &tr_sel);
	printf ("task switch from 0x%lX to 0x%X\n", tr_sel, eqt.s.sel);
	vt_read_gdtr (&gdtr_base, &gdtr_limit);
	r = read_linearaddr_q (gdtr_base + tr_sel, &tss1_desc.v);
	if (r != VMMERR_SUCCESS)
		goto err;
	r = read_linearaddr_q (gdtr_base + eqt.s.sel, &tss2_desc.v);
	if (r != VMMERR_SUCCESS)
		goto err;
	if (tss1_desc.s.type == SEGDESC_TYPE_16BIT_TSS_BUSY)
		panic ("task switch from 16bit TSS is not implemented.");
	if (tss1_desc.s.type != SEGDESC_TYPE_32BIT_TSS_BUSY)
		panic ("bad TSS descriptor 0x%llX", tss1_desc.v);
	if (eqt.s.src == EXIT_QUAL_TS_SRC_IRET ||
	    eqt.s.src == EXIT_QUAL_TS_SRC_JMP)
		tss1_desc.s.type = SEGDESC_TYPE_32BIT_TSS_AVAILABLE;
	if (eqt.s.src == EXIT_QUAL_TS_SRC_IRET) {
		if (tss2_desc.s.type == SEGDESC_TYPE_16BIT_TSS_BUSY)
			panic ("task switch to 16bit TSS is not implemented.");
		if (tss2_desc.s.type != SEGDESC_TYPE_32BIT_TSS_BUSY)
			panic ("bad TSS descriptor 0x%llX", tss1_desc.v);
	} else {
		if (tss2_desc.s.type == SEGDESC_TYPE_16BIT_TSS_AVAILABLE)
			panic ("task switch to 16bit TSS is not implemented.");
		if (tss2_desc.s.type != SEGDESC_TYPE_32BIT_TSS_AVAILABLE)
			panic ("bad TSS descriptor 0x%llX", tss1_desc.v);
		tss2_desc.s.type = SEGDESC_TYPE_32BIT_TSS_BUSY;
	}
	r = read_linearaddr_tss (SEGDESC_BASE (tss1_desc.s), &tss32_1,
				 sizeof tss32_1);
	if (r != VMMERR_SUCCESS)
		goto err;
	r = read_linearaddr_tss (SEGDESC_BASE (tss2_desc.s), &tss32_2,
				 sizeof tss32_2);
	if (r != VMMERR_SUCCESS)
		goto err;
	/* save old state */
	vt_read_flags (&rflags);
	if (eqt.s.src == EXIT_QUAL_TS_SRC_IRET)
		rflags &= ~RFLAGS_NT_BIT;
	vt_read_general_reg (GENERAL_REG_RAX, &tmp); tss32_1.eax = tmp;
	vt_read_general_reg (GENERAL_REG_RCX, &tmp); tss32_1.ecx = tmp;
	vt_read_general_reg (GENERAL_REG_RDX, &tmp); tss32_1.edx = tmp;
	vt_read_general_reg (GENERAL_REG_RBX, &tmp); tss32_1.ebx = tmp;
	vt_read_general_reg (GENERAL_REG_RSP, &tmp); tss32_1.esp = tmp;
	vt_read_general_reg (GENERAL_REG_RBP, &tmp); tss32_1.ebp = tmp;
	vt_read_general_reg (GENERAL_REG_RSI, &tmp); tss32_1.esi = tmp;
	vt_read_general_reg (GENERAL_REG_RDI, &tmp); tss32_1.edi = tmp;
	vt_read_sreg_sel (SREG_ES, &tmp16); tss32_1.es = tmp16;
	vt_read_sreg_sel (SREG_CS, &tmp16); tss32_1.cs = tmp16;
	vt_read_sreg_sel (SREG_SS, &tmp16); tss32_1.ss = tmp16;
	vt_read_sreg_sel (SREG_DS, &tmp16); tss32_1.ds = tmp16;
	vt_read_sreg_sel (SREG_FS, &tmp16); tss32_1.fs = tmp16;
	vt_read_sreg_sel (SREG_GS, &tmp16); tss32_1.gs = tmp16;
	tss32_1.eflags = rflags;
	vt_read_ip (&tmp); tss32_1.eip = tmp;
	r = write_linearaddr_q (gdtr_base + tr_sel, tss1_desc.v);
	if (r != VMMERR_SUCCESS)
		goto err;
	r = write_linearaddr_tss (SEGDESC_BASE (tss1_desc.s), &tss32_1,
				  sizeof tss32_1);
	if (r != VMMERR_SUCCESS)
		goto err;
	/* load new state */
	rflags = tss32_2.eflags;
	if (eqt.s.src == EXIT_QUAL_TS_SRC_CALL ||
	    eqt.s.src == EXIT_QUAL_TS_SRC_INTR) {
		rflags |= RFLAGS_NT_BIT;
		tss32_2.link = tr_sel;
	}
	rflags |= RFLAGS_ALWAYS1_BIT;
	vt_write_general_reg (GENERAL_REG_RAX, tss32_2.eax);
	vt_write_general_reg (GENERAL_REG_RCX, tss32_2.ecx);
	vt_write_general_reg (GENERAL_REG_RDX, tss32_2.edx);
	vt_write_general_reg (GENERAL_REG_RBX, tss32_2.ebx);
	vt_write_general_reg (GENERAL_REG_RSP, tss32_2.esp);
	vt_write_general_reg (GENERAL_REG_RBP, tss32_2.ebp);
	vt_write_general_reg (GENERAL_REG_RSI, tss32_2.esi);
	vt_write_general_reg (GENERAL_REG_RDI, tss32_2.edi);
	asm_vmwrite (VMCS_GUEST_ES_SEL, tss32_2.es);
	asm_vmwrite (VMCS_GUEST_CS_SEL, tss32_2.cs);
	asm_vmwrite (VMCS_GUEST_SS_SEL, tss32_2.ss);
	asm_vmwrite (VMCS_GUEST_DS_SEL, tss32_2.ds);
	asm_vmwrite (VMCS_GUEST_FS_SEL, tss32_2.fs);
	asm_vmwrite (VMCS_GUEST_GS_SEL, tss32_2.gs);
	asm_vmwrite (VMCS_GUEST_TR_SEL, eqt.s.sel);
	asm_vmwrite (VMCS_GUEST_LDTR_SEL, tss32_2.ldt);
	asm_vmwrite (VMCS_GUEST_ES_ACCESS_RIGHTS, ACCESS_RIGHTS_UNUSABLE_BIT);
	asm_vmwrite (VMCS_GUEST_CS_ACCESS_RIGHTS, ACCESS_RIGHTS_UNUSABLE_BIT);
	asm_vmwrite (VMCS_GUEST_SS_ACCESS_RIGHTS, ACCESS_RIGHTS_UNUSABLE_BIT);
	asm_vmwrite (VMCS_GUEST_DS_ACCESS_RIGHTS, ACCESS_RIGHTS_UNUSABLE_BIT);
	asm_vmwrite (VMCS_GUEST_FS_ACCESS_RIGHTS, ACCESS_RIGHTS_UNUSABLE_BIT);
	asm_vmwrite (VMCS_GUEST_GS_ACCESS_RIGHTS, ACCESS_RIGHTS_UNUSABLE_BIT);
	asm_vmwrite (VMCS_GUEST_TR_ACCESS_RIGHTS, ACCESS_RIGHTS_UNUSABLE_BIT);
	asm_vmwrite (VMCS_GUEST_LDTR_ACCESS_RIGHTS,
		     ACCESS_RIGHTS_UNUSABLE_BIT);
	vt_write_flags (rflags);
	vt_write_ip (tss32_2.eip);
	vt_write_control_reg (CONTROL_REG_CR3, tss32_2.cr3);
	r = write_linearaddr_q (gdtr_base + eqt.s.sel, tss2_desc.v);
	if (r != VMMERR_SUCCESS)
		goto err;
	r = write_linearaddr_tss (SEGDESC_BASE (tss2_desc.s), &tss32_2,
				  sizeof tss32_2);
	if (r != VMMERR_SUCCESS)
		goto err;
	/* load segment descriptors */
	if (rflags & RFLAGS_VM_BIT)
		panic ("switching to virtual 8086 mode");
	task_switch_load_segdesc (eqt.s.sel, gdtr_base, gdtr_limit,
				  VMCS_GUEST_TR_BASE, VMCS_GUEST_TR_LIMIT,
				  VMCS_GUEST_TR_ACCESS_RIGHTS);
	task_switch_load_segdesc (tss32_2.ldt, gdtr_base, gdtr_limit,
				  VMCS_GUEST_LDTR_BASE, VMCS_GUEST_LDTR_LIMIT,
				  VMCS_GUEST_LDTR_ACCESS_RIGHTS);
	task_switch_load_segdesc (tss32_2.es, gdtr_base, gdtr_limit,
				  VMCS_GUEST_ES_BASE, VMCS_GUEST_ES_LIMIT,
				  VMCS_GUEST_ES_ACCESS_RIGHTS);
	task_switch_load_segdesc (tss32_2.cs, gdtr_base, gdtr_limit,
				  VMCS_GUEST_CS_BASE, VMCS_GUEST_CS_LIMIT,
				  VMCS_GUEST_CS_ACCESS_RIGHTS);
	task_switch_load_segdesc (tss32_2.ss, gdtr_base, gdtr_limit,
				  VMCS_GUEST_SS_BASE, VMCS_GUEST_SS_LIMIT,
				  VMCS_GUEST_SS_ACCESS_RIGHTS);
	task_switch_load_segdesc (tss32_2.ds, gdtr_base, gdtr_limit,
				  VMCS_GUEST_DS_BASE, VMCS_GUEST_DS_LIMIT,
				  VMCS_GUEST_DS_ACCESS_RIGHTS);
	task_switch_load_segdesc (tss32_2.fs, gdtr_base, gdtr_limit,
				  VMCS_GUEST_FS_BASE, VMCS_GUEST_FS_LIMIT,
				  VMCS_GUEST_FS_ACCESS_RIGHTS);
	task_switch_load_segdesc (tss32_2.gs, gdtr_base, gdtr_limit,
				  VMCS_GUEST_GS_BASE, VMCS_GUEST_GS_LIMIT,
				  VMCS_GUEST_GS_ACCESS_RIGHTS);
	vt_read_control_reg (CONTROL_REG_CR0, &tmp);
	tmp |= CR0_TS_BIT;
	vt_write_control_reg (CONTROL_REG_CR0, tmp);
	return;
err:
	panic ("do_task_switch: error %d", r);
}

/*
 * React to an INIT signal received by this logical processor.
 * Some controller (for example, the ACPI controller) might have
 * asserted INIT.  The machine is rebooted from CPU0; any other
 * processor resets its vcpu and enters the wait-for-SIPI state.
 */
static void
vt_handle_init_signal (void)
{
	if (!cpu_is_bsp()) {
		vcpu_reset();
		apic_enter_wait_for_sipi_state();
		return;
	}
	/* To reboot the machine, we seem to need to execute vmxoff on
	   CPU0. */
	asm_vmxoff();
	apic_send_init(get_apic_id());
	freeze();
}

/*
 * Read the VM-exit reason and dispatch to the matching handler.
 * Panics on a VM-entry failure, on an EPT misconfiguration and on any
 * exit reason that has no handler here.
 */
static void
vt_exit_reason (void)
{
	ulong exit_reason;

	asm_vmread (VMCS_EXIT_REASON, &exit_reason);
	if (exit_reason & EXIT_REASON_VMENTRY_FAILURE_BIT)
		panic ("VM Entry failure.");

	switch (exit_reason & EXIT_REASON_MASK) {
	case EXIT_REASON_MOV_CR:
		do_mov_cr ();
		break;
	case EXIT_REASON_CPUID:
		do_cpuid ();
		break;
	case EXIT_REASON_IO_INSTRUCTION:
		STATUS_UPDATE (asm_lock_incl (&stat_iocnt));
		vt_io ();
		break;
	case EXIT_REASON_RDMSR:
		do_rdmsr ();
		break;
	case EXIT_REASON_WRMSR:
		do_wrmsr ();
		break;
	case EXIT_REASON_EXCEPTION_OR_NMI:
		do_exception ();
		break;
	case EXIT_REASON_EXTERNAL_INT:
		/* Only counted here: extint_check_interrupt will either
		   inject the pending interrupt or keep it. */
		STATUS_UPDATE (asm_lock_incl (&stat_intcnt));
		break;
	case EXIT_REASON_INTERRUPT_WINDOW:
		/* Nothing to do here: extint_check_interrupt will inject
		   the pending interrupt. */
		break;
	case EXIT_REASON_INVLPG:
		do_invlpg ();
		break;
	case EXIT_REASON_VMCALL: /* for debugging */
		do_vmcall ();
		break;
	case EXIT_REASON_HLT:
		STATUS_UPDATE (asm_lock_incl (&stat_hltcnt));
		do_hlt ();
		break;
	case EXIT_REASON_TASK_SWITCH:
		do_task_switch ();
		break;
	case EXIT_REASON_INIT_SIGNAL:
		vt_handle_init_signal ();
		break;
	case EXIT_REASON_EPT_VIOLATION:
		vt_ept_violation();
		break;
	case EXIT_REASON_EPT_MISCONFIG:
		panic("EPT misconfig");
		break;
	default:
		panic ("Unhandled VM exit (0x%lx)", exit_reason);
	}
}

/*
 * Run the guest once in the HLT activity state.
 *
 * The selector, access rights and limit of ES/CS/SS/DS/FS/GS and the
 * guest RFLAGS are saved, replaced by fixed values for the duration of
 * the VM entry, and restored afterwards.  The VM exit that ends the
 * halt is then processed like any other exit.
 * NOTE(review): the temporary flat segments are presumably there so the
 * entry in HLT state works whatever mode the guest is in -- confirm.
 */
static void
vt_halt (void)
{
	struct {
		ulong sel;
		ulong acr;
		ulong limit;
	} es, cs, ss, ds, fs, gs;
	ulong rflags;

	/* Save the guest's segment state and RFLAGS. */
	asm_vmread (VMCS_GUEST_ES_SEL, &es.sel);
	asm_vmread (VMCS_GUEST_CS_SEL, &cs.sel);
	asm_vmread (VMCS_GUEST_SS_SEL, &ss.sel);
	asm_vmread (VMCS_GUEST_DS_SEL, &ds.sel);
	asm_vmread (VMCS_GUEST_FS_SEL, &fs.sel);
	asm_vmread (VMCS_GUEST_GS_SEL, &gs.sel);
	asm_vmread (VMCS_GUEST_ES_ACCESS_RIGHTS, &es.acr);
	asm_vmread (VMCS_GUEST_CS_ACCESS_RIGHTS, &cs.acr);
	asm_vmread (VMCS_GUEST_SS_ACCESS_RIGHTS, &ss.acr);
	asm_vmread (VMCS_GUEST_DS_ACCESS_RIGHTS, &ds.acr);
	asm_vmread (VMCS_GUEST_FS_ACCESS_RIGHTS, &fs.acr);
	asm_vmread (VMCS_GUEST_GS_ACCESS_RIGHTS, &gs.acr);
	asm_vmread (VMCS_GUEST_ES_LIMIT, &es.limit);
	asm_vmread (VMCS_GUEST_CS_LIMIT, &cs.limit);
	asm_vmread (VMCS_GUEST_SS_LIMIT, &ss.limit);
	asm_vmread (VMCS_GUEST_DS_LIMIT, &ds.limit);
	asm_vmread (VMCS_GUEST_FS_LIMIT, &fs.limit);
	asm_vmread (VMCS_GUEST_GS_LIMIT, &gs.limit);
	asm_vmread (VMCS_GUEST_RFLAGS, &rflags);
	/* Temporary state: selector 8, 4GiB limit, access rights 0xC093
	   for the data segments and 0xC09B for CS. */
	asm_vmwrite (VMCS_GUEST_ES_SEL, 8);
	asm_vmwrite (VMCS_GUEST_CS_SEL, 8);
	asm_vmwrite (VMCS_GUEST_SS_SEL, 8);
	asm_vmwrite (VMCS_GUEST_DS_SEL, 8);
	asm_vmwrite (VMCS_GUEST_FS_SEL, 8);
	asm_vmwrite (VMCS_GUEST_GS_SEL, 8);
	asm_vmwrite (VMCS_GUEST_ES_ACCESS_RIGHTS, 0xC093);
	asm_vmwrite (VMCS_GUEST_CS_ACCESS_RIGHTS, 0xC09B);
	asm_vmwrite (VMCS_GUEST_SS_ACCESS_RIGHTS, 0xC093);
	asm_vmwrite (VMCS_GUEST_DS_ACCESS_RIGHTS, 0xC093);
	asm_vmwrite (VMCS_GUEST_FS_ACCESS_RIGHTS, 0xC093);
	asm_vmwrite (VMCS_GUEST_GS_ACCESS_RIGHTS, 0xC093);
	asm_vmwrite (VMCS_GUEST_ES_LIMIT, 0xFFFFFFFF);
	asm_vmwrite (VMCS_GUEST_CS_LIMIT, 0xFFFFFFFF);
	asm_vmwrite (VMCS_GUEST_SS_LIMIT, 0xFFFFFFFF);
	asm_vmwrite (VMCS_GUEST_DS_LIMIT, 0xFFFFFFFF);
	asm_vmwrite (VMCS_GUEST_FS_LIMIT, 0xFFFFFFFF);
	asm_vmwrite (VMCS_GUEST_GS_LIMIT, 0xFFFFFFFF);
	/* Keep only the guest's IF; everything else in RFLAGS is reset. */
	asm_vmwrite (VMCS_GUEST_RFLAGS, RFLAGS_ALWAYS1_BIT | RFLAGS_IOPL_0 |
		(rflags & RFLAGS_IF_BIT));
	asm_vmwrite (VMCS_GUEST_ACTIVITY_STATE, VMCS_GUEST_ACTIVITY_STATE_HLT);
	/* Note: vt_event_delivery_setup () is not called on this path. */
	vt_vm_run ();
	if (false) {		/* DEBUG */
		ulong exit_reason;

		asm_vmread (VMCS_EXIT_REASON, &exit_reason);
		if (exit_reason & EXIT_REASON_VMENTRY_FAILURE_BIT)
			panic ("HALT FAILED.");
	}
	/* Back to the active state, then restore what was saved above. */
	asm_vmwrite (VMCS_GUEST_ACTIVITY_STATE,
		     VMCS_GUEST_ACTIVITY_STATE_ACTIVE);
	asm_vmwrite (VMCS_GUEST_ES_SEL, es.sel);
	asm_vmwrite (VMCS_GUEST_CS_SEL, cs.sel);
	asm_vmwrite (VMCS_GUEST_SS_SEL, ss.sel);
	asm_vmwrite (VMCS_GUEST_DS_SEL, ds.sel);
	asm_vmwrite (VMCS_GUEST_FS_SEL, fs.sel);
	asm_vmwrite (VMCS_GUEST_GS_SEL, gs.sel);
	asm_vmwrite (VMCS_GUEST_ES_ACCESS_RIGHTS, es.acr);
	asm_vmwrite (VMCS_GUEST_CS_ACCESS_RIGHTS, cs.acr);
	asm_vmwrite (VMCS_GUEST_SS_ACCESS_RIGHTS, ss.acr);
	asm_vmwrite (VMCS_GUEST_DS_ACCESS_RIGHTS, ds.acr);
	asm_vmwrite (VMCS_GUEST_FS_ACCESS_RIGHTS, fs.acr);
	asm_vmwrite (VMCS_GUEST_GS_ACCESS_RIGHTS, gs.acr);
	asm_vmwrite (VMCS_GUEST_ES_LIMIT, es.limit);
	asm_vmwrite (VMCS_GUEST_CS_LIMIT, cs.limit);
	asm_vmwrite (VMCS_GUEST_SS_LIMIT, ss.limit);
	asm_vmwrite (VMCS_GUEST_DS_LIMIT, ds.limit);
	asm_vmwrite (VMCS_GUEST_FS_LIMIT, fs.limit);
	asm_vmwrite (VMCS_GUEST_GS_LIMIT, gs.limit);
	asm_vmwrite (VMCS_GUEST_RFLAGS, rflags);
	/* Process the VM exit that ended the halt. */
	vt_event_delivery_check ();
	vt_exit_reason ();
}

/*
 * Main loop of a VT-x vcpu; never returns.
 *
 * Each iteration:
 *  1. calls schedule () and apic_handle_wait_for_sipi_state ();
 *  2. if current->halt is set, runs the guest in the HLT state through
 *     vt_halt (), clears the flag and starts over;
 *  3. while the guest is in the mode "switching" state (sw.enable),
 *     tries to emulate one instruction with cpu_interpreter ();
 *  4. otherwise, with interrupts disabled, lets extint_check_interrupt ()
 *     run, programs event injection, enters the guest and handles the
 *     resulting VM exit.
 */
static void
vt_mainloop (void)
{
	vmmerr_t err;
	ulong cr0, acr;
	u64 efer;

	enable_interrupt();
	for (;;) {
		schedule ();
		apic_handle_wait_for_sipi_state();

		/* current->halt is tested and vt_halt () is run with
		   interrupts disabled. */
		disable_interrupt();
		if (current->halt) {
			vt_halt();
			current->halt = false;
			enable_interrupt();
			continue;
		}
		enable_interrupt();

		/* when the state is switching between real mode and
		   protected mode, we try emulation first */
		/* SWITCHING:
		   mov  %cr0,%eax
		   or   $CR0_PE_BIT,%eax
		   mov  %eax,%cr0
		   ljmp $0x8,$1f       | SWITCHING STATE
		   1:                  |
		   mov  $0x10,%eax     | segment registers hold the contents
		   mov  %eax,%ds       | in previous mode. we use interpreter
		   mov  %eax,%es       | to emulate this situation.
		   mov  %eax,%fs       | maximum 32 instructions are emulated
		   mov  %eax,%gs       | because the interpreter is too slow.
		   mov  %eax,%ss       |
		   ...
		 */
		if (current->u.vt.vr.sw.enable) {
			current->u.vt.vr.sw.num++;
			if (current->u.vt.vr.sw.num >= 32) {
				/* consider emulation is not needed after
				   32 instructions are executed */
				current->u.vt.vr.sw.enable = 0;
				continue;
			}
			/* Paging on, EFER.LME set and CS.L set: the guest
			   has reached long mode, so stop emulating. */
			vt_read_control_reg (CONTROL_REG_CR0, &cr0);
			if (cr0 & CR0_PG_BIT) {
				vt_read_msr (MSR_IA32_EFER, &efer);
				if (efer & MSR_IA32_EFER_LME_BIT) {
					vt_read_sreg_acr (SREG_CS, &acr);
					if (acr & ACCESS_RIGHTS_L_BIT) {
						/* long mode */
						current->u.vt.vr.sw.enable = 0;
						continue;
					}
				}
			}
			err = cpu_interpreter ();
			if (err == VMMERR_SUCCESS) /* emulation successful */
				continue;
			else if (err == VMMERR_UNSUPPORTED_OPCODE ||
				 err == VMMERR_SW)
				; /* unsupported/run as it is */
			else	/* failed */
				panic ("vt_mainloop ERR %d", err);
			/* continue when the instruction is not supported
			   or should be executed as it is.
			   (sw.enable may be changed after cpu_interpreter())
			*/
		}
		/* Interrupts stay disabled from here until the VM exit has
		   been handled. */
		disable_interrupt();
		extint_check_interrupt ();
		vt_event_delivery_setup ();
		if (current->u.vt.vr.sw.enable) {
			/* when the state is switching, do single step */
			vt_vm_run_with_tf ();
		} else {
			/* not switching */
			vt_vm_run ();
		}
		vt_event_delivery_check ();
		vt_exit_reason ();
		enable_interrupt();
	}
}

/*
 * Format the VM-exit statistics counters as human-readable text.
 *
 * Returns a pointer to a static buffer that is overwritten by every
 * call.  "Others" is the number of hardware exceptions minus the number
 * of page faults.
 */
static char *
vt_status (void)
{
	static char buf[1024];

	snprintf (buf, sizeof buf,
		  "Interrupts: %u\n"
		  "Hardware exceptions: %u\n"
		  " Page fault: %u\n"
		  " Others: %u\n"
		  "Software exception: %u\n"
		  "Watched I/O: %u\n"
		  "Halt: %u\n",
		  stat_intcnt,
		  stat_hwexcnt,
		  stat_pfcnt,
		  stat_hwexcnt - stat_pfcnt,
		  stat_swexcnt,
		  stat_iocnt,
		  stat_hltcnt);
	return buf;
}

/* Init function: register vt_status () as a status callback. */
static void
vt_register_status_callback (void)
{
	register_status_callback (vt_status);
}

/*
 * Exported entry point: run the current vcpu by entering
 * vt_mainloop (), whose loop has no exit.
 */
void
vt_run_vm (void)
{
	vt_mainloop ();
}

/* Register vt_register_status_callback () as an init function under the
   "global4" tag (when that tag runs is defined by the initfunc
   mechanism, not visible in this file). */
INITFUNC ("global4", vt_register_status_callback);
