gvisor vcpu

 

How are Sentry system calls trapped into host kernel?
From How gvisor trap to syscall handler in kvm platform, “Note that the SYSCALL instruction (Wenbo: in sentry guest ring 0) works just fine from ring 0, it just doesn’t perform a ring switch since you’re already in ring 0 (guest). Yup, the syscall handler executes a HLT, which is the trigger to switch back to host mode. To see the host/guest transition internals take a look at bluepill() (switch to guest mode) and redpill() (switch to host mode) in platform/kvm. The control flow is bit hard to follow. At a high level it goes: bluepill() -> execute CLI (allowed if already in guest mode, or …) -> SIGILL signal handler -> bluepillHandler() -> KVM_RUN with RIP @ CLI instruction -> execute CLI in guest mode, bluepill() returns”

 

/dev/kvm

// OpenDevice opens the KVM device at /dev/kvm and returns the File.
func OpenDevice() (*os.File, error) {
f, err := os.OpenFile("/dev/kvm", syscall.O_RDWR, 0)
if err != nil {
return nil, fmt.Errorf("error opening /dev/kvm: %v", err)
}
return f, nil
}


type constructor struct{}

func (*constructor) New(f *os.File) (platform.Platform, error) {
return New(f)
}

func (*constructor) OpenDevice() (*os.File, error) {
return OpenDevice()
}

 

 

创建vcpu

int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
struct kvm_sregs sregs;
ioctl(vcpu_fd, KVM_GET_SREGS, &sregs);
// Initialize selector and base with zeros
sregs.cs.selector = sregs.cs.base = sregs.ss.selector = sregs.ss.base = sregs.ds.selector = sregs.ds.base = sregs.es.selector = sregs.es.base = sregs.fs.selector = sregs.fs.base = sregs.gs.selector = 0;
// Save special registers
ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);

// Initialize and save normal registers
struct kvm_regs regs;
regs.rflags = 2; // bit 1 must always be set to 1 in EFLAGS and RFLAGS
regs.rip = 0; // our code runs from address 0
ioctl(vcpu_fd, KVM_SET_REGS, &regs);
int runsz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
struct kvm_run *run = (struct kvm_run *) mmap(NULL, runsz, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_fd, 0);

for (;;) {
ioctl(vcpu_fd, KVM_RUN, 0);
switch (run->exit_reason) {
case KVM_EXIT_IO:
printf("IO port: %x, data: %x\n", run->io.port, *(int *)((char *)(run) + run->io.data_offset));
break;
case KVM_EXIT_SHUTDOWN:
return;
}
}

 

gvisor

// Precondition: mu must be held.
func (m *machine) newVCPU() *vCPU {
// Create the vCPU.
id := int(atomic.AddUint32(&m.nextID, 1) - 1)
fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
}

func bluepillHandler(context unsafe.Pointer) {
// Sanitize the registers; interrupts must always be disabled.
c := bluepillArchEnter(bluepillArchContext(context))

// Mark this as guest mode.
switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) {
case vCPUUser: // Expected case.
case vCPUUser | vCPUWaiter:
c.notify()
default:
throw("invalid state")
}

for {
_, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no.
switch errno {
}
}
}

 

上一篇:databinding双向绑定原理,6年菜鸟开发面试字节跳动Android研发岗,你掌握了多少?


下一篇:谷歌新作gVisor:VM容器融合技术已经到来