How are Sentry system calls trapped into the host kernel?
From the thread "How gvisor trap to syscall handler in kvm platform": “Note that the SYSCALL instruction (Wenbo: in sentry guest ring 0) works just fine from ring 0, it just doesn’t perform a ring switch since you’re already in ring 0 (guest). Yup, the syscall handler executes a HLT, which is the trigger to switch back to host mode. To see the host/guest transition internals take a look at bluepill() (switch to guest mode) and redpill() (switch to host mode) in platform/kvm. The control flow is a bit hard to follow. At a high level it goes: bluepill() -> execute CLI (allowed if already in guest mode, or …) -> SIGILL signal handler -> bluepillHandler() -> KVM_RUN with RIP @ CLI instruction -> execute CLI in guest mode, bluepill() returns”
/dev/kvm
// OpenDevice opens the KVM device at /dev/kvm and returns the File. func OpenDevice() (*os.File, error) { f, err := os.OpenFile("/dev/kvm", syscall.O_RDWR, 0) if err != nil { return nil, fmt.Errorf("error opening /dev/kvm: %v", err) } return f, nil } type constructor struct{} func (*constructor) New(f *os.File) (platform.Platform, error) { return New(f) } func (*constructor) OpenDevice() (*os.File, error) { return OpenDevice() }
创建vcpu
int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); struct kvm_sregs sregs; ioctl(vcpu_fd, KVM_GET_SREGS, &sregs); // Initialize selector and base with zeros sregs.cs.selector = sregs.cs.base = sregs.ss.selector = sregs.ss.base = sregs.ds.selector = sregs.ds.base = sregs.es.selector = sregs.es.base = sregs.fs.selector = sregs.fs.base = sregs.gs.selector = 0; // Save special registers ioctl(vcpu_fd, KVM_SET_SREGS, &sregs); // Initialize and save normal registers struct kvm_regs regs; regs.rflags = 2; // bit 1 must always be set to 1 in EFLAGS and RFLAGS regs.rip = 0; // our code runs from address 0 ioctl(vcpu_fd, KVM_SET_REGS, ®s); int runsz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0); struct kvm_run *run = (struct kvm_run *) mmap(NULL, runsz, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_fd, 0); for (;;) { ioctl(vcpu_fd, KVM_RUN, 0); switch (run->exit_reason) { case KVM_EXIT_IO: printf("IO port: %x, data: %x\n", run->io.port, *(int *)((char *)(run) + run->io.data_offset)); break; case KVM_EXIT_SHUTDOWN: return; } }
gvisor
// Precondition: mu must be held. func (m *machine) newVCPU() *vCPU { // Create the vCPU. id := int(atomic.AddUint32(&m.nextID, 1) - 1) fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) } func bluepillHandler(context unsafe.Pointer) { // Sanitize the registers; interrupts must always be disabled. c := bluepillArchEnter(bluepillArchContext(context)) // Mark this as guest mode. switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) { case vCPUUser: // Expected case. case vCPUUser | vCPUWaiter: c.notify() default: throw("invalid state") } for { _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) // escapes: no. switch errno { } } }