也许大家都会使用libaio接口,但它和内核是如何交互的呢?内核的机制又是怎样的呢?下面就一起跟踪下主要的流程。
登堂入室:系统调用
依赖的头文件
#include <errno.h>
#include <sys/syscall.h>
#include <unistd.h>
主要的函数:
/* Actual syscalls */
/*
 * io_setup - issue the raw io_setup system call.
 * @maxevents: forwarded unchanged as the first syscall argument.
 * @ctxp:      forwarded unchanged as the second syscall argument.
 *
 * Returns the value produced by syscall(), narrowed to int.
 */
int io_setup(int maxevents, io_context_t *ctxp)
{
	long rc = syscall(__NR_io_setup, maxevents, ctxp);

	return (int)rc;
}
/*
 * io_destroy - issue the raw io_destroy system call.
 * @ctx: forwarded unchanged as the only syscall argument.
 *
 * Returns the value produced by syscall(), narrowed to int.
 */
int io_destroy(io_context_t ctx)
{
	long rc = syscall(__NR_io_destroy, ctx);

	return (int)rc;
}
/*
 * io_submit - issue the raw io_submit system call.
 * @ctx: forwarded unchanged as the first syscall argument.
 * @nr:  forwarded unchanged as the second syscall argument.
 * @ios: forwarded unchanged as the third syscall argument.
 *
 * Returns the value produced by syscall(), narrowed to int.
 */
int io_submit(io_context_t ctx, long nr, struct iocb *ios[])
{
	long rc = syscall(__NR_io_submit, ctx, nr, ios);

	return (int)rc;
}
/*
 * io_cancel - issue the raw io_cancel system call.
 * @ctx:  forwarded unchanged as the first syscall argument.
 * @iocb: forwarded unchanged as the second syscall argument.
 * @evt:  forwarded unchanged as the third syscall argument.
 *
 * Returns the value produced by syscall(), narrowed to int.
 */
int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *evt)
{
	long rc = syscall(__NR_io_cancel, ctx, iocb, evt);

	return (int)rc;
}
(待跟踪问题:io_getevents() 又是怎样实现的呢?)
系统调用的实现
那么上面的系统调用号又具体对应到什么代码呢?让我们先看看系统调用相关的背景知识:
http://lkml.kernel.org/r/<20110529191055.GC9835%40elte.hu>
The x86 architecture has quite a few different ways to jump into
kernel code. Most of these entry points are registered in
arch/x86/kernel/traps.c and implemented in arch/x86/entry/entry_64.S
for 64-bit, arch/x86/entry/entry_32.S for 32-bit and finally
arch/x86/entry/entry_64_compat.S which implements the 32-bit compatibility
syscall entry points and thus provides for 32-bit processes the
ability to execute syscalls when running on 64-bit kernels.
The IDT vector assignments are listed in arch/x86/include/asm/irq_vectors.h.
Some of these entries are:
- system_call: syscall instruction from 64-bit code.
- entry_INT80_compat: int 0x80 from 32-bit or 64-bit code; compat syscall
  either way.
- entry_INT80_compat, ia32_sysenter: syscall and sysenter from 32-bit
  code
- interrupt: An array of entries. Every IDT vector that doesn't
  explicitly point somewhere else gets set to the corresponding
  value in interrupts. These point to a whole array of
  magically-generated functions that make their way to do_IRQ with
  the interrupt number as a parameter.
- APIC interrupts: Various special-purpose interrupts for things
  like TLB shootdown.
- Architecturally-defined exceptions like divide_error.
接着看看异步IO系统调用涉及到的一些系统调用编号,参考arch/ia64/include/uapi/asm/unistd.h文件(注意:这里列出的是 ia64 架构的编号;x86_64 的编号定义在 arch/x86/entry/syscalls/syscall_64.tbl 中,例如 io_setup 为 206):
/* Syscall numbers for the AIO and epoll calls, as listed in arch/ia64/include/uapi/asm/unistd.h. */
#define __NR_io_setup 1238
#define __NR_io_destroy 1239
#define __NR_io_getevents 1240
#define __NR_io_submit 1241
#define __NR_io_cancel 1242
#define __NR_epoll_create 1243
#define __NR_epoll_ctl 1244
#define __NR_epoll_wait 1245
那么这个系统调用编号又是怎样和功能代码关联起来的呢?
通用入口
/*
 * entry_SYSCALL_64: builds a struct pt_regs frame on the stack and then
 * calls do_syscall_64() with %rax in %rdi and the frame address in %rsi.
 * This excerpt stops shortly after that call returns.
 */
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
swapgs
/*
* This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
* is not required to switch CR3.
*/
/*
 * Stash the incoming %rsp in the per-CPU rsp_scratch slot, then load %rsp
 * from the per-CPU cpu_current_top_of_stack value.
 */
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq %rax /* pt_regs->orig_ax */
/* NOTE(review): macro body not shown; presumably saves the remaining registers with -ENOSYS in the ax slot — confirm. */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
TRACE_IRQS_OFF
/* IRQs are off. */
/* Set up the two arguments: %rdi = %rax, %rsi = %rsp (the frame built above). */
movq %rax, %rdi
movq %rsp, %rsi
call do_syscall_64 /* returns with IRQs disabled */
TRACE_IRQS_IRETQ /* we're about to change IF */
基于系统调用表找到处理函数
#ifdef CONFIG_X86_64
/*
 * do_syscall_64 - dispatch one 64-bit system call through sys_call_table.
 * @nr:   system call number to dispatch.
 * @regs: register frame; passed to the handler, and the handler's return
 *        value is stored in regs->ax.
 *
 * If the masked @nr is not below NR_syscalls, no handler is called and
 * this function leaves regs->ax unmodified.
 */
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
struct thread_info *ti;
enter_from_user_mode();
local_irq_enable();
ti = current_thread_info();
/* When any syscall-entry work flag is set, the number to dispatch is taken from syscall_trace_enter(). */
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
nr = syscall_trace_enter(regs);
/*
* NB: Native and x32 syscalls are dispatched from the same
* table. The only functional difference is the x32 bit in
* regs->orig_ax, which changes the behavior of some syscalls.
*/
nr &= __SYSCALL_MASK;
if (likely(nr < NR_syscalls)) {
/*
 * Re-derive the index through array_index_nospec() before the table
 * lookup (presumably hardening against speculative out-of-bounds
 * access — confirm against the helper's definition).
 */
nr = array_index_nospec(nr, NR_syscalls);
/* Call the handler found at index nr and record its result in regs->ax. */
regs->ax = sys_call_table[nr](regs);
}
syscall_return_slowpath(regs);
}
#endif
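sys_call_table 在哪?通用的系统调用表定义在 include/uapi/asm-generic/unistd.h(x86_64 的表则由 arch/x86/entry/syscalls/syscall_64.tbl 生成,并在 arch/x86/entry/syscall_64.c 中定义):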
/*
 * Asynchronous-I/O entries of the generic syscall table. Each #define
 * assigns a syscall number; the macro line that follows it names the
 * sys_* handler for that number. The __SC_COMP form additionally names a
 * compat_sys_* handler (the macro definitions are not shown here —
 * presumably selected for compat tasks; confirm in the header).
 */
#define __NR_io_setup 0
__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup)
#define __NR_io_destroy 1
__SYSCALL(__NR_io_destroy, sys_io_destroy)
#define __NR_io_submit 2
__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit)
#define __NR_io_cancel 3
__SYSCALL(__NR_io_cancel, sys_io_cancel)
#define __NR_io_getevents 4
__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents)
调用真正干活的函数:fs/aio.c: sys_io_submit
/* sys_io_submit:
* Queue the nr iocbs pointed to by iocbpp for processing. Returns
* the number of iocbs queued. May return -EINVAL if the aio_context
* specified by ctx_id is invalid, if nr is < 0, if the iocb at
* *iocbpp[0] is not properly initialized, if the operation specified
* is invalid for the file descriptor in the iocb. May fail with
* -EFAULT if any of the data structures point to invalid data. May
* fail with -EBADF if the file descriptor specified in the first
* iocb is invalid. May fail with -EAGAIN if insufficient resources
* are available to queue any iocbs. Will return 0 if nr is 0. Will
* fail with -ENOSYS if not implemented.
*/
SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
struct iocb __user * __user *, iocbpp)
{
struct kioctx *ctx;
long ret = 0;
int i = 0;
struct blk_plug plug;
if (unlikely(nr < 0))
return -EINVAL;
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx)) {
pr_debug("EINVAL: invalid context id\n");
return -EINVAL;
}
if (nr > ctx->nr_events)
nr = ctx->nr_events;
blk_start_plug(&plug);
for (i = 0; i < nr; i++) {
struct iocb __user *user_iocb;
if (unlikely(get_user(user_iocb, iocbpp + i))) {
ret = -EFAULT;
break;
}
ret = io_submit_one(ctx, user_iocb, false);
if (ret)
break;
}
参考文档:
Documentation/process/adding-syscalls.rst