深入理解libaio 接口

也许大家都会使用libaio接口,但它和内核是如何交互的呢?内核的机制又是怎样的呢?下面就一起跟踪下主要的流程。

登堂入室:系统调用

依赖的头文件

#include <errno.h>
#include <sys/syscall.h>
#include <unistd.h>

主要的函数:

/* Actual syscalls */
/*
 * io_setup - issue the raw io_setup system call.
 *
 * Forwards maxevents and ctxp to syscall() unchanged and returns the
 * result of syscall(), narrowed to int.
 */
int io_setup(int maxevents, io_context_t *ctxp)
{
    long rc = syscall(__NR_io_setup, maxevents, ctxp);

    return (int)rc;
}

/*
 * io_destroy - issue the raw io_destroy system call.
 *
 * Forwards ctx to syscall() unchanged and returns the result of
 * syscall(), narrowed to int.
 */
int io_destroy(io_context_t ctx)
{
    long rc = syscall(__NR_io_destroy, ctx);

    return (int)rc;
}

/*
 * io_submit - issue the raw io_submit system call.
 *
 * Forwards ctx, the request count nr and the iocb pointer array ios to
 * syscall() unchanged and returns the result of syscall(), narrowed to int.
 */
int io_submit(io_context_t ctx, long nr, struct iocb *ios[])
{
    long rc = syscall(__NR_io_submit, ctx, nr, ios);

    return (int)rc;
}

/*
 * io_cancel - issue the raw io_cancel system call.
 *
 * Forwards ctx, iocb and evt to syscall() unchanged and returns the
 * result of syscall(), narrowed to int.
 */
int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *evt)
{
    long rc = syscall(__NR_io_cancel, ctx, iocb, evt);

    return (int)rc;
}

(待跟踪问题:io_getevents() 又是怎样实现的呢?)

系统调用的实现

那么上面的系统调用号又具体对应到什么代码呢?让我们先看看系统调用相关的背景知识:

http://lkml.kernel.org/r/20110529191055.GC9835%40elte.hu

The x86 architecture has quite a few different ways to jump into
kernel code. Most of these entry points are registered in
arch/x86/kernel/traps.c and implemented in arch/x86/entry/entry_64.S
for 64-bit, arch/x86/entry/entry_32.S for 32-bit and finally
arch/x86/entry/entry_64_compat.S which implements the 32-bit compatibility
syscall entry points and thus provides for 32-bit processes the
ability to execute syscalls when running on 64-bit kernels.

The IDT vector assignments are listed in arch/x86/include/asm/irq_vectors.h.

Some of these entries are:

  • system_call: syscall instruction from 64-bit code.

  • entry_INT80_compat: int 0x80 from 32-bit or 64-bit code; compat syscall
    either way.

  • entry_INT80_compat, ia32_sysenter: syscall and sysenter from 32-bit
    code

  • interrupt: An array of entries. Every IDT vector that doesn't
    explicitly point somewhere else gets set to the corresponding
    value in interrupts. These point to a whole array of
    magically-generated functions that make their way to do_IRQ with
    the interrupt number as a parameter.

  • APIC interrupts: Various special-purpose interrupts for things
    like TLB shootdown.

  • Architecturally-defined exceptions like divide_error.

    接着看看异步IO系统调用涉及到的一些系统调用编号,参考arch/ia64/include/uapi/asm/unistd.h文件:

    /* System call numbers for the AIO and epoll entry points (ia64 numbering). */
    #define __NR_io_setup                   1238
    #define __NR_io_destroy                 1239
    #define __NR_io_getevents               1240
    #define __NR_io_submit                  1241
    #define __NR_io_cancel                  1242
    #define __NR_epoll_create               1243
    #define __NR_epoll_ctl                  1244
    #define __NR_epoll_wait                 1245
    

那么这个系统调用编号又是怎样和功能代码相关联的呢?

通用入口

/*
 * entry_SYSCALL_64: entry stub, shown here up to the call into
 * do_syscall_64.  The visible part saves the incoming stack pointer,
 * switches %rsp to the per-CPU cpu_current_top_of_stack value, pushes a
 * struct pt_regs frame and then calls do_syscall_64.
 *
 * NOTE: this is an excerpt; the code that follows TRACE_IRQS_IRETQ
 * (the return path) is not shown.
 */
ENTRY(entry_SYSCALL_64)
        UNWIND_HINT_EMPTY
        /*
         * Interrupts are off on entry.
         * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
         * it is too small to ever cause noticeable irq latency.
         */

        swapgs
        /*
         * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
         * is not required to switch CR3.
         */
        /*
         * Stash the incoming %rsp in the per-CPU rsp_scratch slot, then load
         * %rsp from cpu_current_top_of_stack.
         */
        movq    %rsp, PER_CPU_VAR(rsp_scratch)
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp

        /* Construct struct pt_regs on stack */
        pushq   $__USER_DS                      /* pt_regs->ss */
        pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
        pushq   %r11                            /* pt_regs->flags */
        pushq   $__USER_CS                      /* pt_regs->cs */
        pushq   %rcx                            /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
        pushq   %rax                            /* pt_regs->orig_ax */

        PUSH_AND_CLEAR_REGS rax=$-ENOSYS

        TRACE_IRQS_OFF

        /* IRQs are off. */
        /*
         * Set up the two arguments of do_syscall_64(nr, regs): %rdi gets
         * %rax (the value pushed above as orig_ax) and %rsi gets the current
         * stack pointer, i.e. the address of the frame just built.
         */
        movq    %rax, %rdi
        movq    %rsp, %rsi
        call    do_syscall_64           /* returns with IRQs disabled */

        TRACE_IRQS_IRETQ                /* we're about to change IF */

基于系统调用表找到处理函数

#ifdef CONFIG_X86_64
/*
 * do_syscall_64 - dispatch one system call through sys_call_table.
 * @nr:   system call number; replaced by the return value of
 *        syscall_trace_enter() when any _TIF_WORK_SYSCALL_ENTRY flag is set.
 * @regs: register frame; handed to the handler, and regs->ax receives the
 *        handler's return value.
 *
 * If the masked number is not below NR_syscalls, no handler is called and
 * this function does not write regs->ax.  syscall_return_slowpath() runs
 * in either case.
 */
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
        struct thread_info *ti;

        enter_from_user_mode();
        local_irq_enable();
        ti = current_thread_info();
        /*
         * Entry-time work is pending for this thread: let
         * syscall_trace_enter() supply the number that gets dispatched.
         */
        if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
                nr = syscall_trace_enter(regs);

        /*
         * NB: Native and x32 syscalls are dispatched from the same
         * table.  The only functional difference is the x32 bit in
         * regs->orig_ax, which changes the behavior of some syscalls.
         */
        nr &= __SYSCALL_MASK;
        if (likely(nr < NR_syscalls)) {
                /*
                 * NOTE(review): re-sanitizes the already bounds-checked
                 * index; presumably a guard against speculative
                 * out-of-bounds table reads -- confirm against
                 * array_index_nospec().
                 */
                nr = array_index_nospec(nr, NR_syscalls);
                /* Handlers take the whole register frame as their only argument. */
                regs->ax = sys_call_table[nr](regs);
        }

        syscall_return_slowpath(regs);
}
#endif

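sys_call_table 在哪?在 x86_64 上,它定义在 arch/x86/entry/syscall_64.c,表项由 arch/x86/entry/syscalls/syscall_64.tbl 生成;使用通用系统调用表的体系结构,其编号与处理函数的对应关系则见 include/uapi/asm-generic/unistd.h: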

/*
 * AIO entries of the generic syscall list.  Each #define assigns a number
 * and the macro line that follows pairs that number with its handler.
 * __SC_COMP additionally names a compat_ variant as its third argument.
 * NOTE(review): the macro definitions are not shown here; presumably the
 * third argument is the handler used for compat (32-bit) callers -- confirm
 * in the header.
 */
#define __NR_io_setup 0
__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup)
#define __NR_io_destroy 1
__SYSCALL(__NR_io_destroy, sys_io_destroy)
#define __NR_io_submit 2
__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit)
#define __NR_io_cancel 3
__SYSCALL(__NR_io_cancel, sys_io_cancel)
#define __NR_io_getevents 4
__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents)

调用真正干活的函数:fs/aio.c: sys_io_submit

/* sys_io_submit:
 *      Queue the nr iocbs pointed to by iocbpp for processing.  Returns
 *      the number of iocbs queued.  May return -EINVAL if the aio_context
 *      specified by ctx_id is invalid, if nr is < 0, if the iocb at
 *      *iocbpp[0] is not properly initialized, if the operation specified
 *      is invalid for the file descriptor in the iocb.  May fail with
 *      -EFAULT if any of the data structures point to invalid data.  May
 *      fail with -EBADF if the file descriptor specified in the first
 *      iocb is invalid.  May fail with -EAGAIN if insufficient resources
 *      are available to queue any iocbs.  Will return 0 if nr is 0.  Will
 *      fail with -ENOSYS if not implemented.
 *
 *      NOTE: this listing is an excerpt; the end of the function (after
 *      the submission loop) is not shown.
 */
SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
                struct iocb __user * __user *, iocbpp)
{
        struct kioctx *ctx;
        long ret = 0;
        int i = 0;
        struct blk_plug plug;

        /* A negative request count is rejected before any lookup. */
        if (unlikely(nr < 0))
                return -EINVAL;

        /* Resolve the user-supplied context id to its kioctx. */
        ctx = lookup_ioctx(ctx_id);
        if (unlikely(!ctx)) {
                pr_debug("EINVAL: invalid context id\n");
                return -EINVAL;
        }

        /* Never try to queue more requests than ctx->nr_events. */
        if (nr > ctx->nr_events)
                nr = ctx->nr_events;

        /*
         * NOTE(review): plugging presumably batches the block-layer
         * submissions made by the loop below; the matching unplug is
         * outside this excerpt -- confirm.
         */
        blk_start_plug(&plug);
        for (i = 0; i < nr; i++) {
                struct iocb __user *user_iocb;

                /* Fetch the i-th iocb pointer from the user-space array. */
                if (unlikely(get_user(user_iocb, iocbpp + i))) {
                        ret = -EFAULT;
                        break;
                }

                /*
                 * Stop at the first iocb that fails to submit; at that
                 * point i equals the number of iocbs already accepted.
                 */
                ret = io_submit_one(ctx, user_iocb, false);
                if (ret)
                        break;
        }

参考文档:

Documentation/process/adding-syscalls.rst

上一篇:深入理解Linux系统调用


下一篇:深入理解系统调用