深入理解Linux系统调用

2023-01-05 16:24:06

深入理解Linux系统调用

实验要求

找一个系统调用，其系统调用号与学号最后2位相同；
通过汇编指令触发该系统调用；
通过gdb跟踪该系统调用的内核处理过程；
重点阅读分析系统调用入口的保存现场、恢复现场和系统调用返回，以及重点关注系统调用过程中内核堆栈状态的变化。

环境准备

下载Linux内核源码并配置QEMU虚拟环境

配置内核选项，并编译

make defconfig #Default configuration is based on 'x86_64_defconfig'
make menuconfig
#打开debug相关选项
Kernel hacking --->
    Compile-time checks and compiler options --->
        [*] Compile the kernel with debug info
        [*] Provide GDB scripts for kernel debugging
    [*] Kernel debugging
#关闭KASLR，否则会导致打断点失败
Processor type and features ---->
    [] Randomize the address of the kernel image (KASLR)
# 配置完成后进行编译    
make -j$(nproc) # nproc gives the number of CPU cores/threads available
# 测试一下内核能不能正常加载运行，因为没有文件系统，最终会kernel panic
qemu-system-x86_64 -kernel arch/x86/boot/bzImage

制作根文件系统

# 下载busybox
axel -n 20 https://busybox.net/downloads/busybox-1.31.1.tar.bz2
tar -jxvf busybox-1.31.1.tar.bz2
cd busybox-1.31.1
# 设置静态链接库编译
make menuconfig
Settings --->
    [*] Build static binary (no shared libs)
# 编译并安装
make -j$(nproc) && make install
# 创建相关目录及文件
mkdir rootfs
cd rootfs
cp ../busybox-1.31.1/_install/* ./ -rf
mkdir dev proc sys home
sudo cp -a /dev/{null,console,tty,tty1,tty2,tty3,tty4} dev/

准备init脚本，保存为rootfs目录下的init文件

#!/bin/sh
# init script for the initramfs root filesystem: mounts the proc and sysfs
# pseudo filesystems, prints a two-line banner, then starts a shell in "home".

# Mount procfs on /proc and sysfs on /sys (both directories are created
# when the rootfs tree is prepared).
mount -t proc none /proc
mount -t sysfs none /sys
# Banner marking that the init script has been reached.
echo "-------------------"
echo "--------------------"
# Relative path: resolved against the directory init is started in.
cd home
# Hand control to a shell; this script does not continue until it exits.
/bin/sh

给init脚本添加可执行权限

chmod +x init

打包成内存根文件系统镜像

find . -print0 | cpio --null -ov --format=newc | gzip -9 > ../rootfs.cpio.gz

启动测试是否执行成功

qemu-system-x86_64 -kernel linux-5.4.34/arch/x86/boot/bzImage -initrd rootfs.cpio.gz

确定系统调用

在arch/x86/entry/syscalls/syscall_64.tbl中，查找尾号为73的系统调用

73	common	flock			__x64_sys_flock
173	common	ioperm			__x64_sys_ioperm
273	64	set_robust_list		__x64_sys_set_robust_list

这里选取73号系统调用flock进行分析

(注：关于flock系统调用的使用说明)

通过汇编触发系统调用

先编译一个简单的C语言文件执行系统调用

#include <stdio.h>
#include <sys/file.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Open (creating it if necessary) test.txt in the current directory, take a
 * shared flock() on it, and print both the descriptor and flock()'s result.
 *
 * Returns 0 on success, 1 if the file cannot be opened.
 */
int main(void){
    int fd;
    /*
     * O_CREAT requires the third (mode) argument; without it the permission
     * bits of a newly created file are taken from indeterminate data.
     */
    fd=open("test.txt",O_WRONLY|O_CREAT,0644);
    printf("The fd value is : %d \n", fd);
    if (fd < 0) {
        /* No valid descriptor: flock() could only fail with EBADF. */
        perror("open");
        return 1;
    }
    /* LOCK_SH has the value 1 on Linux, i.e. the same argument as before. */
    int ret = flock(fd,LOCK_SH);
    printf("The return value is : %d \n", ret);
    close(fd);
    return 0;
}

使用以下命令进行静态编译，并反汇编：

gcc -o flock flock.c -static
objdump -S flock > flock64.S

查看main的汇编代码

可以看到，第一个参数通过寄存器eax移动到edi中，第二个参数0x1移动到esi中，然后执行系统调用flock.

查看flock的汇编代码

通过查看flock系统调用的汇编代码，我们可以自己写出相应的汇编代码：

#include <stdio.h>
#include <sys/file.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Invoke flock (x86-64 system call number 73 = 0x49) directly with the
 * `syscall` instruction instead of going through the libc wrapper, and print
 * the raw value the kernel returns in rax (0 on success, -errno on failure).
 *
 * Returns 0 after the system call was issued, 1 if test.txt cannot be opened.
 */
int main(void){
    int fd;
    long raw;   /* rax is 64 bits wide, so receive it in a 64-bit object */
    int ret;
    /* O_CREAT requires the mode argument; omitting it yields indeterminate permissions. */
    fd=open("test.txt",O_WRONLY|O_CREAT,0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    asm volatile(
        "movl %2, %%esi\n\t"      /* esi: 2nd argument, the lock operation (LOCK_SH = 1) */
        "movl %1, %%edi\n\t"      /* edi: 1st argument, the descriptor actually returned by open() */
        "movl $0x49, %%eax\n\t"   /* eax: system call number 73 (flock) */
        "syscall\n\t"
        "movq %%rax, %0\n\t"      /* save the return value */
        : "=r"(raw)
        : "r"(fd), "i"(LOCK_SH)
        /*
         * rax/rdi/rsi are overwritten by the moves above; the syscall
         * instruction itself destroys rcx (return rip) and r11 (rflags).
         */
        : "rax", "rdi", "rsi", "rcx", "r11", "cc", "memory"
    );
    /* flock's result always fits in an int. */
    ret = (int)raw;
    printf("The return value is : %d \n", ret);
    close(fd);
    return 0;
}

输出结果如下(注意需要静态编译)：

返回值为0，说明系统调用成功。

gdb追踪系统调用

将刚才编译的文件复制到rootfs/syscall目录下，重新生成根文件系统

find . -print0 | cpio --null -ov --format=newc | gzip -9 > ../rootfs.cpio.gz

启动qemu。为了可以使用gdb server进行调试，这里加了两个参数：一个是-s，在TCP 1234端口上创建一个gdb-server（可以另外打开一个窗口，用gdb把带有符号表的内核镜像vmlinux加载进来，然后连接gdb server，设置断点跟踪内核；若不想使用1234端口，可以使用-gdb tcp::xxxx来替代-s选项）；另一个是-S，代表启动时暂停虚拟机，等待gdb执行continue指令。

qemu-system-x86_64 -kernel linux-5.4.34/arch/x86/boot/bzImage -initrd rootfs.cpio.gz -S -s -nographic -append "console=ttyS0"

打开目录linux-5.4.34，启动gdb

gdb vmlinux

建立连接：

target remote:1234

在linux-5.4.34/arch/x86/entry/syscalls/syscall_64.tbl中找到对应函数名

73	common	flock			__x64_sys_flock

在gdb中设置断点

b __x64_sys_flock

查看系统调用栈

可以看到系统调用的入口在entry_SYSCALL_64()

找到该处的代码

ENTRY(entry_SYSCALL_64)
	UNWIND_HINT_EMPTY
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */

	swapgs
	/* tss.sp2 is scratch space. */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */

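swapgs指令将当前的GS基址与MSR（IA32_KERNEL_GS_BASE）中保存的内核GS基址交换，使内核可以通过GS访问per-CPU数据（即代码中的PER_CPU_VAR）。随后把用户态的rsp暂存到tss.sp2，切换到内核页表和内核栈，并依次压入ss、用户态sp、flags（由syscall指令保存在r11中）、cs和ip（由syscall指令保存在rcx中），即在内核栈上构造pt_regs，完成保存现场的第一步。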
紧接着，调用了do_syscall_64，代码如下

GLOBAL(entry_SYSCALL_64_after_hwframe)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	TRACE_IRQS_OFF

	/* IRQs are off. */
	movq	%rax, %rdi
	movq	%rsp, %rsi
	call	do_syscall_64		/* returns with IRQs disabled */

先将rax中的值保存在了栈中，然后通过rdi,rsi进行传参，其中rdi传递的是系统调用号，rsi传递的是pt_regs
函数do_syscall_64()的代码如下

#ifdef CONFIG_X86_64
/*
 * C-level dispatcher for 64-bit system calls.
 *
 * @nr:   system call number; the entry assembly passes it in rdi (copied
 *        from rax).
 * @regs: register frame the entry assembly built on the kernel stack
 *        (passed in rsi, which holds rsp at the time of the call).
 *
 * Looks up the handler for @nr in sys_call_table (or x32_sys_call_table when
 * CONFIG_X86_X32_ABI is set and the x32 bit is present), calls it with @regs
 * and stores its result in regs->ax. If @nr matches neither table, no handler
 * runs and regs->ax is left untouched.
 */
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	struct thread_info *ti;

	enter_from_user_mode();
	/* The entry assembly calls this function with IRQs off. */
	local_irq_enable();
	ti = current_thread_info();
	/*
	 * If any syscall-entry work flag is set, run the trace hook; its return
	 * value replaces the system call number used below.
	 */
	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
		nr = syscall_trace_enter(regs);

	if (likely(nr < NR_syscalls)) {
		/*
		 * NOTE(review): presumably bounds nr under speculative execution
		 * as well (per the helper's name) - confirm against its definition.
		 */
		nr = array_index_nospec(nr, NR_syscalls);
		/* Handlers take the whole register frame and return the result. */
		regs->ax = sys_call_table[nr](regs);
#ifdef CONFIG_X86_X32_ABI
	} else if (likely((nr & __X32_SYSCALL_BIT) &&
			  (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
		/* x32 call: strip the marker bit, then index the x32 table. */
		nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
					X32_NR_syscalls);
		regs->ax = x32_sys_call_table[nr](regs);
#endif
	}

	/* Exit-side processing; the caller expects IRQs disabled on return. */
	syscall_return_slowpath(regs);
}

在该函数中，通过传入的系统调用号nr找到相应的系统调用，并将返回值保存在regs的ax中。
调用结束后，执行syscall_return_slowpath，进行返回。
然后在gdb单步调试中，我们可以看到从syscall_return_slowpath返回后，开始恢复现场。主要是将之前保存在栈中的寄存器的值，重新恢复到原来的寄存器中。

码农公寓

深入理解Linux系统调用

实验要求

环境准备

确定系统调用

通过汇编触发系统调用

gdb追踪系统调用

相关文章