深入理解TCP协议及其源代码

一、TCP概述

简介:

  传输控制协议(TCP是一种面向连接的、可靠的、基于字节流传输层通信协议,由IETFRFC 793定义。在简化的计算机网络OSI模型中,它完成第四层传输层所指定的功能。用户数据报协议(UDP)是同一层内另一个重要的传输协议。

  应用层向TCP层发送用于网间传输的、用8位字节表示的数据流,然后TCP把数据流分割成适当长度的报文段(通常受该计算机连接的网络的数据链路层的最大传输单元(MTU)的限制)。之后TCP把结果包传给IP层,由它来透过网络将包传送给接收端实体的TCP层。TCP为了保证不发生丢包,就给每个包一个序号,同时序号也保证了传送到接收端实体的包的按序接收。然后接收端实体对已成功收到的包发回一个相应的确认信息(ACK);如果发送端实体在合理的往返时延(RTT)内未收到确认,那么对应的数据包就被假设为已丢失并进行重传。TCP用一个校验和函数来检验数据是否有错误,在发送和接收时都要计算校验和。

报文格式:

深入理解TCP协议及其源代码

序号:Seq序号,占32位,用来标识从TCP源端向目的端发送的字节流,发起方发送数据时对此进行标记。
        (2)确认序号:Ack序号,占32位,只有ACK标志位为1时,确认序号字段才有效,Ack=Seq+1。
        (3)标志位:共6个,即URG、ACK、PSH、RST、SYN、FIN等,

运作方式:

  TCP协议的运行可划分为三个阶段:连接创建(connection establishment)、数据传送(data transfer)和连接终止(connection termination)。操作系统将TCP连接抽象为套接字表示的本地端点(local end-point),作为编程接口给程序使用。在TCP连接的生命期内,本地端点要经历一系列的状态改变。

二、TCP的三次握手

三次握手(Three-Way Handshake)即建立TCP连接,就是指建立一个TCP连接时,需要客户端和服务端总共发送3个包以确认连接的建立。

深入理解TCP协议及其源代码

第一次握手:Client将标志位SYN置为1,随机产生一个值seq=J,并将该数据包发送给Server,Client进入SYN_SENT状态,等待Server确认。
第二次握手:Server收到数据包后由标志位SYN=1知道Client请求建立连接,Server将标志位SYN和ACK都置为1,ack=J+1,随机产生一个值seq=K,并将该数据包发送给Client以确认连接请求,Server进入SYN_RCVD状态。
 第三次握手:Client收到确认后,检查ack是否为J+1,ACK是否为1,如果正确则将标志位ACK置为1,ack=K+1,并将该数据包发送给Server,Server检查ack是否为K+1,ACK是否为1,如果正确则连接建立成功,Client和Server进入ESTABLISHED状态,完成三次握手,随后Client与Server之间可以开始传输数据了。

系统调用是调用socket相关系统调用的入口函数

1.connect

当客户端调用系统调用connect连接服务端的时候,经过socket系统调用入口函数的分发,最终调用的是__sys_connect()函数.

SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen)
{
    struct socket *sock;
    struct sockaddr_storage address;
    int err, fput_needed;
 
    //根据fd,找到对应的socket。11     sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (!sock)
        goto out;
 
16     err = move_addr_to_kernel(uservaddr, addrlen, &address);
    if (err < 0)
        goto out_put;
 
    err = security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
    if (err)
        goto out_put;
 
    // 调用sock的连接函数27     err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, sock->file->f_flags);
 
out_put:
    fput_light(sock->file, fput_needed);
 
out:
    return err;
}

sockfd_lookup_light()找到具体的socket实例。其中调用了__inet_stream_connect函数。

int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
              int addr_len, int flags, int is_sendmsg)
{
    struct sock *sk = sock->sk;
    int err;
    long timeo;

    /*
     * uaddr can be NULL and addr_len can be 0 if:
     * sk is a TCP fastopen active socket and
     * TCP_FASTOPEN_CONNECT sockopt is set and
     * we already have a valid cookie for this socket.
     * In this case, user can call write() after connect().
     * write() will invoke tcp_sendmsg_fastopen() which calls
     * __inet_stream_connect().
     */
    if (uaddr) {
        if (addr_len < sizeof(uaddr->sa_family))
            return -EINVAL;

        if (uaddr->sa_family == AF_UNSPEC) {
            err = sk->sk_prot->disconnect(sk, flags);
            sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
            goto out;
        }
    }

    switch (sock->state) {
    default:
        err = -EINVAL;
        goto out;
    case SS_CONNECTED:
        err = -EISCONN;
        goto out;
    case SS_CONNECTING:
        if (inet_sk(sk)->defer_connect)
            err = is_sendmsg ? -EINPROGRESS : -EISCONN;
        else
            err = -EALREADY;
        /* Fall out of switch with err, set for this state */
        break;
    case SS_UNCONNECTED:    //没有建立连接
        err = -EISCONN;
        if (sk->sk_state != TCP_CLOSE)
            goto out;

        if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
            err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
            if (err)
                goto out;
        }

        err = sk->sk_prot->connect(sk, uaddr, addr_len);  //这一行发送一个报文,这是TCP连接的第一步
        if (err < 0)
            goto out;

        sock->state = SS_CONNECTING;

        if (!err && inet_sk(sk)->defer_connect)
            goto out;

        /* Just entered SS_CONNECTING state; the only
         * difference is that return value in non-blocking
         * case is EINPROGRESS, rather than EALREADY.
         */
        err = -EINPROGRESS;
        break;
    }

    timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

    if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {  //客户端发出请求连接后,等待服务端的响应。
        int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
                tcp_sk(sk)->fastopen_req &&
                tcp_sk(sk)->fastopen_req->data ? 1 : 0;

        /* Error code is set above */
        if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))  //等待第二次连接
            goto out;

        err = sock_intr_errno(timeo);
        if (signal_pending(current))
            goto out;
    }

    /* Connection was closed by RST, timeout, ICMP error
     * or another process disconnected us.
     */
    if (sk->sk_state == TCP_CLOSE)
        goto sock_error;

    /* sk->sk_err may be not zero now, if RECVERR was ordered by user
     * and error was received after socket entered established state.
     * Hence, it is handled normally after connect() return successfully.
     */

    sock->state = SS_CONNECTED;    //连接建立成功
    err = 0;
out:
    return err;

sock_error:
    err = sock_error(sk) ? : -ECONNABORTED;
    sock->state = SS_UNCONNECTED;
    if (sk->sk_prot->disconnect(sk, flags))
        sock->state = SS_DISCONNECTING;
    goto out;
}

2.accept

调用accept系统调用时,具体调用的是__sys_accept4函数。

int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
          int __user *upeer_addrlen, int flags)
{
    struct socket *sock, *newsock;
    struct file *newfile;
    int err, len, newfd, fput_needed;
    struct sockaddr_storage address;

    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;

    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    sock = sockfd_lookup_light(fd, &err, &fput_needed);  //查找服务器端监听的socket
    if (!sock)
        goto out;

    err = -ENFILE;
    newsock = sock_alloc();  //分配一个socket
    if (!newsock)
        goto out_put;

    newsock->type = sock->type;
    newsock->ops = sock->ops;

    /*
     * We don't need try_module_get here, as the listening socket (sock)
     * has the protocol module (sock->ops->owner) held.
     */
    __module_get(newsock->ops->owner);

    newfd = get_unused_fd_flags(flags);
    if (unlikely(newfd < 0)) {
        err = newfd;
        sock_release(newsock);
        goto out_put;
    }
    newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
    if (IS_ERR(newfile)) {
        err = PTR_ERR(newfile);
        put_unused_fd(newfd);
        goto out_put;
    }

    err = security_socket_accept(sock, newsock);
    if (err)
        goto out_fd;

    err = sock->ops->accept(sock, newsock, sock->file->f_flags, false); //调用socket上对应的accept
    if (err < 0)
        goto out_fd;

    if (upeer_sockaddr) {
        len = newsock->ops->getname(newsock,
                    (struct sockaddr *)&address, 2);
        if (len < 0) {
            err = -ECONNABORTED;
            goto out_fd;
        }
        err = move_addr_to_user(&address,
                    len, upeer_sockaddr, upeer_addrlen);
        if (err < 0)
            goto out_fd;
    }

    /* File flags are not inherited via accept() unlike another OSes. */

    fd_install(newfd, newfile);
    err = newfd;

out_put:
    fput_light(sock->file, fput_needed);
out:
    return err;
out_fd:
    fput(newfile);
    put_unused_fd(newfd);
    goto out_put;
}

分配一个新的socket对象,这个socket的新对象当有新连接进来时,用于与客户端进行连接,然后调用inet_accept():

int inet_accept(struct socket *sock, struct socket *newsock, int flags,
        bool kern)
{
    struct sock *sk1 = sock->sk;
    int err = -EINVAL;
    struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); //调用inet_csk_accept

    if (!sk2)
        goto do_err;

    lock_sock(sk2);

    sock_rps_record_flow(sk2);
    WARN_ON(!((1 << sk2->sk_state) &
          (TCPF_ESTABLISHED | TCPF_SYN_RECV |
          TCPF_CLOSE_WAIT | TCPF_CLOSE)));

    sock_graft(sk2, newsock);

    newsock->state = SS_CONNECTED;  //建立连接
    err = 0;
    release_sock(sk2);
do_err:
    return err;
}

上面这个函数用于监听连接。

struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct request_sock_queue *queue = &icsk->icsk_accept_queue;
    struct request_sock *req;
    struct sock *newsk;
    int error;

    lock_sock(sk);

    /* We need to make sure that this socket is listening,
     * and that it has something pending.
     */
    error = -EINVAL;
    if (sk->sk_state != TCP_LISTEN)
        goto out_err;

    /* Find already established connection */
    if (reqsk_queue_empty(queue)) {
        long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

        /* If this is a non blocking socket don't sleep */
        error = -EAGAIN;
        if (!timeo)
            goto out_err;

        error = inet_csk_wait_for_connect(sk, timeo);   //等待连接
        if (error)
            goto out_err;
    }
    req = reqsk_queue_remove(queue, sk);
    newsk = req->sk;

    if (sk->sk_protocol == IPPROTO_TCP &&
        tcp_rsk(req)->tfo_listener) {
        spin_lock_bh(&queue->fastopenq.lock);
        if (tcp_rsk(req)->tfo_listener) {
            /* We are still waiting for the final ACK from 3WHS
             * so can't free req now. Instead, we set req->sk to
             * NULL to signify that the child socket is taken
             * so reqsk_fastopen_remove() will free the req
             * when 3WHS finishes (or is aborted).
             */
            req->sk = NULL;
            req = NULL;
        }
        spin_unlock_bh(&queue->fastopenq.lock);
    }

此函数等待客户端的连接,作用于第二次握手的过程。

综上:服务器端先初始化Socket,然后与端口绑定(bind),对端口进行监听(listen),调用accept阻塞,等待客户端连接。连接成功,这时客户端与服务器端的连接就建立了。

上一篇:python multiprocessing模块


下一篇:深入理解TCP协议及其源代码