Linux Network Internals (6) - TCP: listen and accept

TCP

We have finally arrived at TCP.

TCP can be described as a sliding window protocol without selective acknowledgment or rejection.

Initialization of the TCP protocol stack

The entry point of TCP protocol stack initialization is inet_init:
static int __init inet_init(void)
{
        struct inet_protosw *q;
        int rc;

        rc = sk_alloc_slab(&tcp_prot, "tcp_sock");
        if (rc) {
                sk_alloc_slab_error(&tcp_prot);
                goto out;
        }
        rc = sk_alloc_slab(&udp_prot, "udp_sock");
        if (rc) {
                sk_alloc_slab_error(&udp_prot);
                goto out_tcp_free_slab;
        }
        rc = sk_alloc_slab(&raw_prot, "raw_sock");
        if (rc) {
                sk_alloc_slab_error(&raw_prot);
                goto out_udp_free_slab;
        }
        (void)sock_register(&inet_family_ops);
        if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
                printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
        if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
                printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
        if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
                printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
        for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
                inet_register_protosw(q);
        arp_init();
        ip_init();
        // Initialize the TCP protocol stack
        tcp_v4_init(&inet_family_ops);
        tcp_init();
        
        rc = 0;
out:
        return rc;
        /* ... out_tcp_free_slab and out_udp_free_slab error paths omitted ... */
}

TCP stack initialization is split into two steps: tcp_v4_init and tcp_init.

tcp_v4_init sets up the control socket that TCP uses for sending RSTs; the real initialization work is done by tcp_init.
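
Before diving into tcp_init, here is a rough sketch of what tcp_v4_init does in this kernel generation (details vary slightly between 2.6.x releases, so treat it as an outline rather than the exact source): it creates a kernel-internal socket, stored in the global tcp_socket, which TCP later uses to send RSTs and ACKs that belong to no user socket, and it unhashes that socket so it never receives packets itself.

void __init tcp_v4_init(struct net_proto_family *ops)
{
        // Create a kernel-internal raw socket; tcp_socket is a global in tcp_ipv4.c
        if (sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket) < 0)
                panic("Failed to create the TCP control socket.\n");
        tcp_socket->sk->sk_allocation = GFP_ATOMIC;

        // Unhash it so that incoming packets are never delivered to this socket
        tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

tcp_init itself does the bulk of the work: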

void __init tcp_init(void)
{
        struct sk_buff *skb = NULL;
        unsigned long limit;
        int order, i, max_share;
        if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
                __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
                                           sizeof(skb->cb));
                                           
        // Initialize the three hash areas in tcp_hashinfo: bind_bucket_cachep, ehash and bhash
        tcp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("tcp_bind_bucket",
                                  sizeof(struct inet_bind_bucket), 0,
                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!tcp_hashinfo.bind_bucket_cachep)
                panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
        tcp_hashinfo.ehash =
                alloc_large_system_hash("TCP established",
                                        sizeof(struct inet_ehash_bucket),
                                        thash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                        13 : 15,
                                        HASH_HIGHMEM,
                                        &tcp_hashinfo.ehash_size,
                                        NULL,
                                        0);
        tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
        for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
                rwlock_init(&tcp_hashinfo.ehash[i].lock);
                INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
        }
        tcp_hashinfo.bhash =
                alloc_large_system_hash("TCP bind",
                                        sizeof(struct inet_bind_hashbucket),
                                        tcp_hashinfo.ehash_size,
                                        (num_physpages >= 128 * 1024) ?
                                        13 : 15,
                                        HASH_HIGHMEM,
                                        &tcp_hashinfo.bhash_size,
                                        NULL,
                                        64 * 1024);
        tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
        for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
                spin_lock_init(&tcp_hashinfo.bhash[i].lock);
                INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
        }
        for (order = 0; ((1 << order) << PAGE_SHIFT) <
                        (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
                        order++)
                ;
        if (order >= 4) {
                // Set the local port range
                sysctl_local_port_range[0] = 32768;
                sysctl_local_port_range[1] = 61000;
                tcp_death_row.sysctl_max_tw_buckets = 180000;
                sysctl_tcp_max_orphans = 4096 << (order - 4);
                sysctl_max_syn_backlog = 1024;
        } else if (order < 3) {
                sysctl_local_port_range[0] = 1024 * (3 - order);
                tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
                sysctl_tcp_max_orphans >>= (3 - order);
                sysctl_max_syn_backlog = 128;
        }
        sysctl_tcp_mem[0] =  768 << order;
        sysctl_tcp_mem[1] = 1024 << order;
        sysctl_tcp_mem[2] = 1536 << order;
        limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
        max_share = min(4UL*1024*1024, limit);
        sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
        sysctl_tcp_wmem[1] = 16*1024;
        sysctl_tcp_wmem[2] = max(64*1024, max_share);
        sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
        sysctl_tcp_rmem[1] = 87380;
        sysctl_tcp_rmem[2] = max(87380, max_share);
        printk(KERN_INFO "TCP: Hash tables configured "
               "(established %d bind %d)\n",
               tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
        // Register the default congestion control algorithm
        tcp_register_congestion_control(&tcp_reno);
}

tcp_hashinfo is about as important as a global data structure can get:

[Figure: the tcp_hashinfo structure and its hash tables]

tcp_hashinfo contains three hash tables, each holding a different kind of sock:

1) ehash: socks of established connections;

2) bhash: bind() inserts the corresponding sock into this table so that port conflicts can be detected;

3) listening_hash: listening sockets.

We will dig into this data structure in more depth later; an abbreviated view of its definition follows.
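
This sketch follows the 2.6-era layout in include/net/inet_hashtables.h, with the locks and bookkeeping fields trimmed:

struct inet_hashinfo {
        // Hash table of fully established (and time-wait) sockets
        struct inet_ehash_bucket    *ehash;
        unsigned int                 ehash_size;

        // Hash table of bound ports, used by bind() to detect conflicts
        struct inet_bind_hashbucket *bhash;
        int                          bhash_size;

        // Listening sockets, hashed by local port
        struct hlist_head            listening_hash[INET_LHTABLE_SIZE];

        // Allocator for the inet_bind_bucket entries that hang off bhash
        kmem_cache_t                *bind_bucket_cachep;

        /* ... locks and other bookkeeping fields omitted ... */
};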

Next, let's look at the initialization performed when a TCP socket is created. Where do we find that function?

socket layer ----> transport layer: via the inet_protosw structure

transport layer ----> IP layer: via the net_protocol structure

Both are registered in inet_init.
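
To make this concrete, here are the TCP entries of both hand-off tables, abbreviated from net/ipv4/af_inet.c of this kernel generation (version-dependent fields trimmed):

// Socket layer -> transport layer: the SOCK_STREAM/IPPROTO_TCP entry of inetsw_array
static struct inet_protosw inetsw_array[] = {
        {
                .type     = SOCK_STREAM,
                .protocol = IPPROTO_TCP,
                .prot     = &tcp_prot,          // transport-layer callbacks
                .ops      = &inet_stream_ops,   // socket-layer callbacks
                /* ... remaining fields omitted ... */
        },
        /* ... UDP and RAW entries omitted ... */
};

// Transport layer -> IP layer: tells IP to hand protocol 6 (TCP) packets to tcp_v4_rcv
static struct net_protocol tcp_protocol = {
        .handler     = tcp_v4_rcv,   // receive path entry point
        .err_handler = tcp_v4_err,   // ICMP error handling
        /* ... remaining fields omitted ... */
};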

Creating a socket ends up in inet_create(), which uses AF_INET and SOCK_STREAM to find the TCP inet_protosw entry and then calls sk->sk_prot->init(sk).

So the initialization performed when a TCP socket is created is tcp_v4_init_sock, the init member of the tcp_prot structure.
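
An abbreviated view of tcp_prot (defined in net/ipv4/tcp_ipv4.c of this era; most fields trimmed) shows the callbacks that matter for this article:

struct proto tcp_prot = {
        .name     = "TCP",
        .init     = tcp_v4_init_sock,        // called from inet_create()
        .accept   = inet_csk_accept,         // called from inet_accept()
        .hash     = tcp_v4_hash,             // insert the sock into tcp_hashinfo
        .get_port = tcp_v4_get_port,         // port allocation / conflict check
        .obj_size = sizeof(struct tcp_sock), // allocation size used by sk_alloc()
        /* ... many more fields omitted ... */
};

Here is tcp_v4_init_sock: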

static int tcp_v4_init_sock(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        skb_queue_head_init(&tp->out_of_order_queue);
        
        // Initialize the TCP timers (retransmit, delayed ACK, keepalive)
        tcp_init_xmit_timers(sk);
        tcp_prequeue_init(tp);
        icsk->icsk_rto = TCP_TIMEOUT_INIT;
        tp->mdev = TCP_TIMEOUT_INIT;
        tp->snd_cwnd = 2;
        tp->snd_ssthresh = 0x7fffffff;        /* Infinity */
        tp->snd_cwnd_clamp = ~0;
        tp->mss_cache = 536;
        tp->reordering = sysctl_tcp_reordering;
        // Congestion control operations
        icsk->icsk_ca_ops = &tcp_init_congestion_ops;
        sk->sk_state = TCP_CLOSE;
        sk->sk_write_space = sk_stream_write_space;
        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
        
        // icsk_af_ops identifies the network-layer protocol TCP runs over; here it is IPv4
        icsk->icsk_af_ops = &ipv4_specific;
        icsk->icsk_sync_mss = tcp_sync_mss;
        sk->sk_sndbuf = sysctl_tcp_wmem[1];
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
        atomic_inc(&tcp_sockets_allocated);
        return 0;
}

listen

As mentioned earlier, listen() goes through sock->ops->listen; ops was assigned in inet_create() from the inet_protosw entry, which for TCP is inet_stream_ops, so listen() eventually lands in inet_listen.
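
The relevant members of inet_stream_ops (abbreviated from net/ipv4/af_inet.c; most members trimmed) confirm this dispatch:

struct proto_ops inet_stream_ops = {
        .family  = PF_INET,
        .bind    = inet_bind,
        .connect = inet_stream_connect,
        .accept  = inet_accept,    // used by sys_accept() below
        .listen  = inet_listen,    // used by sys_listen()
        /* ... remaining members omitted ... */
};

inet_listen itself is fairly short: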

int inet_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        unsigned char old_state;
        int err;
        lock_sock(sk);
        err = -EINVAL;
        if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
                goto out;
        old_state = sk->sk_state;
        if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
                goto out;
        if (old_state != TCP_LISTEN) {
                err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
                if (err)
                        goto out;
        }
        sk->sk_max_ack_backlog = backlog;
        err = 0;
out:
        release_sock(sk);
        return err;
}

Only when the sock's state is TCP_CLOSE does the call go on to the real listening setup, inet_csk_listen_start:

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
        struct inet_sock *inet = inet_sk(sk);
        // Cast the sock into the inet_connection_sock that describes a connection-oriented socket.
        // The memory was allocated by sk_alloc() in inet_create(); sk_alloc() sizes the allocation
        // from the obj_size field of the proto structure (tcp_prot here), which differs per protocol.
        struct inet_connection_sock *icsk = inet_csk(sk);
        // Initialize icsk_accept_queue, which is a linked list (the accept queue).
        int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
        if (rc != 0)
                return rc;
        sk->sk_max_ack_backlog = 0;
        sk->sk_ack_backlog = 0;
        inet_csk_delack_init(sk);
        // Set the sock's sk_state to TCP_LISTEN
        sk->sk_state = TCP_LISTEN;
        if (!sk->sk_prot->get_port(sk, inet->num)) {
                inet->sport = htons(inet->num);
                sk_dst_reset(sk);
                // This hash function is tcp_v4_hash from tcp_prot.
                // It inserts the sock into the global tcp_hashinfo according to its state:
                //     if the state is TCP_LISTEN, the sock goes into tcp_hashinfo->listening_hash;
                //     otherwise it goes into an inet_ehash_bucket (the ehash table).
                sk->sk_prot->hash(sk);
                return 0;
        }
        sk->sk_state = TCP_CLOSE;
        __reqsk_queue_destroy(&icsk->icsk_accept_queue);
        return -EADDRINUSE;
}

From the code above, the cost of listen() consists of:

1) setting sk_state;

2) initializing icsk_accept_queue (sketched below);

3) inserting the sock into tcp_hashinfo->listening_hash.

Once listen() has completed, the sock is present in two of tcp_hashinfo's hash tables: bhash (from the earlier bind) and listening_hash.
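
The icsk_accept_queue initialized in step 2 is worth a closer look. A trimmed sketch of its definition, following include/net/request_sock.h of this kernel generation, shows that it actually holds two things: the queue of completed connections waiting for accept(), and the SYN table of half-open requests:

struct request_sock_queue {
        struct request_sock *rskq_accept_head;  // completed connections, consumed by accept()
        struct request_sock *rskq_accept_tail;
        rwlock_t             syn_wait_lock;     // protects listen_opt->syn_table
        struct listen_sock  *listen_opt;        // the SYN table, sized in reqsk_queue_alloc()
        /* ... minor fields omitted ... */
};

struct listen_sock {
        u8                   max_qlen_log;      // log2 of the maximum queue length
        int                  qlen;              // current number of pending requests
        int                  qlen_young;        // requests not yet retransmitted to
        /* ... a couple of bookkeeping fields omitted ... */
        u32                  hash_rnd;          // random seed for hashing requests
        u32                  nr_table_entries;
        struct request_sock *syn_table[0];      // hash table of half-open (SYN_RECV) requests
};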

accept

accept() eventually reaches sys_accept:

asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen)
{
        struct socket *sock, *newsock;
        struct file *newfile;
        int err, len, newfd, fput_needed;
        char address[MAX_SOCK_ADDR];
        // Find the listening socket that corresponds to fd
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                goto out;
        err = -ENFILE;
        // Allocate a socket for the new connection
        if (!(newsock = sock_alloc())) 
                goto out_put;
        newsock->type = sock->type;
        newsock->ops = sock->ops;
        __module_get(newsock->ops->owner);
        // Allocate a file descriptor for the new connection
        newfd = sock_alloc_fd(&newfile);
        if (unlikely(newfd < 0)) {
                err = newfd;
                sock_release(newsock);
                goto out_put;
        }
        // Associate the new fd with the new socket
        err = sock_attach_fd(newsock, newfile);
        if (err < 0)
                goto out_fd;
        err = security_socket_accept(sock, newsock);
        if (err)
                goto out_fd;
        // sock->ops->accept performs the actual accept;
        // ops is inet_stream_ops, so this calls inet_accept
        err = sock->ops->accept(sock, newsock, sock->file->f_flags);
        if (err < 0)
                goto out_fd;
        if (upeer_sockaddr) {
                if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 2)<0) {
                        err = -ECONNABORTED;
                        goto out_fd;
                }
                err = move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen);
                if (err < 0)
                        goto out_fd;
        }
        fd_install(newfd, newfile);
        err = newfd;
        security_socket_post_accept(sock, newsock);
        /* ... out, out_put and out_fd labels (cleanup and return) omitted ... */
}

[Figure: call chain from sys_accept through inet_accept to inet_csk_accept]
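
The hop in the middle, inet_accept, is thin: it invokes sk->sk_prot->accept, which for TCP is inet_csk_accept, and grafts the returned sock onto the socket allocated in sys_accept. Abbreviated from net/ipv4/af_inet.c of this era:

int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
        struct sock *sk1 = sock->sk;
        int err = -EINVAL;
        // sk_prot->accept is inet_csk_accept for TCP
        struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);

        if (!sk2)
                goto do_err;

        lock_sock(sk2);
        // Tie the freshly created sock to the socket allocated in sys_accept()
        sock_graft(sk2, newsock);
        newsock->state = SS_CONNECTED;
        err = 0;
        release_sock(sk2);
do_err:
        return err;
}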

The call chain is shown in the figure above; now let's look at the key function, inet_csk_accept:

struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct sock *newsk;
        int error;
        lock_sock(sk);
        error = -EINVAL;
        // Check that sk_state is TCP_LISTEN
        if (sk->sk_state != TCP_LISTEN)
                goto out_err;
        // Check whether icsk_accept_queue is empty:
        // if it is not empty, a connection has already been established and can be taken right away;
        // if it is empty, no connection has been established yet, so we must block and wait.
        if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
                long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
                error = -EAGAIN;
                // A non-blocking listening socket returns immediately
                if (!timeo)
                        goto out_err;
                        
                // Start waiting for a connection
                error = inet_csk_wait_for_connect(sk, timeo);
                if (error)
                        goto out_err;
        }
        newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
        BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
out:
        release_sock(sk);
        return newsk;
out_err:
        newsk = NULL;
        *err = error;
        goto out;
}