TCP
总算到了TCP了。
TCP可以描述为一个无法选择确认也无法否认的滑动窗口协议。
TCP协议栈的初始化
tcp协议栈初始化的入口是在inet_init中
static int __init inet_init(void)
{
	/* NOTE(review): excerpt — the declarations of rc and q and the
	 * error labels (out, out_tcp_free_slab, out_udp_free_slab)
	 * targeted by the gotos below are not shown in this snippet. */

	/* Create the slab caches backing tcp/udp/raw sock allocation. */
	rc = sk_alloc_slab(&tcp_prot, "tcp_sock");
	if (rc) {
		sk_alloc_slab_error(&tcp_prot);
		goto out;
	}
	rc = sk_alloc_slab(&udp_prot, "udp_sock");
	if (rc) {
		sk_alloc_slab_error(&udp_prot);
		goto out_tcp_free_slab;
	}
	rc = sk_alloc_slab(&raw_prot, "raw_sock");
	if (rc) {
		sk_alloc_slab_error(&raw_prot);
		goto out_udp_free_slab;
	}

	/* Register the AF_INET address family with the socket layer. */
	(void)sock_register(&inet_family_ops);

	/* Hook the ICMP/UDP/TCP receive handlers into the IP layer. */
	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");

	/* Register every entry of inetsw_array (stream/dgram/raw) so the
	 * socket layer can route socket() calls to the right protocol. */
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

	arp_init();
	ip_init();

	/* Initialization of the TCP protocol stack. */
	tcp_v4_init(&inet_family_ops);
	tcp_init();
}
tcp协议栈的初始化分两个步骤tcp_v4_init和tcp_init。
tcp_v4_init 初始化了RST的socket;真正做初始化的是tcp_init:
void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	unsigned long limit;
	int order, i, max_share;

	/* Guard: struct tcp_skb_cb must fit inside skb->cb[]. */
	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
					   sizeof(skb->cb));

	/* Initialize the three memory areas inside tcp_hashinfo:
	 * bind_bucket_cachep, ehash and bhash. */
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!tcp_hashinfo.bind_bucket_cachep)
		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");

	/* Hash table for established connections, sized from the amount
	 * of physical memory. */
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					(num_physpages >= 128 * 1024) ?
					13 : 15,
					HASH_HIGHMEM,
					&tcp_hashinfo.ehash_size,
					NULL, 0);
	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
		rwlock_init(&tcp_hashinfo.ehash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
	}

	/* Hash table for bound local ports. */
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_size,
					(num_physpages >= 128 * 1024) ?
					13 : 15,
					HASH_HIGHMEM,
					&tcp_hashinfo.bhash_size,
					NULL, 64 * 1024);
	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}

	/* order = log2 of the number of pages spanned by bhash; used
	 * below to scale the sysctl tunables with available memory. */
	for (order = 0; ((1 << order) << PAGE_SHIFT) <
	     (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
	     order++)
		;

	if (order >= 4) {
		/* Set the local port range. */
		sysctl_local_port_range[0] = 32768;
		sysctl_local_port_range[1] = 61000;
		tcp_death_row.sysctl_max_tw_buckets = 180000;
		sysctl_tcp_max_orphans = 4096 << (order - 4);
		sysctl_max_syn_backlog = 1024;
	} else if (order < 3) {
		/* Small machines: shrink ranges and limits. */
		sysctl_local_port_range[0] = 1024 * (3 - order);
		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
		sysctl_tcp_max_orphans >>= (3 - order);
		sysctl_max_syn_backlog = 128;
	}

	/* Global TCP memory-pressure thresholds. */
	sysctl_tcp_mem[0] = 768 << order;
	sysctl_tcp_mem[1] = 1024 << order;
	sysctl_tcp_mem[2] = 1536 << order;

	/* Per-socket send/receive buffer defaults and upper bounds. */
	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
	max_share = min(4UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_share);

	sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_share);

	printk(KERN_INFO "TCP: Hash tables configured "
	       "(established %d bind %d)\n",
	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);

	/* Register the default congestion control algorithm (reno). */
	tcp_register_congestion_control(&tcp_reno);
}
tcp_hashinfo 是个重要的不能再重要的全局数据结构:
tcp_hashinfo 里有3张hash表,分别存储3种类型的sock结构体:
1) ehash 链接建立成功的sock;
2) bhash bind()调用会把对应的sock插入到这张表,以便于检查端口冲突;
3) listening_hash 监听套接字
这个数据结构后面还会深入研究。
下面再看一下创建tcp套接字的初始化,这个函数到哪里找呢?
socket_layer ----> transport_layer 使用inet_protosw结构
transport_layer ----> ip_layer 使用 net_protocol结构
这两者都在inet_init中初始化的。
而创建一个套接字调用到inet_create()函数,再根据AF_INET,SOCK_STREAM找到一个tcp的inet_protosw项,然后调用sk->sk_prot->init(sk),
所以,创建tcp套接字时的初始化是在结构体tcp_prot中的tcp_v4_init_sock函数。
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	/* Initialize the timers. */
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	/* Initial RTO and mean deviation before any RTT sample. */
	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* Conservative congestion-window starting values. */
	tp->snd_cwnd = 2;
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	/* Congestion control ops. */
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	/* icsk_af_ops selects which network protocol TCP runs on top
	 * of; here it is IPv4. */
	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;

	/* Default buffer sizes come from the sysctls set in tcp_init(). */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
listen
前面说过listen()调用的是sock->ops->listen,而ops在inet_create中被赋值为inet_protosw的ops,tcp对应的是inet_stream_ops,所以listen最终调用到inet_listen
int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char old_state;
	int err;

	lock_sock(sk);

	err = -EINVAL;
	/* Only an unconnected stream socket may listen. */
	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
		goto out;

	old_state = sk->sk_state;
	if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
		goto out;

	/* Enter the listening state only once; a repeated listen()
	 * merely updates the backlog below. */
	if (old_state != TCP_LISTEN) {
		err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
		if (err)
			goto out;
	}
	sk->sk_max_ack_backlog = backlog;
	err = 0;

out:
	release_sock(sk);
	return err;
}
只有当sock结构体中的state为TCP_CLOSE时才会进入真正的listen流程
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	struct inet_sock *inet = inet_sk(sk);
	/* Cast the sock to the connection-oriented TCP socket structure
	 * inet_connection_sock.  The object was allocated in
	 * inet_create() via sk_alloc(), which sizes it from the objsize
	 * field of the inet_protosw entry — different per protocol. */
	struct inet_connection_sock *icsk = inet_csk(sk);
	/* Initialize icsk_accept_queue, the list of connections waiting
	 * to be accept()ed. */
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

	if (rc != 0)
		return rc;

	sk->sk_max_ack_backlog = 0;
	sk->sk_ack_backlog = 0;
	inet_csk_delack_init(sk);

	/* Set the sock's sk_state to TCP_LISTEN. */
	sk->sk_state = TCP_LISTEN;
	if (!sk->sk_prot->get_port(sk, inet->num)) {
		inet->sport = htons(inet->num);

		sk_dst_reset(sk);
		/* This hash callback is tcp_v4_hash from tcp_prot.
		 * It files the sock into the global tcp_hashinfo
		 * structure according to its state:
		 * - TCP_LISTEN: inserted into tcp_hashinfo->listening_hash;
		 * - otherwise: inserted into the inet_ehash_bucket table. */
		sk->sk_prot->hash(sk);

		return 0;
	}

	/* get_port() failed: roll the state back and destroy the queue. */
	sk->sk_state = TCP_CLOSE;
	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
	return -EADDRINUSE;
}
从上面的代码可以看到listen的开销在于:
1) 初始化sock_state;
2) 初始化icsk_accept_queue;
3) 把sock插入到tcp_hashinfo->listening_hash.
完成listen之后,会把sock结构体加入到tcp_hashinfo的bind_hash和listening_hash两张hash表中。
accept
accept最终调用到sys_accept:
asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
			   int __user *upeer_addrlen)
{
	struct socket *sock, *newsock;
	struct file *newfile;
	int err, len, newfd, fput_needed;
	char address[MAX_SOCK_ADDR];

	/* NOTE(review): excerpt — the out/out_put/out_fd labels and the
	 * final return targeted by the gotos below are not shown here. */

	/* Look up the listening socket behind fd. */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	err = -ENFILE;
	/* Allocate a fresh socket object for the new connection. */
	if (!(newsock = sock_alloc()))
		goto out_put;

	newsock->type = sock->type;
	newsock->ops = sock->ops;
	__module_get(newsock->ops->owner);

	/* Allocate a new file descriptor for the connection. */
	newfd = sock_alloc_fd(&newfile);
	if (unlikely(newfd < 0)) {
		err = newfd;
		sock_release(newsock);
		goto out_put;
	}

	/* Associate the new fd with the new socket. */
	err = sock_attach_fd(newsock, newfile);
	if (err < 0)
		goto out_fd;

	err = security_socket_accept(sock, newsock);
	if (err)
		goto out_fd;

	/* sock->ops->accept performs the actual accept;
	 * ops here is inet_stream_ops. */
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_fd;

	/* Optionally copy the peer's address back to user space. */
	if (upeer_sockaddr) {
		if (newsock->ops->getname(newsock, (struct sockaddr *)address,
					  &len, 2) < 0) {
			err = -ECONNABORTED;
			goto out_fd;
		}
		err = move_addr_to_user(address, len, upeer_sockaddr,
					upeer_addrlen);
		if (err < 0)
			goto out_fd;
	}

	/* Publish the new fd into the process's file table. */
	fd_install(newfd, newfile);
	err = newfd;

	security_socket_post_accept(sock, newsock);
}
函数的跳转在上面这张图中,重点来看最关键的函数inet_csk_accept:
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct sock *newsk;
	int error;

	lock_sock(sk);

	error = -EINVAL;
	/* Verify sk_state is TCP_LISTEN. */
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Check whether icsk_accept_queue is empty:
	 * - non-empty: a connection has already been established,
	 *   just dequeue it;
	 * - empty: no connection yet, we must block and wait. */
	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		error = -EAGAIN;
		/* A non-blocking listening socket returns immediately. */
		if (!timeo)
			goto out_err;

		/* Start waiting for a connection. */
		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}

	/* Dequeue the established child sock for the new connection. */
	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
out:
	release_sock(sk);
	return newsk;
out_err:
	newsk = NULL;
	*err = error;
	goto out;
}