I. The Three-Way Handshake of TCP Connection Establishment
TCP provides a connection-oriented, reliable byte-stream service. "Connection-oriented" means that the two applications using TCP (typically a client and a server) must establish a TCP connection with each other before they can exchange any data.
The Three States Involved
SYN_SENT
After the server has started listening, when the client socket calls connect, the client sends a SYN segment and enters the SYN_SENT state, waiting for the server's acknowledgment.
SYN_RCVD
Indicates that the server has received the SYN segment. Under normal circumstances this is a short-lived intermediate state of the server-side socket during the three-way handshake, because the server generally replies with a SYN+ACK immediately; once the client's ACK arrives, the socket moves on to ESTABLISHED.
ESTABLISHED
Indicates that the connection has been established.
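For reference, these state names map onto the kernel's own socket state enumeration. The following is the enum from include/net/tcp_states.h in recent kernels, shown here for orientation (exact contents can vary slightly between versions):

/* From include/net/tcp_states.h (recent kernels; for reference) */
enum {
	TCP_ESTABLISHED = 1,
	TCP_SYN_SENT,
	TCP_SYN_RECV,
	TCP_FIN_WAIT1,
	TCP_FIN_WAIT2,
	TCP_TIME_WAIT,
	TCP_CLOSE,
	TCP_CLOSE_WAIT,
	TCP_LAST_ACK,
	TCP_LISTEN,
	TCP_CLOSING,		/* Now a valid state */
	TCP_NEW_SYN_RECV,
	TCP_MAX_STATES		/* Leave at the end! */
};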
The Three-Way Handshake Step by Step
(1) First handshake: the client sets the SYN flag to 1, picks a random initial sequence number seq=J, sends the segment to the server, and enters the SYN_SENT state, waiting for the server's confirmation.
(2) Second handshake: on receiving the segment, the server learns from SYN=1 that the client is requesting a connection. The server sets both SYN and ACK to 1, sets ack=J+1, picks its own random initial sequence number seq=K, sends the segment back to the client to acknowledge the connection request, and enters the SYN_RCVD state.
(3) Third handshake: the client receives the acknowledgment and checks that ack equals J+1 and that ACK is 1. If so, it sets ACK to 1 and ack=K+1 and sends the segment to the server. The server checks that ack equals K+1 and that ACK is 1; if so, the connection is established, both client and server enter the ESTABLISHED state, and the three-way handshake is complete. The client and server can then start transferring data.
After the three segments exchanged in the steps above, a TCP connection is established; this is what is known as the three-way handshake. The key point is that each exchanged segment carries one special piece of information: the sender's initial sequence number. In short, after the SYN, SYN+ACK, and ACK exchange, the TCP connection is set up and the handshake is complete. What the two ends are really "shaking on" is each other's initial sequence numbers.
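To make the bookkeeping concrete, here is a toy example with fixed numbers (real ISNs are random 32-bit values chosen by each side); the assertions simply restate that a SYN consumes one sequence number on each side:

#include <assert.h>
#include <stdint.h>

/* Toy illustration of the handshake sequence numbers; J and K are
 * normally random 32-bit ISNs, fixed here only for readability. */
int main(void)
{
	uint32_t J = 1000;	/* client ISN, carried in the SYN      */
	uint32_t K = 5000;	/* server ISN, carried in the SYN+ACK  */

	uint32_t synack_ack = J + 1;	/* the SYN consumes one seq number */
	uint32_t final_ack  = K + 1;	/* so does the server's SYN        */

	assert(synack_ack == 1001);	/* SYN+ACK: seq = 5000, ack = 1001 */
	assert(final_ack  == 5001);	/* ACK:     seq = 1001, ack = 5001 */
	return 0;
}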
II. The TCP-Related Interface Definitions
As the earlier experiments showed, by tracing the system calls behind the connection-related functions socket, connect, listen, and accept, we find that the corresponding socket system calls all live in net/socket.c, each defined through a SYSCALL_DEFINE macro. The main relevant definitions are:
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	return __sys_socket(family, type, protocol);
}

SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	return __sys_bind(fd, umyaddr, addrlen);
}

SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
	return __sys_listen(fd, backlog);
}

SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen)
{
	return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
}

SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
		int, addrlen)
{
	return __sys_connect(fd, uservaddr, addrlen);
}

SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
		int __user *, usockaddr_len)
{
	return __sys_getsockname(fd, usockaddr, usockaddr_len);
}

SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
		int __user *, usockaddr_len)
{
	return __sys_getpeername(fd, usockaddr, usockaddr_len);
}

SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
		unsigned int, flags)
{
	return __sys_sendto(fd, buff, len, flags, NULL, 0);
}

SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
		unsigned int, flags)
{
	return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
}
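To see how these syscalls are driven from user space, here is a minimal, self-contained sketch of a loopback client/server pair (error handling trimmed; 127.0.0.1:8080 is an arbitrary example address, not something taken from the kernel source). The client's connect() is what fires the SYN of the first handshake, and the server's accept() only returns once the handshake has completed:

/* Minimal sketch: a loopback client/server pair exercising the
 * syscalls above. Error handling trimmed for brevity. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port   = htons(8080) };
	inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

	int lfd = socket(AF_INET, SOCK_STREAM, 0);		/* sys_socket */
	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));	/* sys_bind   */
	listen(lfd, 128);					/* sys_listen */

	if (fork() == 0) {					/* child = client */
		int cfd = socket(AF_INET, SOCK_STREAM, 0);
		/* connect() sends the SYN: the client enters SYN_SENT here */
		connect(cfd, (struct sockaddr *)&addr, sizeof(addr));
		write(cfd, "hi", 2);
		close(cfd);
		_exit(0);
	}

	/* accept() returns once the three-way handshake has completed */
	int fd = accept(lfd, NULL, NULL);			/* sys_accept */
	char buf[8];
	ssize_t n = read(fd, buf, sizeof(buf));
	printf("got %zd bytes\n", n);
	close(fd);
	close(lfd);
	return 0;
}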
III. Tracing the Three-Way Handshake Through the Source
Let us walk through the handshake in detail by reading the source code.
1. First handshake: the client sends the SYN
The client initiates the SYN request by calling connect; the source is as follows:
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;

	/* look up the socket object from the file descriptor */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	/* copy the address from user space into kernel space */
	err = move_addr_to_kernel(uservaddr, addrlen, &address);
	if (err < 0)
		goto out_put;

	/* LSM security hook */
	err = security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
	if (err)
		goto out_put;

	/* For stream sockets, sock->ops is inet_stream_ops --> inet_stream_connect.
	 * For datagram sockets, sock->ops is inet_dgram_ops --> inet_dgram_connect. */
	err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
				 sock->file->f_flags);
out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
}
This function does three things:
(1) look up the socket object for the given file descriptor;
(2) copy the address information from user space into kernel space;
(3) call the connect operation of the specific socket type.
For a stream socket, the connect operation is inet_stream_connect:
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			int addr_len, int flags)
{
	int err;

	lock_sock(sock->sk);
	err = __inet_stream_connect(sock, uaddr, addr_len, flags);
	release_sock(sock->sk);
	return err;
}

/*
 * Connect to a remote host. There is regrettably still a little
 * TCP 'magic' in here.
 */
/* 1. Check the socket address length and the address family.
 * 2. Check the socket state: it must be SS_UNCONNECTED or SS_CONNECTING.
 * 3. Call tcp_v4_connect() to send the SYN.
 * 4. Wait for the rest of the handshake to complete. */
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			  int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	int err;
	long timeo;

	if (addr_len < sizeof(uaddr->sa_family))
		return -EINVAL;

	/* check the address family */
	if (uaddr->sa_family == AF_UNSPEC) {
		err = sk->sk_prot->disconnect(sk, flags);
		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
		goto out;
	}

	switch (sock->state) {
	default:
		err = -EINVAL;
		goto out;
	case SS_CONNECTED:
		/* already connected */
		err = -EISCONN;
		goto out;
	case SS_CONNECTING:
		/* a connection attempt is already in progress */
		err = -EALREADY;
		/* Fall out of switch with err, set for this state */
		break;
	case SS_UNCONNECTED:
		err = -EISCONN;
		if (sk->sk_state != TCP_CLOSE)
			goto out;

		/* For stream sockets: sock->ops is inet_stream_ops -->
		 * inet_stream_connect --> tcp_prot --> tcp_v4_connect.
		 * For datagram sockets: sock->ops is inet_dgram_ops -->
		 * inet_dgram_connect --> udp_prot --> ip4_datagram_connect. */
		err = sk->sk_prot->connect(sk, uaddr, addr_len);
		if (err < 0)
			goto out;

		/* The protocol-level work is done, but the socket layer's own
		 * work is not finished yet, so switch to "connecting". */
		sock->state = SS_CONNECTING;

		/* Just entered SS_CONNECTING state; the only
		 * difference is that return value in non-blocking
		 * case is EINPROGRESS, rather than EALREADY.
		 */
		err = -EINPROGRESS;
		break;
	}

	/* Get the blocking timeout timeo; for a non-blocking socket it is 0.
	 * The connect() timeout is sk->sk_sndtimeo, initialized to
	 * MAX_SCHEDULE_TIMEOUT (wait forever) in sock_init_data(); it can be
	 * changed with the SO_SNDTIMEO socket option. */
	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
				tcp_sk(sk)->fastopen_req &&
				tcp_sk(sk)->fastopen_req->data ? 1 : 0;

		/* Error code is set above */
		/* If the socket is non-blocking, return -EINPROGRESS right away.
		 * If it is blocking, sleep in inet_wait_for_connect(); the sleep
		 * ends in one of three cases:
		 * (1) with SO_SNDTIMEO set, the timeout expires and 0 is
		 *     returned, so connect() returns -EINPROGRESS;
		 * (2) a signal arrives and the remaining wait time is returned,
		 *     so connect() returns -ERESTARTSYS or -EINTR;
		 * (3) the handshake completes and the sock state changes from
		 *     TCP_SYN_SENT or TCP_SYN_RECV to TCP_ESTABLISHED. */
		if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
			goto out;

		err = sock_intr_errno(timeo);
		/* the process caught a signal; if err is -ERESTARTSYS, the C
		 * library will call connect() again */
		if (signal_pending(current))
			goto out;
	}

	/* Connection was closed by RST, timeout, ICMP error
	 * or another process disconnected us.
	 */
	if (sk->sk_state == TCP_CLOSE)
		goto sock_error;

	/* sk->sk_err may be not zero now, if RECVERR was ordered by user
	 * and error was received after socket entered established state.
	 * Hence, it is handled normally after connect() return successfully.
	 */
	/* mark the socket as connected */
	sock->state = SS_CONNECTED;
	/* clear the error code */
	err = 0;
out:
	return err;

sock_error:
	err = sock_error(sk) ? : -ECONNABORTED;
	sock->state = SS_UNCONNECTED;
	/* for TCP, sk_prot is tcp_prot and disconnect is tcp_disconnect() */
	if (sk->sk_prot->disconnect(sk, flags))	/* on failure */
		sock->state = SS_DISCONNECTING;
	goto out;
}
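The -EINPROGRESS path above is exactly what a non-blocking user-space connect observes. A hedged sketch of the usual pattern follows; the helper name and the 5-second timeout are illustrative choices, not kernel or libc APIs:

/* Sketch of non-blocking connect(): mirrors the -EINPROGRESS path in
 * __inet_stream_connect(). Error checks trimmed for brevity. */
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <sys/socket.h>

int connect_nonblock(const struct sockaddr_in *addr)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK);

	if (connect(fd, (const struct sockaddr *)addr, sizeof(*addr)) == 0)
		return fd;		/* immediate success (e.g. loopback) */
	if (errno != EINPROGRESS)
		return -1;		/* real failure */

	/* The SYN is in flight; wait until the socket becomes writable,
	 * i.e. until the handshake finishes (or fails). */
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	poll(&pfd, 1, 5000 /* ms */);

	int err = 0;
	socklen_t len = sizeof(err);
	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
	return err == 0 ? fd : -1;	/* 0 means ESTABLISHED */
}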
For TCP, sk->sk_prot->connect is tcp_v4_connect(), which sets up the route, picks the initial sequence number, and finally calls tcp_connect() to actually build and send the SYN. Here is the source of tcp_connect():
/* Build a SYN and send it off. */
/* Sent via tcp_v4_connect() -> tcp_connect() -> tcp_transmit_skb();
 * the sock is then in TCP_SYN_SENT. */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	/* initialize the connection-related fields of the transmission
	 * control block */
	tcp_connect_init(sk);

	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	/* allocate and initialize an skb for the SYN segment */
	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	/* build the SYN segment; write_seq was already initialized to a
	 * random value in tcp_v4_connect() */
	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
	tp->retrans_stamp = tcp_time_stamp;

	/* add the segment to the send queue */
	tcp_connect_queue_skb(sk, buff);

	/* Explicit Congestion Notification: routers mark two bits in the IP
	 * header when they experience congestion, so the receiver learns on
	 * arrival whether the segment passed through a congested hop. It is
	 * the sender, not the receiver, that must react, so the receiver
	 * echoes the congestion indication back in its next ACK, and the
	 * sender then shrinks its congestion window. */
	tcp_ecn_send_syn(sk, buff);

	/* Send off SYN; include data in Fast Open. */
	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
	      /* build the TCP and IP headers and transmit */
	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	/* start the retransmission timer */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}
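The TCPHDR_SYN value passed to tcp_init_nondata_skb() above is one of the TCP header flag bits. For reference, they are defined in include/net/tcp.h as follows:

/* TCP header flag bits, as defined in include/net/tcp.h (for reference) */
#define TCPHDR_FIN 0x01
#define TCPHDR_SYN 0x02
#define TCPHDR_RST 0x04
#define TCPHDR_PSH 0x08
#define TCPHDR_ACK 0x10
#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80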
2. Second handshake: the server receives and processes the SYN
When a packet reaches the network card, a TCP segment roughly traverses the following call chain:
NIC driver ---> netif_receive_skb() ---> ip_rcv() ---> ip_local_deliver_finish() ---> tcp_v4_rcv()
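tcp_v4_rcv(), which we look at next, reads fields such as th->doff, th->syn, and th->seq directly from the wire-format header. For reference, this is the layout of struct tcphdr from include/uapi/linux/tcp.h (little-endian bitfield variant shown):

/* struct tcphdr from include/uapi/linux/tcp.h (little-endian variant) */
struct tcphdr {
	__be16	source;		/* source port */
	__be16	dest;		/* destination port */
	__be32	seq;		/* sequence number */
	__be32	ack_seq;	/* acknowledgment number */
	__u16	res1:4,
		doff:4,		/* header length in 32-bit words */
		fin:1, syn:1, rst:1, psh:1,
		ack:1, urg:1, ece:1, cwr:1;
	__be16	window;		/* receive window */
	__sum16	check;		/* checksum */
	__be16	urg_ptr;	/* urgent pointer */
};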
Let us look at tcp_v4_rcv() directly:
/*
 *	From tcp_input.c
 */
/* NIC driver --> netif_receive_skb() --> ip_rcv() --> ip_local_deliver_finish() --> tcp_v4_rcv() */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	/* drop packets that are not addressed to this host */
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	/* the packet must be at least as long as a minimal TCP header */
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	/* the TCP header; note that pskb_may_pull() may reallocate
	 * skb->data, which is why this pointer is re-read after each pull */
	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* re-read the TCP header (skb->data may have moved, see above) */
	th = (const struct tcphdr *)skb->data;
	/* the IP header */
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games. */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;

lookup:
	/* Look up the sock object by source port, destination port and the
	 * receiving interface: first in the hash table of established
	 * connections, then, if nothing is found, in the listening hash
	 * table. During connection establishment the match can only come
	 * from the listening table. */
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	/* if no socket will handle this segment, drop it */
	if (!sk)
		goto no_tcp_socket;

process:
	/* TIME_WAIT sockets get separate treatment */
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon. */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	/* re-read the headers once more (tcp_filter() may modify the skb) */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	/* the socket is in the LISTEN state --> this is the path we care
	 * about here */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	/* Check whether a user-space process currently holds the sock lock;
	 * if sock_owned_by_user() is true, the sock state must not be changed
	 * here, so the segment goes to the backlog instead. */
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
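One line above deserves unpacking: the end_seq computation adds th->syn and th->fin because SYN and FIN each consume one sequence number even though they carry no payload byte. A tiny worked example for a bare SYN (20-byte header, no data):

#include <assert.h>
#include <stdint.h>

/* Worked example of TCP_SKB_CB(skb)->end_seq for a pure SYN:
 * end_seq = seq + syn + fin + (skb->len - doff * 4).
 * With seq = J, syn = 1, fin = 0, and no payload, end_seq = J + 1,
 * which is exactly the ack value the peer must send back. */
int main(void)
{
	uint32_t J = 1000, skb_len = 20, doff = 5;	/* bare 20-byte header */
	uint32_t end_seq = J + 1 + 0 + (skb_len - doff * 4);
	assert(end_seq == J + 1);
	return 0;
}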
For a socket in the LISTEN state, tcp_v4_do_rcv() passes the SYN on to tcp_rcv_state_process(), which invokes the listener's conn_request callback (tcp_v4_conn_request, and from there tcp_conn_request); that path ends in tcp_v4_send_synack(), which answers the client with a SYN+ACK:
/* send the SYN+ACK segment to the client */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route back to the client. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	/* build the SYN+ACK segment from the route, the transmission
	 * control block, and the connection request block */
	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	/* the SYN+ACK segment was built successfully */
	if (skb) {
		/* compute the checksum */
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		/* build the IP datagram and send it off */
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}
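At this point the pending connection exists only as a request_sock hanging off the listener; it is promoted to a full sock only after the third-handshake ACK arrives. The number of such pending requests is bounded, among other things, by the tcp_max_syn_backlog sysctl. A small sketch for reading it (the /proc path is standard on Linux; this is an observational aid, not part of the handshake code):

/* Sketch: read the bound on pending (SYN-received) connection requests. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_max_syn_backlog", "r");
	int v = 0;

	if (f && fscanf(f, "%d", &v) == 1)
		printf("tcp_max_syn_backlog = %d\n", v);
	if (f)
		fclose(f);
	return 0;
}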
3. Third handshake: the client receives the SYN+ACK and replies. Back on the client, the segment arrives while the sock is in TCP_SYN_SENT, so tcp_rcv_state_process() (net/ipv4/tcp_input.c) takes the branch below:
	case TCP_SYN_SENT:
		tp->rx_opt.saw_tstamp = 0;
		/* process a TCP segment received in the SYN_SENT state:
		 * validate the SYN+ACK and reply with the final ACK */
		queued = tcp_rcv_synsent_state_process(sk, skb, th);
		if (queued >= 0)
			return queued;

		/* Do step6 onward by hand. */
		/* after handling the second handshake, urgent (out-of-band)
		 * data still has to be processed */
		tcp_urg(sk, skb, th);
		__kfree_skb(skb);
		/* check whether there is data waiting to be sent */
		tcp_data_snd_check(sk);
		return 0;
	}
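tcp_rcv_synsent_state_process() is where the SYN+ACK is validated and the final ACK goes out. A simplified outline of its happy path follows; this is a sketch of the logic as I read it, not the verbatim kernel source:

/* Simplified outline of tcp_rcv_synsent_state_process() -- a sketch of
 * the happy path, not the verbatim kernel source:
 *
 *   if (th->ack) {
 *       1. the ack must cover our SYN: snd_una < ack_seq <= snd_nxt,
 *          otherwise answer with a reset;
 *       2. the segment must itself carry SYN -- it is the SYN+ACK;
 *       3. record the peer's ISN: rcv_nxt = seq + 1;
 *       4. enter ESTABLISHED: tcp_finish_connect(sk, skb);
 *       5. emit the third-handshake ACK: tcp_send_ack(sk).
 *   }
 */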
4. The server receives the ACK segment
Finally, the server must receive the ACK and move to ESTABLISHED; only then is the three-way handshake truly complete. The code below is the TCP_SYN_RECV branch of tcp_rcv_state_process():
	case TCP_SYN_RECV:
		if (!acceptable)
			return 1;

		if (!tp->srtt_us)
			tcp_synack_rtt_meas(sk, req);

		/* Once we leave TCP_SYN_RECV, we no longer need req
		 * so release it.
		 */
		if (req) {
			inet_csk(sk)->icsk_retransmits = 0;
			reqsk_fastopen_remove(sk, req, false);
		} else {
			/* Make sure socket is routed, for correct metrics. */
			/* rebuild the route and initialize the congestion
			 * control module */
			icsk->icsk_af_ops->rebuild_header(sk);
			tcp_init_congestion_control(sk);

			tcp_mtup_init(sk);
			tp->copied_seq = tp->rcv_nxt;
			tcp_init_buffer_space(sk);
		}
		smp_mb();
		/* the normal third handshake: set the connection state
		 * to TCP_ESTABLISHED */
		tcp_set_state(sk, TCP_ESTABLISHED);
		sk->sk_state_change(sk);

		/* Note, that this wakeup is only for marginal crossed SYN case.
		 * Passively open sockets are not waked up, because
		 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
		 */
		/* the state is now established; wake up any waiting threads */
		if (sk->sk_socket)
			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

		if (tp->rx_opt.tstamp_ok)
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

		if (req) {
			/* Re-arm the timer because data may have been sent out.
			 * This is similar to the regular data transmission case
			 * when new data has just been ack'ed.
			 *
			 * (TFO) - we could try to be more aggressive and
			 * retransmitting any data sooner based on when they
			 * are sent out.
			 */
			tcp_rearm_rto(sk);
		} else
			tcp_init_metrics(sk);

		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
			tcp_update_pacing_rate(sk);

		/* Prevent spurious tcp_cwnd_restart() on first data packet */
		/* record the time of the most recently sent packet */
		tp->lsndtime = tcp_time_stamp;

		tcp_initialize_rcv_mss(sk);
		/* compute the flags used for TCP header prediction */
		tcp_fast_path_on(tp);
		break;
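Once this branch has run, the child sock is ESTABLISHED and accept() can hand it to user space. One way to confirm the state from user space is the TCP_INFO socket option; a short sketch (tcpi_state carries the same enum values listed in section I):

/* Sketch: query a socket's kernel TCP state via TCP_INFO. */
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <netinet/tcp.h>	/* struct tcp_info, TCP_INFO, TCP_ESTABLISHED */
#include <stdio.h>
#include <sys/socket.h>

void print_tcp_state(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("tcpi_state = %u (TCP_ESTABLISHED = %u)\n",
		       info.tcpi_state, TCP_ESTABLISHED);
}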