connect
sys_connect调用栈如下:
最终调用inetsw_array的inet_stream_ops中的connect方法。
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; int err; long timeo; lock_sock(sk); switch (sock->state) { default: err = -EINVAL; goto out; case SS_CONNECTED: err = -EISCONN; goto out; case SS_CONNECTING: err = -EALREADY; break; case SS_UNCONNECTED: err = -EISCONN; if (sk->sk_state != TCP_CLOSE) goto out; // tcp_v4_connect err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; // SS_CONNECTING 和 SS_CONNECTED的确别是,在non_blocking中返回值是 EINPROGRESS sock->state = SS_CONNECTING; err = -EINPROGRESS; break; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (!timeo || !inet_wait_for_connect(sk, timeo)) goto out; err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } // 连接被RST if (sk->sk_state == TCP_CLOSE) goto sock_error; /* sk->sk_err may be not zero now, if RECVERR was ordered by user * and error was received after socket entered established state. * Hence, it is handled normally after connect() return successfully. */ sock->state = SS_CONNECTED; err = 0; out: release_sock(sk); return err; }
sk->sk_prot->connect指向tcp_v4_connect():
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct rtable *rt; u32 daddr, nexthop; int tmp; int err; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; if (inet->opt && inet->opt->srr) { if (!daddr) return -EINVAL; nexthop = inet->opt->faddr; } // 查询路由表 tmp = ip_route_connect(&rt, nexthop, inet->saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, inet->sport, usin->sin_port, sk); if (tmp < 0) return tmp; if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } if (!inet->opt || !inet->opt->srr) daddr = rt->rt_dst; if (!inet->saddr) inet->saddr = rt->rt_src; inet->rcv_saddr = inet->saddr; if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; tp->write_seq = 0; } if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { struct inet_peer *peer = rt_get_peer(rt); /* VJ's idea. We save last timestamp seen from * the destination in peer table, when entering state TIME-WAIT * and initialize rx_opt.ts_recent from it, when trying new connection. 
*/ if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } } // 指定目的地址和端口 inet->dport = usin->sin_port; inet->daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; tp->rx_opt.mss_clamp = 536; tcp_set_state(sk, TCP_SYN_SENT); // 给套接字绑定一个源端口,并且hash // 把sk插入到established表中 err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk); if (err) goto failure; sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->u.dst); // 生成ISN if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->saddr, inet->daddr, inet->sport, usin->sin_port); inet->id = tp->write_seq ^ jiffies; // 非常重要! // 构造一个SYN包,然后发送。 err = tcp_connect(sk); rt = NULL; if (err) goto failure; return 0; failure: tcp_set_state(sk, TCP_CLOSE); ip_rt_put(rt); sk->sk_route_caps = 0; inet->dport = 0; return err; }
inet_hash_connect 是个很重要的函数,给sk指定一个源端口,并且插入到established表中,分配源端口的过程中还要判断reuse,tw等等。
以后会重点分析,这里先把connect的流程走完。
int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; tcp_connect_init(sk); // 分配skbuff buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; skb_reserve(buff, MAX_TCP_HEADER); // 构造SYN包 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; skb_shinfo(buff)->gso_segs = 1; skb_shinfo(buff)->gso_size = 0; skb_shinfo(buff)->gso_type = 0; buff->csum = 0; tp->snd_nxt = tp->write_seq; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; // 设置发送的时间 TCP_SKB_CB(buff)->when = tcp_time_stamp; tp->retrans_stamp = TCP_SKB_CB(buff)->when; skb_header_release(buff); // 把要发送的skbuf插入到sk->sk_write_queue尾部 __skb_queue_tail(&sk->sk_write_queue, buff); sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); // 发送skbuff // 发送函数是:icsk->icsk_af_ops = &ipv4_specific; tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); // 定时器的设置!!! inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, Inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; }
如何发送一个tcp包
看一下如何发送一个TCP报文tcp_transmit_skb():
/*
 * Build the TCP header on @skb and hand the segment to the network layer.
 *
 * @sk:       owning socket
 * @skb:      segment to send; must be non-NULL with a non-zero pcount
 * @clone_it: when non-zero, a clone (or a copy, if @skb is already cloned)
 *            is transmitted and @skb itself is left untouched
 * @gfp_mask: allocation flags for the clone/copy
 *
 * Returns -ENOBUFS if the clone/copy fails; otherwise the result of
 * icsk_af_ops->queue_xmit() when it is <= 0. For a positive result
 * tcp_enter_cwr() is called and NET_XMIT_CN is reported as 0, any other
 * value is returned unchanged.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			    int clone_it, gfp_t gfp_mask)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	int tcp_header_size;
	struct tcphdr *th;
	int sysctl_flags;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));

	/* Timestamp before cloning if the congestion module samples RTT. */
	if (icsk->icsk_ca_ops->rtt_sample)
		__net_timestamp(skb);

	if (likely(clone_it)) {
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	tcp_header_size = tp->tcp_header_len;

	sysctl_flags = 0;
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* SYN segment: size the header for the options to advertise. */
		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
		if (sysctl_tcp_timestamps) {
			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
		}
		if (sysctl_tcp_window_scaling) {
			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_WSCALE;
		}
		if (sysctl_tcp_sack) {
			sysctl_flags |= SYSCTL_FLAG_SACK;
			/* SACK-permitted only costs extra bytes when timestamps are off. */
			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		}
	} else if (unlikely(tp->rx_opt.eff_sacks)) {
		/* Non-SYN with SACK blocks pending: base plus per-block size. */
		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
				    (tp->rx_opt.eff_sacks *
				     TCPOLEN_SACK_PERBLOCK));
	}

	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	/* Build the TCP header. */
	th = (struct tcphdr *) skb_push(skb, tcp_header_size);
	skb->h.th = th;
	skb_set_owner_w(skb, sk);

	th->source = inet->sport;
	th->dest = inet->dport;
	th->seq = htonl(tcb->seq);
	th->ack_seq = htonl(tp->rcv_nxt);
	/*
	 * Seventh 16-bit word of the header: header length in 32-bit words
	 * in the top four bits, OR'ed with the control flags.
	 */
	*(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
				     tcb->flags);

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* On a SYN the raw rcv_wnd is advertised, not tcp_select_window(). */
		th->window = htons(tp->rcv_wnd);
	} else {
		th->window = htons(tcp_select_window(sk));
	}
	th->check = 0;
	th->urg_ptr = 0;

	if (unlikely(tp->urg_mode &&
		     between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
		th->urg_ptr = htons(tp->snd_up-tcb->seq);
		th->urg = 1;
	}

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* Write the TCP options for a SYN. */
		tcp_syn_build_options((__u32 *)(th + 1),
				      tcp_advertise_mss(sk),
				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
				      (sysctl_flags & SYSCTL_FLAG_SACK),
				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
				      tp->rx_opt.rcv_wscale,
				      tcb->when,
				      tp->rx_opt.ts_recent);
	} else {
		tcp_build_and_update_options((__u32 *)(th + 1),
					     tp, tcb->when);
		TCP_ECN_send(sk, tp, skb, tcp_header_size);
	}

	icsk->icsk_af_ops->send_check(sk, skb->len, skb);

	if (likely(tcb->flags & TCPCB_FLAG_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

	/* Anything beyond the header means payload is being sent. */
	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, skb, sk);

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_INC_STATS(TCP_MIB_OUTSEGS);

	/*
	 * Call the lower-layer transmit function. Here
	 * icsk->icsk_af_ops = &ipv4_specific, which the article says is
	 * set up in tcp_v4_init.
	 */
	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
	if (likely(err <= 0))
		return err;

	tcp_enter_cwr(sk);

	return err == NET_XMIT_CN ? 0 : err;
}
把TCP的SYN包交给IP层,由ip_queue_xmit()处理:
int ip_queue_xmit(struct sk_buff *skb, int ipfragok) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = inet->opt; struct rtable *rt; struct iphdr *iph; rt = (struct rtable *) skb->dst; if (rt != NULL) goto packet_routed; rt = (struct rtable *)__sk_dst_check(sk, 0); if (rt == NULL) { u32 daddr; daddr = inet->daddr; if(opt && opt->srr) daddr = opt->faddr; { struct flowi fl = { .oif = sk->sk_bound_dev_if, .nl_u = { .ip4_u = { .daddr = daddr, .saddr = inet->saddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, .uli_u = { .ports = { .sport = inet->sport, .dport = inet->dport } } }; if (ip_route_output_flow(&rt, &fl, sk, 0)) goto no_route; } sk_setup_caps(sk, &rt->u.dst); } skb->dst = dst_clone(&rt->u.dst); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); iph->tot_len = htons(skb->len); if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->u.dst); iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; skb->nh.iph = iph; if (opt && opt->optlen) { iph->ihl += opt->optlen >> 2; ip_options_build(skb, opt, inet->daddr, rt, 0); } ip_select_ident_more(iph, &rt->u.dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); ip_send_check(iph); skb->priority = sk->sk_priority; // 注意: 这个地方会调用netfilter的机制 // dst_output在路由系统中的mkroute_output中设置为ip_output() return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); no_route: IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; }
skbuf依次穿过各层协议,调用链如下:
dst_output() -> ip_output() -> ip_finish_output2() -> dev_queue_xmit():
/*
 * Abbreviated excerpt from the body of dev_queue_xmit(); declarations, the
 * `out` label and the surrounding code are not shown.
 */
q = rcu_dereference(dev->qdisc);
if (q->enqueue) {
	/* The device has a queueing discipline: enqueue under queue_lock. */
	spin_lock(&dev->queue_lock);
	rc = q->enqueue(skb, q);
	qdisc_run(dev);
	spin_unlock(&dev->queue_lock);
	/* NET_XMIT_BYPASS is reported to the caller as success. */
	rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
	goto out;
}
/* No enqueue hook: the excerpt calls the driver's transmit routine directly. */
dev->hard_start_xmit(skb, dev)
可以看到skbuf被插入到了net_device的qdisc中,最后调用驱动hard_start_xmit。
hard_start_xmit是各个网卡驱动里实现的回调。
大致的过程为:在net_device->base_addr上把要发送的skbuf按照驱动的CMD和字节顺序拼接上去:
比如'TX_CMD_PORT:send_cmd|TX_LEN_PORT:skb->len|TX_FRAME_PORT:skb->data'
这样驱动拿到这块字节就知道如何解析,该怎么发送了。驱动的代码是不是很简单?
到这里可以认为这个SYN包已经发出去了(只是发送到驱动的buf里),下面看看tcp_connect在发送出去一个包后,启动的定时器。
定时器
tcp_connect() { // 发送skbuff // 发送函数是:icsk->icsk_af_ops = &ipv4_specific; tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); // 定时器的设置!!! inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); }
tcp 中有4种定时器:
/* Identifiers for the connection-socket timers (stored in icsk_pending). */
#define ICSK_TIME_RETRANS	1	/* Retransmit timer */
#define ICSK_TIME_DACK		2	/* Delayed ack timer */
#define ICSK_TIME_PROBE0	3	/* Zero window probe timer */
#define ICSK_TIME_KEEPOPEN	4	/* Keepalive timer */
在发送tcp数据中用到的是重传定时器。
定时器的设置是在套接字初始化过程中进行的:
/*
 * Register the TCP timer handlers on @sk by passing tcp_write_timer,
 * tcp_delack_timer and tcp_keepalive_timer to inet_csk_init_xmit_timers().
 * NOTE(review): by argument order these look like the retransmit,
 * delayed-ACK and keepalive handlers respectively; confirm against
 * inet_csk_init_xmit_timers().
 */
void tcp_init_xmit_timers(struct sock *sk)
{
	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
				  &tcp_keepalive_timer);
}
可见,重传定时器的回调是tcp_write_timer:
static void tcp_write_timer(unsigned long data) { struct sock *sk = (struct sock*)data; struct inet_connection_sock *icsk = inet_csk(sk); int event; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); goto out_unlock; } if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) goto out; if (time_after(icsk->icsk_timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); goto out; } event = icsk->icsk_pending; icsk->icsk_pending = 0; // 根据event类型决定如何处理重传 switch (event) { case ICSK_TIME_RETRANS: // tcp重传的逻辑,涉及TCP协议细节,后面专题分析 // TODO tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: tcp_probe_timer(sk); break; } TCP_CHECK_TIMER(sk); out: sk_stream_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); }
到此,connect发送一个SYN包的逻辑已经全部完成,下面是接收TCP包。