Linux网络解读(7) - TCP之connect

connect

sys_connect调用栈如下:

sys_connect() -> sock->ops->connect(),即 inet_stream_connect() -> sk->sk_prot->connect(),即 tcp_v4_connect()

最终调用inetsw_array的inet_stream_ops中的connect方法。

int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        int addr_len, int flags)
{
        struct sock *sk = sock->sk;
        int err;
        long timeo;
        lock_sock(sk);
        switch (sock->state) {
        default:
                err = -EINVAL;
                goto out;
        case SS_CONNECTED:
                err = -EISCONN;
                goto out;
        case SS_CONNECTING:
                err = -EALREADY;
                break;
        case SS_UNCONNECTED:
                err = -EISCONN;
                if (sk->sk_state != TCP_CLOSE)
                        goto out;
                        
                // tcp_v4_connect        
                err = sk->sk_prot->connect(sk, uaddr, addr_len);
                if (err < 0)
                        goto out;
                        
                // SS_CONNECTING 和 SS_CONNECTED的确别是,在non_blocking中返回值是 EINPROGRESS
                  sock->state = SS_CONNECTING;
                err = -EINPROGRESS;
                break;
        }
        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                if (!timeo || !inet_wait_for_connect(sk, timeo))
                        goto out;
                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
        }
        // 连接被RST
        if (sk->sk_state == TCP_CLOSE)
                goto sock_error;
        /* sk->sk_err may be not zero now, if RECVERR was ordered by user
         * and error was received after socket entered established state.
         * Hence, it is handled normally after connect() return successfully.
         */
        sock->state = SS_CONNECTED;
        err = 0;
out:
        release_sock(sk);
        return err;
}

sk->sk_prot->connect指向tcp_v4_connect():

/*
 * Start an outgoing IPv4 TCP connection on sk.
 *
 * Validates the destination address, looks up a route, fills in the
 * source/destination address and port on the socket, moves the socket to
 * TCP_SYN_SENT, picks and hashes a source port, chooses the initial sequence
 * number and finally calls tcp_connect() to send the SYN.
 *
 * Returns 0 on success or a negative errno; on the failure path the socket
 * is put back into TCP_CLOSE and inet->dport is cleared.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        u32 daddr, nexthop;
        int tmp;
        int err;
        /* Reject addresses that are too short or not AF_INET. */
        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;
        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;
        nexthop = daddr = usin->sin_addr.s_addr;
        /* With the srr IP option set, route towards opt->faddr instead of the destination. */
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }
        // Look up the routing table
        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk);
        if (tmp < 0)
                return tmp;
        /* TCP cannot connect to multicast or broadcast routes. */
        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }
        /* Without srr, use the destination chosen by the route. */
        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;
        /* No source address set yet: take the one from the route. */
        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;
        /* Connecting to a different destination than before: reset timestamp state and write_seq. */
        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                tp->rx_opt.ts_recent           = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq                   = 0;
        }
        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /* VJ's idea. We save last timestamp seen from
                 * the destination in peer table, when entering state TIME-WAIT
                 * and initialize rx_opt.ts_recent from it, when trying new connection.
                 */
                if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }
        // Record the destination address and port
        inet->dport = usin->sin_port;
        inet->daddr = daddr;
        /* Extension header length is the IP options length, or 0 without options. */
        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
        /* NOTE(review): 536 is presumably the default TCP MSS (576 - 40) — confirm. */
        tp->rx_opt.mss_clamp = 536;
        tcp_set_state(sk, TCP_SYN_SENT);
        
        // Bind a source port to the socket and hash it:
        // sk is inserted into the established table
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;
        /* Re-resolve the route now that the source port is known. */
        err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk);
        if (err)
                goto failure;
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);
        // Generate the initial sequence number (ISN) unless write_seq is already set
        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);
        inet->id = tp->write_seq ^ jiffies;
        // Key step:
        // build a SYN segment and send it.
        err = tcp_connect(sk);
        /*
         * rt is cleared before the error check, so the failure path below
         * calls ip_rt_put(NULL) in this case.
         * NOTE(review): presumably the route reference was handed to the
         * socket by sk_setup_caps() above — confirm.
         */
        rt = NULL;
        if (err)
                goto failure;
        return 0;
failure:
        /* Undo the state change and forget the destination port. */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}

inet_hash_connect 是个很重要的函数:它给sk指定一个源端口,并把sk插入到established表中;分配源端口的过程中还要判断reuse、tw等情况。

以后会重点分析,这里先把connect的流程走完。

int tcp_connect(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
        tcp_connect_init(sk);
        // 分配skbuff
        buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
        if (unlikely(buff == NULL))
                return -ENOBUFS;
                
        skb_reserve(buff, MAX_TCP_HEADER);
        
        // 构造SYN包
        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_ECN_send_syn(sk, tp, buff);
        TCP_SKB_CB(buff)->sacked = 0;
        skb_shinfo(buff)->gso_segs = 1;
        skb_shinfo(buff)->gso_size = 0;
        skb_shinfo(buff)->gso_type = 0;
        buff->csum = 0;
        tp->snd_nxt = tp->write_seq;
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
        // 设置发送的时间
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
        tp->retrans_stamp = TCP_SKB_CB(buff)->when;
        skb_header_release(buff);
        // 把要发送的skbuf插入到sk->sk_write_queue尾部
        __skb_queue_tail(&sk->sk_write_queue, buff);
        
        sk_charge_skb(sk, buff);
        tp->packets_out += tcp_skb_pcount(buff);
        // 发送skbuff
        // 发送函数是:icsk->icsk_af_ops = &ipv4_specific;
        tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
        tp->snd_nxt = tp->write_seq;
        tp->pushed_seq = tp->write_seq;
        TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
        
        // 定时器的设置!!!
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                  Inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
        return 0;
}

如何发送一个tcp包

看一下如何发送一个TCP报文tcp_transmit_skb():

/*
 * Build the TCP header in front of skb and pass the segment to the
 * address-family transmit hook (icsk->icsk_af_ops->queue_xmit).
 *
 * sk:       owning socket.
 * skb:      segment to send; its control block (TCP_SKB_CB) supplies seq,
 *           flags and the send timestamp.
 * clone_it: when non-zero, a clone (or a copy if skb is already cloned) is
 *           sent and the caller's skb is left untouched.
 * gfp_mask: allocation flags for the clone/copy.
 *
 * Returns -ENOBUFS if cloning fails, the queue_xmit result when it is <= 0,
 * and for positive results 0 if it equals NET_XMIT_CN, else the result
 * itself (after calling tcp_enter_cwr()).
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_sock *inet;
        struct tcp_sock *tp;
        struct tcp_skb_cb *tcb;
        int tcp_header_size;
        struct tcphdr *th;
        int sysctl_flags;
        int err;
        BUG_ON(!skb || !tcp_skb_pcount(skb));
        /* Timestamp the skb only if the congestion control module has an rtt_sample hook. */
        if (icsk->icsk_ca_ops->rtt_sample)
                __net_timestamp(skb);
        if (likely(clone_it)) {
                /* Already cloned: take a private copy; otherwise a clone is enough. */
                if (unlikely(skb_cloned(skb)))
                        skb = pskb_copy(skb, gfp_mask);
                else
                        skb = skb_clone(skb, gfp_mask);
                if (unlikely(!skb))
                        return -ENOBUFS;
        }
        inet = inet_sk(sk);
        tp = tcp_sk(sk);
        tcb = TCP_SKB_CB(skb);
        tcp_header_size = tp->tcp_header_len;
        sysctl_flags = 0;
        if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
                // SYN segment: size the header for the options enabled via sysctl
                tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
                if(sysctl_tcp_timestamps) {
                        tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
                        sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                }
                if (sysctl_tcp_window_scaling) {
                        tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
                        sysctl_flags |= SYSCTL_FLAG_WSCALE;
                }
                if (sysctl_tcp_sack) {
                        sysctl_flags |= SYSCTL_FLAG_SACK;
                        /*
                         * Extra space for SACK-permitted is only added when
                         * timestamps are off.
                         * NOTE(review): presumably it shares the timestamp
                         * option's padding otherwise — confirm.
                         */
                        if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
                                tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                }
        } else if (unlikely(tp->rx_opt.eff_sacks)) {
                /* Non-SYN segment with SACK blocks to report: make room for them. */
                tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
                                    (tp->rx_opt.eff_sacks *
                                     TCPOLEN_SACK_PERBLOCK));
        }
                
        /* Nothing in flight: tell congestion control that transmission starts. */
        if (tcp_packets_in_flight(tp) == 0)
                tcp_ca_event(sk, CA_EVENT_TX_START);
        // Build the TCP header
        th = (struct tcphdr *) skb_push(skb, tcp_header_size);
        skb->h.th = th;
        skb_set_owner_w(skb, sk);
        th->source                = inet->sport;
        th->dest                = inet->dport;
        th->seq                        = htonl(tcb->seq);
        th->ack_seq                = htonl(tp->rcv_nxt);
        /*
         * 16-bit word at byte offset 12 of the header: header length in
         * 32-bit words in the top four bits, control flags in the low bits.
         */
        *(((__u16 *)th) + 6)        = htons(((tcp_header_size >> 2) << 12) |
                                        tcb->flags);
        /* A SYN advertises tp->rcv_wnd directly; other segments go through tcp_select_window(). */
        if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
                th->window        = htons(tp->rcv_wnd);
        } else {
                th->window        = htons(tcp_select_window(sk));
        }
        th->check                = 0;
        th->urg_ptr                = 0;
        /* In urgent mode, set the urgent pointer if snd_up is within 0xFFFF of this segment's seq. */
        if (unlikely(tp->urg_mode &&
                     between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
                th->urg_ptr                = htons(tp->snd_up-tcb->seq);
                th->urg                        = 1;
        }
        if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
                // Write the SYN's TCP options right after the fixed header
                tcp_syn_build_options((__u32 *)(th + 1),
                                      tcp_advertise_mss(sk),
                                      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
                                      (sysctl_flags & SYSCTL_FLAG_SACK),
                                      (sysctl_flags & SYSCTL_FLAG_WSCALE),
                                      tp->rx_opt.rcv_wscale,
                                      tcb->when,
                                      tp->rx_opt.ts_recent);
        } else {
                tcp_build_and_update_options((__u32 *)(th + 1),
                                             tp, tcb->when);
                TCP_ECN_send(sk, tp, skb, tcp_header_size);
        }
        /* Checksum via the address-family specific hook. */
        icsk->icsk_af_ops->send_check(sk, skb->len, skb);
        if (likely(tcb->flags & TCPCB_FLAG_ACK))
                tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
        /* skb->len larger than the header means the segment carries payload. */
        if (skb->len != tcp_header_size)
                tcp_event_data_sent(tp, skb, sk);
        if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
                TCP_INC_STATS(TCP_MIB_OUTSEGS);
        // Call the lower-layer transmit function.
        // Here icsk->icsk_af_ops = &ipv4_specific,
        // which is set up in tcp_v4_init
        err = icsk->icsk_af_ops->queue_xmit(skb, 0);
        if (likely(err <= 0))
                return err;
        /* Positive result: enter CWR; NET_XMIT_CN is reported as success. */
        tcp_enter_cwr(sk);
        return err == NET_XMIT_CN ? 0 : err;
}

接下来由ip_queue_xmit()把TCP的SYN包交给IP层:

/*
 * Route skb (if it is not routed yet), prepend and fill in the IPv4 header,
 * and hand the packet to the NF_IP_LOCAL_OUT netfilter hook with dst_output
 * as the continuation.
 *
 * skb:      packet to send; skb->sk is the owning socket.
 * ipfragok: when zero and ip_dont_fragment() says so, the DF bit is set.
 *
 * Returns the NF_HOOK result, or -EHOSTUNREACH (after freeing skb) when no
 * usable route exists.
 */
int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;
        /* The skb already carries a route: skip the lookup. */
        rt = (struct rtable *) skb->dst;
        if (rt != NULL)
                goto packet_routed;
        /* Try the route cached on the socket first. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                u32 daddr;
                /* Route towards opt->faddr instead of the destination when srr is set. */
                daddr = inet->daddr;
                if(opt && opt->srr)
                        daddr = opt->faddr;
                {
                        /* Flow key built from the socket's addresses, ports, tos and protocol. */
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);
packet_routed:
        /* Strict source routing requires the destination to be the gateway itself. */
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;
        /* OK, we know where to send it, allocate and build IP header. */
        iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        /* First 16 bits of the header: version 4, header length 5 words, then the TOS byte. */
        *((__u16 *)iph)        = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        skb->nh.iph   = iph;
        /* With IP options present, grow ihl by the option length in 32-bit words and write them. */
        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }
        ip_select_ident_more(iph, &rt->u.dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);
        ip_send_check(iph);
        skb->priority = sk->sk_priority;
        // Note: this is where the netfilter machinery is invoked.
        // dst_output is set to ip_output() by mkroute_output in the routing code
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
no_route:
        /* Count the failure, drop the packet and report the host as unreachable. */
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}

skbuf依次穿过各层协议,调用链如下:

dst_output() -> ip_output() -> ip_finish_output2() -> dev_queue_xmit():

/* Abridged excerpt from the body of dev_queue_xmit(); not a complete function. */
q = rcu_dereference(dev->qdisc);
        if (q->enqueue) {
                /* Enqueue on the device's qdisc and run it while holding queue_lock. */
                spin_lock(&dev->queue_lock);
                
                rc = q->enqueue(skb, q);
                qdisc_run(dev);
                spin_unlock(&dev->queue_lock);
                /* NET_XMIT_BYPASS is reported to the caller as NET_XMIT_SUCCESS. */
                rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
                goto out;
        }
        /* The qdisc has no enqueue method: call the driver's transmit hook directly. */
        dev->hard_start_xmit(skb, dev)

可以看到skbuf被插入到了net_device的qdisc中,最后调用驱动hard_start_xmit。

hard_start_xmit是各个网卡驱动里实现的回调。

大致的过程为:在net_device->base_addr上把要发送的skbuf按照驱动的CMD和字节顺序拼接上去:

比如'TX_CMD_PORT:send_cmd|TX_LEN_PORT:skb->len|TX_FRAME_PORT:skb->data'

这样驱动拿到这块字节就知道如何解析,该怎么发送了。驱动的代码是不是很简单。

到这里可以认为这个SYN包已经发出去了(只是发送到驱动的buf里),下面看看tcp_connect在发送出去一个包后,启动的定时器。

定时器

/* Abridged excerpt: the tail of tcp_connect(), repeated to show where the timer is armed. */
tcp_connect()
{
        // Transmit the skbuff.
        // The transmit ops come from icsk->icsk_af_ops = &ipv4_specific;
        tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
        tp->snd_nxt = tp->write_seq;
        tp->pushed_seq = tp->write_seq;
        TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
        
        // Arm the retransmission timer with the current RTO, capped at TCP_RTO_MAX
         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}

tcp 中有4种定时器:

/* Timer event identifiers stored in icsk_pending and passed to inet_csk_reset_xmit_timer(). */
#define ICSK_TIME_RETRANS       1       /* Retransmit timer */
#define ICSK_TIME_DACK          2       /* Delayed ack timer */
#define ICSK_TIME_PROBE0        3       /* Zero window probe timer */
#define ICSK_TIME_KEEPOPEN      4       /* Keepalive timer */

在发送tcp数据中用到的是重传定时器。

定时器的设置是在套接字初始化过程中进行的:

/*
 * Register TCP's timer callbacks for sk: tcp_write_timer, tcp_delack_timer
 * and tcp_keepalive_timer are passed, in that order, to
 * inet_csk_init_xmit_timers().
 */
void tcp_init_xmit_timers(struct sock *sk)
{
        inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
                                  &tcp_keepalive_timer);
}

可见,重传定时器的回调是tcp_write_timer:

/*
 * Timer callback for the socket's retransmit timer.
 *
 * data carries the struct sock pointer. The pending event recorded in
 * icsk_pending selects the handler: retransmission or zero-window probe.
 * The sock reference is dropped with sock_put() before returning.
 */
static void tcp_write_timer(unsigned long data)
{
        struct sock *sk = (struct sock*)data;
        struct inet_connection_sock *icsk = inet_csk(sk);
        int pending;

        bh_lock_sock(sk);

        if (sock_owned_by_user(sk)) {
                /* Socket is busy in process context: retry HZ / 20 jiffies from now. */
                sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
        } else {
                if (sk->sk_state != TCP_CLOSE && icsk->icsk_pending) {
                        if (time_after(icsk->icsk_timeout, jiffies)) {
                                /* Deadline not reached yet: re-arm for the recorded timeout. */
                                sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
                        } else {
                                /* Consume the pending event and dispatch on its type. */
                                pending = icsk->icsk_pending;
                                icsk->icsk_pending = 0;

                                if (pending == ICSK_TIME_RETRANS) {
                                        /* TCP retransmission logic lives in tcp_retransmit_timer(). */
                                        tcp_retransmit_timer(sk);
                                } else if (pending == ICSK_TIME_PROBE0) {
                                        tcp_probe_timer(sk);
                                }
                                TCP_CHECK_TIMER(sk);
                        }
                }
                sk_stream_mem_reclaim(sk);
        }

        bh_unlock_sock(sk);
        sock_put(sk);
}

到此,connect发送一个SYN包的逻辑已经全部完成,下面是接收TCP包。

上一篇:seastar的用户态协议栈


下一篇:Thrift在C++中的使用