Linux内核TCP三次握手过程及在wireshark中的观测

Linux内核TCP三次握手过程及在wireshark中的观测

0.前言

TCP三次握手过程

从应用角度看TCP三次握手过程

图片出处:[内核源码] 网络协议栈 - tcp 三次握手状态

1. Linux内核中TCP三次握手的过程

内核版本:5.10.104

1.1 客户端connect(SYN的发出)

客户端使用socket API中的connect函数来进行TCP三次握手。

1
2
3
4
5
6
7
// net/socket.c
// connect(2) system call entry point; for a TCP socket this is where the
// three-way handshake is initiated. It forwards fd, uservaddr and addrlen
// unchanged to __sys_connect() and returns that function's result.
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
int, addrlen)
{
return __sys_connect(fd, uservaddr, addrlen);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
// net/socket.c
// Excerpt of __sys_connect(): copies the caller's address into kernel space
// and, when that copy returns 0, passes the socket's file on to
// __sys_connect_file(). Lines elided by the article are marked "// ...".
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
// ...
// Copy the address from user space into kernel space
ret = move_addr_to_kernel(uservaddr, addrlen, &address);
if (!ret)
ret = __sys_connect_file(f.file, &address, addrlen, 0); // start connecting
fdput(f);
// ...
}

// Excerpt of __sys_connect_file(): calls security_socket_connect() and, when
// it returns 0, dispatches to the socket's connect handler through
// sock->ops->connect. Returns err from whichever of the two calls ran last.
int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
int addrlen, int file_flags)
{
// ...
err =
security_socket_connect(sock, (struct sockaddr *)address, addrlen); // not enabled (article author's note)
if (err)
goto out;
// Function pointer: for the TCP protocol this selects TCP's connect handler
err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen,
sock->file->f_flags | file_flags);
out:
return err;
}

我们选择的是TCP协议,因此sock->ops->connect指向的是inet_stream_connect,它最终通过sk->sk_prot->connect调用到tcp_prot中注册的tcp_v4_connect,代码如下。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// net/ipv4/tcp_ipv4.c
// Excerpt of the TCP "struct proto" operations table: each designated
// initializer binds one socket operation to its TCP implementation.
// Further members are elided by the article ("// ...").
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.pre_connect = tcp_v4_pre_connect,
.connect = tcp_v4_connect, // connect handler
.disconnect = tcp_disconnect,
.accept = inet_csk_accept, // accept handler
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
// ...
}

tcp_v4_connect中,通过tcp_set_state设置socket状态为TCP_SYN_SENT,并在tcp_connect中构造完整报文,并发送。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// net/ipv4/tcp_ipv4.c
/* This will initiate an outgoing connection. */
// Excerpt: only the state transition and the call that emits the SYN are
// shown; everything else is elided by the article ("// ...").
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
// ...
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
* lock select source port, enter ourselves into the hash tables and
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT); // move the socket into the TCP_SYN_SENT state
// ...
err = tcp_connect(sk); // build the complete SYN segment and send it
// ...
}

tcp_connect中,构造了一个SYN包;如果设置了Fast Open请求(tp->fastopen_req),则通过tcp_send_syn_data函数发送,否则直接调用tcp_transmit_skb发送,最后启动重传定时器。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// net/ipv4/tcp_output.c
/* Build a SYN and send it off. */
// Excerpt. In the visible code the function returns -ENOBUFS when the skb
// cannot be allocated, returns err when transmission reports -ECONNREFUSED,
// and otherwise arms the retransmission timer and returns 0.
int tcp_connect(struct sock *sk)
{
// ...
// Allocate an skb; it is built into the SYN packet
buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
if (unlikely(!buff))
return -ENOBUFS;
// ...
/* Send off SYN; include data in Fast Open. */
// Transmit the SYN: through tcp_send_syn_data() when a Fast Open request
// (tp->fastopen_req) is set, otherwise directly through tcp_transmit_skb()
err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
if (err == -ECONNREFUSED)
return err;
// ...
/* Timer for repeating the SYN until an answer. */
// Arm the retransmission timer
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}
EXPORT_SYMBOL(tcp_connect);

tcp_send_syn_data中同样使用tcp_transmit_skb函数将包发送出去。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
// net/ipv4/tcp_output.c
/* Build and send a SYN with data and (cached) Fast Open cookie. However,
* queue a data-only packet after the regular SYN, such that regular SYNs
* are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
* only the SYN sequence, the data are retransmitted in the first ACK.
* If cookie is not cached or other error occurs, falls back to send a
* regular SYN with Fast Open cookie request option.
*/
// Excerpt: the construction of syn_data and the branches between the
// statements below are elided by the article ("// ...").
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
// ...
// Send the SYN segment (syn_data)
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
// ...
/* data was not sent, put it in write_queue */
__skb_queue_tail(&sk->sk_write_queue, syn_data); // not sent: append to the socket's write queue
// Subtract syn_data's packet count from packets_out
tp->packets_out -= tcp_skb_pcount(syn_data);
}

tcp_transmit_skb调用到__tcp_transmit_skb,完成TCP header的构建后,调用ip_queue_xmit,移交IP层处理。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// net/ipv4/tcp_output.c
// Excerpt of __tcp_transmit_skb(): chooses how the TCP options are built
// (SYN vs. non-SYN segment), lets BPF write header options, then passes the
// skb to the address-family queue_xmit handler.
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
// ...
if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
// SYN packet of a connection being set up: build its TCP options
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
} else {
// Already-established connection: build its TCP options
tcp_options_size = tcp_established_options(sk, skb, &opts,
&md5);
// ... (the rest of the else branch, including its closing brace, is elided)
/* BPF prog is the last one writing header option */
// BPF can also be used to write TCP options (when the relevant config is enabled)
bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
// ...
err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
inet6_csk_xmit, ip_queue_xmit, // hand the packet to the IP layer
sk, skb, &inet->cork.fl);
}

1.2 服务端accept(SYN+ACK的发出)

==TODO==

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
// net/ipv4/tcp_ipv4.c
/* The socket must have its spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
// Excerpt: the declaration of rsk and the "discard" / "reset" labels targeted
// by the gotos below are elided by the article.
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
// ...
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v4_cookie_check(sk, skb);

if (!nsk)
goto discard;
// A socket other than the listener was returned: process the skb on it
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) { // passive (listening) mode
rsk = nsk;
goto reset;
}
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);

// Run the per-state receive processing on sk; a non-zero result leads to "reset"
if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
return 0;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
// net/ipv4/tcp_minisocks.c
/*
* Queue segment on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket.
*
* For the vast majority of cases child->sk_state will be TCP_SYN_RECV
* when entering. But other states are possible due to a race condition
* where after __inet_lookup_established() fails but before the listener
* locked is obtained, other packets cause the same connection to
* be created.
*
* Returns the result of tcp_rcv_state_process() when the child is not
* owned by a user context, and 0 when the skb was put on the child's
* backlog instead. The child is unlocked and its reference dropped
* before returning.
*/

int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb)
__releases(&((child)->sk_lock.slock))
{
int ret = 0;
/* Remember the state on entry so a state change can be detected below */
int state = child->sk_state;

/* record NAPI ID of child */
sk_mark_napi_id(child, skb);

tcp_segs_in(tcp_sk(child), skb);
if (!sock_owned_by_user(child)) {
ret = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
/* Only when the child entered in TCP_SYN_RECV and has left that state */
if (state == TCP_SYN_RECV && child->sk_state != state)
parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
* socket does not protect us more.
*/
__sk_add_backlog(child, skb);
}

bh_unlock_sock(child);
sock_put(child);
return ret;
}
EXPORT_SYMBOL(tcp_child_process);

tcp_rcv_state_process中的函数tcp_rcv_synsent_state_process中, 通过tcp_parse_options解析TCP options。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
// net/ipv4/tcp_input.c
/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
* address independent.
*/
// Excerpt: only the TCP_SYN_SENT case is shown; the enclosing switch and the
// declarations of tp, th and queued are elided by the article.

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
// ...
case TCP_SYN_SENT:
// Clear the "timestamp seen" flag and refresh the socket's timestamp
// before handing the segment to the SYN-SENT handler
tp->rx_opt.saw_tstamp = 0;
tcp_mstamp_refresh(tp);
queued = tcp_rcv_synsent_state_process(sk, skb, th);
}

// Excerpt of the handler invoked for the TCP_SYN_SENT state: only the TCP
// option parsing step is shown; the rest is elided by the article.
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
{
// ...
// Parse the TCP options of the received segment into tp->rx_opt
tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
// If a timestamp option was seen and its echo reply is non-zero,
// subtract the socket's tsoffset from the echoed value
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
}

send_synack的实际执行函数为tcp_v4_send_synack

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// net/ipv4/tcp_ipv4.c
// IPv4 request-sock operations table for TCP. The MD5 and SYN-cookie members
// are compiled in only when CONFIG_TCP_MD5SIG / CONFIG_SYN_COOKIES are set.
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
.req_md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
#endif
.init_req = tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
.cookie_init_seq = cookie_v4_init_sequence,
#endif
.route_req = tcp_v4_route_req,
.init_seq = tcp_v4_init_seq,
.init_ts_off = tcp_v4_init_ts_off,
.send_synack = tcp_v4_send_synack, // SYN-ACK handler
};

构造 SYN-ACK包,并发出。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// net/ipv4/tcp_ipv4.c
/*
* Send a SYN-ACK after having received a SYN.
* This still operates on a request_sock only, not on a big
* socket.
*/
// Excerpt: only the two key steps are shown (build the SYN-ACK skb, then pass
// it to ip_build_and_send_pkt()); the rest is elided by the article.
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
enum tcp_synack_type synack_type,
struct sk_buff *syn_skb)
{
// ...
// Build the SYN-ACK packet
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
// ...
// Send it from the request's local address to its remote address
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
rcu_dereference(ireq->ireq_opt),
tos);
// ...
}

1.3 服务端accept(客户端 ACK的发出)

==TODO==

1
2
3
4
5
6
7
8
9
10
11
// net/ipv4/tcp_ipv4.c
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
outside socket context is ugly, certainly. What can I do?
*/
// Excerpt: only the signature is quoted; the function body is not included
// in the article.

static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
{

2. 在wireshark中观测三次握手

客户端向服务端发起连接,发送SYN(第一次)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
Transmission Control Protocol, Src Port: 49630, Dst Port: 19780, Seq: 0, Len: 0
Source Port: 49630 # 源端口 客户端
Destination Port: 19780 # 目标端口 服务端
[Stream index: 4]
[TCP Segment Len: 0]
Sequence number: 0 (relative sequence number) # 相对序列号
Sequence number (raw): 3180787438 # 序列号
[Next sequence number: 1 (relative sequence number)]
Acknowledgment number: 0 # 确认应答号
Acknowledgment number (raw): 0
1010 .... = Header Length: 40 bytes (10)
Flags: 0x002 (SYN)
000. .... .... = Reserved: Not set
...0 .... .... = Nonce: Not set
.... 0... .... = Congestion Window Reduced (CWR): Not set
.... .0.. .... = ECN-Echo: Not set
.... ..0. .... = Urgent: Not set
.... ...0 .... = Acknowledgment: Not set
.... .... 0... = Push: Not set
.... .... .0.. = Reset: Not set
.... .... ..1. = Syn: Set # SYN = 1 即希望建立连接
.... .... ...0 = Fin: Not set
[TCP Flags: ··········S·]
Window size value: 65495
[Calculated window size: 65495]
Checksum: 0xfe30 [unverified]
[Checksum Status: Unverified]
Urgent pointer: 0
Options: (20 bytes), Maximum segment size, SACK permitted, Timestamps, No-Operation (NOP), Window scale
TCP Option - Maximum segment size: 65495 bytes # 默认 MSS大小
TCP Option - SACK permitted
TCP Option - Timestamps: TSval 1728527822, TSecr 0 # 记录 tcp timestamp
Kind: Time Stamp Option (8)
Length: 10
Timestamp value: 1728527822
Timestamp echo reply: 0
TCP Option - No-Operation (NOP)
TCP Option - Window scale: 7 (multiply by 128)
Kind: Window Scale (3)
Length: 3
Shift count: 7
[Multiplier: 128]
[Timestamps]
[Time since first frame in this TCP stream: 0.000000000 seconds]
[Time since previous frame in this TCP stream: 0.000000000 seconds]

服务端向客户端发送SYN+ACK(第二次)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
Transmission Control Protocol, Src Port: 19780, Dst Port: 49630, Seq: 0, Ack: 1, Len: 0
Source Port: 19780 # 源端口 服务端
Destination Port: 49630 # 目标端口 客户端
[Stream index: 4]
[TCP Segment Len: 0]
Sequence number: 0 (relative sequence number) # 序列号
Sequence number (raw): 1952202579
[Next sequence number: 1 (relative sequence number)]
Acknowledgment number: 1 (relative ack number) # 确认应答号,0 + 1
Acknowledgment number (raw): 3180787439
1010 .... = Header Length: 40 bytes (10)
Flags: 0x012 (SYN, ACK)
000. .... .... = Reserved: Not set
...0 .... .... = Nonce: Not set
.... 0... .... = Congestion Window Reduced (CWR): Not set
.... .0.. .... = ECN-Echo: Not set
.... ..0. .... = Urgent: Not set
.... ...1 .... = Acknowledgment: Set # ack = 1
.... .... 0... = Push: Not set
.... .... .0.. = Reset: Not set
.... .... ..1. = Syn: Set # syn = 1
.... .... ...0 = Fin: Not set
[TCP Flags: ·······A··S·]
Window size value: 65483
[Calculated window size: 65483]
Checksum: 0xfe30 [unverified]
[Checksum Status: Unverified]
Urgent pointer: 0
Options: (20 bytes), Maximum segment size, SACK permitted, Timestamps, No-Operation (NOP), Window scale
TCP Option - Maximum segment size: 65495 bytes
TCP Option - SACK permitted
# tcp timestamp
TCP Option - Timestamps: TSval 1728527822, TSecr 1728527822
Kind: Time Stamp Option (8)
Length: 10
Timestamp value: 1728527822
Timestamp echo reply: 1728527822
TCP Option - No-Operation (NOP)
TCP Option - Window scale: 7 (multiply by 128)
Kind: Window Scale (3)
Length: 3
Shift count: 7
[Multiplier: 128]
[SEQ/ACK analysis]
[This is an ACK to the segment in frame: 9]
[The RTT to ACK the segment was: 0.000037392 seconds]
[iRTT: 0.000068221 seconds]
[Timestamps]
[Time since first frame in this TCP stream: 0.000037392 seconds]
[Time since previous frame in this TCP stream: 0.000037392 seconds]

客户端响应服务端(第三次)

第三次握手(客户端的ACK)可以携带数据。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
Transmission Control Protocol, Src Port: 49630, Dst Port: 19780, Seq: 1, Ack: 1, Len: 0
Source Port: 49630 # 源端口,客户端
Destination Port: 19780 # 目标端口, 服务端
[Stream index: 4]
[TCP Segment Len: 0]
Sequence number: 1 (relative sequence number) # 序列号
Sequence number (raw): 3180787439
[Next sequence number: 1 (relative sequence number)]
Acknowledgment number: 1 (relative ack number) # 确认应答号
Acknowledgment number (raw): 1952202580
1000 .... = Header Length: 32 bytes (8)
Flags: 0x010 (ACK)
000. .... .... = Reserved: Not set
...0 .... .... = Nonce: Not set
.... 0... .... = Congestion Window Reduced (CWR): Not set
.... .0.. .... = ECN-Echo: Not set
.... ..0. .... = Urgent: Not set
.... ...1 .... = Acknowledgment: Set # ack = 1
.... .... 0... = Push: Not set
.... .... .0.. = Reset: Not set
.... .... ..0. = Syn: Not set
.... .... ...0 = Fin: Not set
[TCP Flags: ·······A····]
Window size value: 512
[Calculated window size: 65536]
[Window size scaling factor: 128]
Checksum: 0xfe28 [unverified]
[Checksum Status: Unverified]
Urgent pointer: 0
Options: (12 bytes), No-Operation (NOP), No-Operation (NOP), Timestamps
TCP Option - No-Operation (NOP)
TCP Option - No-Operation (NOP)
TCP Option - Timestamps: TSval 1728527822, TSecr 1728527822
Kind: Time Stamp Option (8)
Length: 10
Timestamp value: 1728527822
Timestamp echo reply: 1728527822
[SEQ/ACK analysis]
[This is an ACK to the segment in frame: 10]
[The RTT to ACK the segment was: 0.000030829 seconds]
[iRTT: 0.000068221 seconds]
[Timestamps]
[Time since first frame in this TCP stream: 0.000068221 seconds]
[Time since previous frame in this TCP stream: 0.000030829 seconds]

3. 参考文档

  1. Linux的TCP实现之:三次握手

Linux内核TCP三次握手过程及在wireshark中的观测
http://ziyangfu.github.io/2023/04/19/从wireshark中看TCP三次握手/
作者
FZY
发布于
2023年4月19日
许可协议