TCP状态
enum {
TCP_ESTABLISHED = 1,
TCP_SYN_SENT,
TCP_SYN_RECV,
TCP_FIN_WAIT1,
TCP_FIN_WAIT2,
TCP_TIME_WAIT,
TCP_CLOSE,
TCP_CLOSE_WAIT,
TCP_LAST_ACK,
TCP_LISTEN,
TCP_CLOSING, /* Now a valid state */
TCP_MAX_STATES /* Leave at the end! */
};
inet_hashinfo
struct inet_hashinfo tcp_hashinfo;
struct inet_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
*
* TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
*
* TIME_WAIT sockets use a separate chain (twchain).
*/
// TCP_ESTABLISHED、TCP_SYN_SENT、TCP_SYN_RECV、TCP_FIN_WAIT1、TCP_FIN_WAIT2、TCP_TIME_WAIT
struct inet_ehash_bucket *ehash;
spinlock_t *ehash_locks;
unsigned int ehash_size;
unsigned int ehash_locks_mask;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
*/
struct inet_bind_hashbucket *bhash;
unsigned int bhash_size;
/* 4 bytes hole on 64 bit */
struct kmem_cache *bind_bucket_cachep;
/* All the above members are written once at bootup and
* never written again _or_ are predominantly read-access.
*
* Now align to a new cache line as all the following members
* might be often dirty.
*/
/* All sockets in TCP_LISTEN state will be in here. This is the only
* table where wildcard'd TCP sockets can exist. Hash function here
* is just local port number.
*/
struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE]
____cacheline_aligned_in_smp;
atomic_t bsockets;
};
tcp_v4_rcv()
tcp_v4_rcv()是ip层到tcp层的入口
int tcp_v4_rcv(struct sk_buff *skb)
{
const struct iphdr *iph;
struct tcphdr *th;
struct sock *sk;
int ret;
struct net *net = dev_net(skb->dev);
// 若目的mac地址不是本机mac地址,丢包
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
// 检查data到tail的长度是否 >= sizeof(struct tcphdr)
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
th = tcp_hdr(skb); // tcp_hdr() -> skb_transport_header() -> skb->transport_header
if (th->doff < sizeof(struct tcphdr) / 4)
goto bad_packet;
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
goto bad_packet;
th = tcp_hdr(skb);
iph = ip_hdr(skb); // ip_hdr() -> skb_network_header() -> skb->network_header
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
TCP_SKB_CB(skb)->when = 0;
TCP_SKB_CB(skb)->flags = iph->tos;
TCP_SKB_CB(skb)->sacked = 0;
// 查找传输控制块,先查找ehash,后查找listening_hash
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
if (!sk)
goto no_tcp_socket;
process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
nf_reset(skb);
if (sk_filter(sk, skb))
goto discard_and_relse;
skb->dev = NULL;
bh_lock_sock_nested(sk); // 加锁
ret = 0;
if (!sock_owned_by_user(sk)) { // sk->sk_lock.owned,若没有用户进程访问传输控制块
#ifdef CONFIG_NET_DMA
struct tcp_sock *tp = tcp_sk(sk);
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
if (tp->ucopy.dma_chan)
ret = tcp_v4_do_rcv(sk, skb);
else
#endif
{
// 若未启用tcp_low_latency,且用户进程正在访问传输控制块,将skb加入prequeue队列(return 1)
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb); // 处理有效TCP段
}
} else
sk_add_backlog(sk, skb);
bh_unlock_sock(sk); // 解锁
sock_put(sk);
return ret;
no_tcp_socket: // 若没找到传输控制块
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
} else {
tcp_v4_send_reset(NULL, skb); // 发送rst
}
discard_it:
/* Discard frame. */
kfree_skb(skb);
return 0;
discard_and_relse:
sock_put(sk);
goto discard_it;
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo,
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
inet_twsk_put(inet_twsk(sk));
sk = sk2;
goto process;
}
/* Fall through to ACK */
}
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
goto no_tcp_socket;
case TCP_TW_SUCCESS:;
}
goto discard_it;
}
__inet_lookup_skb()
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
struct sk_buff *skb,
const __be16 sport,
const __be16 dport)
{
struct sock *sk;
const struct iphdr *iph = ip_hdr(skb);
if (unlikely(sk = skb_steal_sock(skb)))
return sk;
else
return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
iph->saddr, sport,
iph->daddr, dport, inet_iif(skb));
}
static inline struct sock *__inet_lookup(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
const int dif)
{
u16 hnum = ntohs(dport); // 主机字节序dport
struct sock *sk = __inet_lookup_established(net, hashinfo,
saddr, sport, daddr, hnum, dif); // 先查找ehash
return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif); // 后查找listening_hash
}
struct sock * __inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
const int dif)
{
// acookie = daddr << 32 | saddr
INET_ADDR_COOKIE(acookie, saddr, daddr)
// ports = hnum << 16 | sport
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
struct sock *sk;
const struct hlist_nulls_node *node;
/* Optimize here for direct hit, only listening connections can
* have wildcards anyways.
*/
unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
unsigned int slot = hash & (hashinfo->ehash_size - 1);
struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
rcu_read_lock();
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (INET_MATCH(sk, net, hash, acookie,
saddr, daddr, ports, dif)) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
goto begintw;
if (unlikely(!INET_MATCH(sk, net, hash, acookie,
saddr, daddr, ports, dif))) {
sock_put(sk);
goto begin;
}
goto out;
}
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != slot)
goto begin;
begintw:
/* Must check for a TIME_WAIT'er before going to listener hash. */
sk_nulls_for_each_rcu(sk, node, &head->twchain) {
if (INET_TW_MATCH(sk, net, hash, acookie,
saddr, daddr, ports, dif)) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
sk = NULL;
goto out;
}
if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
saddr, daddr, ports, dif))) {
sock_put(sk);
goto begintw;
}
goto out;
}
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != slot)
goto begintw;
sk = NULL;
out:
rcu_read_unlock();
return sk;
}
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) const __addrpair __name = (__force __addrpair) ( \
(((__force __u64)(__be32)(__daddr)) << 32 ) | ((__force __u64)(__be32)(__saddr)) \
);
#define INET_COMBINED_PORTS(__sport, __dport) ( \
(__force __portpair) ( \
((__u32)(__dport) << 16 ) | (__force __u32)(__be16)(__sport)
)
)
#define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif) ( \
((__sk)->sk_hash == (__hash)) && \
net_eq(sock_net(__sk), (__net)) && \
((*((__addrpair *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ // 在sock中,rcv_saddr紧跟daddr
((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \ // 在sock中,num紧跟dport
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))) \
)
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore;
rcu_read_lock();
begin:
result = NULL;
hiscore = -1;
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) { // hiscore为score的最大值
result = sk;
hiscore = score;
}
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
goto begin;
if (result) {
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, daddr,
dif) < hiscore)) {
sock_put(result);
goto begin;
}
}
rcu_read_unlock();
return result;
}
/* net不同 or dport不同 or sk不支持ipv4:return -1
net相同 and dport相同 and sk支持ipv4:
sk不是ipv4:
dip为空、出设备为空:return 0
dip为空、出设备不为空:
出设备不同:return -1
出设备相同:return 2
dip不为空、出设备为空:
dip不同:return -1
dip相同:return 2
dip不为空、出设备不为空:
dip相同、出设备相同:return 4
dip相同、出设备不同:return -1
dip不同:return -1
sk是ipv4:
dip为空、出设备为空(最常见的情形):return 1
dip为空、出设备不为空:
出设备不同:return -1
出设备相同:return 3
dip不为空、出设备为空:
dip不同:return -1
dip相同:return 3
dip不为空、出设备不为空:
dip相同、出设备相同:return 5
dip相同、出设备不同:return -1
dip不同:return -1 */
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
const int dif)
{
int score = -1;
struct inet_sock *inet = inet_sk(sk);
if (net_eq(sock_net(sk), net) && inet->num == hnum &&
!ipv6_only_sock(sk)) {
__be32 rcv_saddr = inet->rcv_saddr;
score = sk->sk_family == PF_INET ? 1 : 0;
if (rcv_saddr) {
if (rcv_saddr != daddr)
return -1;
score += 2;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
score += 2;
}
}
return score;
}
tcp_v4_do_rcv()
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
* if:
* o We're expecting an MD5'd packet and this is no MD5 tcp option
* o There is an MD5 option and we're not expecting one
*/
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard;
#endif
// 若是已连接套接字
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
TCP_CHECK_TIMER(sk);
return 0;
}
if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
goto csum_err;
// 若是监听套接字
if (sk->sk_state == TCP_LISTEN) {
// 先查找监听套接字的syn_table(半连接队列),后查找ehash
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;
// 若找到tcp_request_sock或tcp_sock
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
// 若都没找到(nsk == sk),继续往下走
}
TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
TCP_CHECK_TIMER(sk);
return 0;
reset:
tcp_v4_send_reset(rsk, skb);
discard:
kfree_skb(skb);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0;
csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}