nat主要在PRE_ROUTING、LOCAL_OUT、LOCAL_IN、POST_ROUTING四个链上注册了hook函数:在PRE_ROUTING、LOCAL_OUT这两个链上做DNAT,在LOCAL_IN和POST_ROUTING这两个链上做SNAT。nat表没有LOCAL_IN链,但在LOCAL_IN上注册了钩子函数nf_nat_fn,主要作用是修改数据包的源端口。
/*
 * IPv4 NAT hook registrations, one entry per netfilter hook point.
 * The two entries with priority NF_IP_PRI_NAT_DST rewrite the destination
 * (PRE_ROUTING, LOCAL_OUT); the two with priority NF_IP_PRI_NAT_SRC rewrite
 * the source (POST_ROUTING, LOCAL_IN).
 */
static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
	/* PRE_ROUTING: before packet filtering, change destination (DNAT). */
	{
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_NAT_DST,
		.hook		= nf_nat_in,
		.owner		= THIS_MODULE,
	},
	/* POST_ROUTING: after packet filtering, change source (SNAT). */
	{
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_NAT_SRC,
		.hook		= nf_nat_out,
		.owner		= THIS_MODULE,
	},
	/* LOCAL_OUT: before packet filtering, change destination (DNAT). */
	{
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_NAT_DST,
		.hook		= nf_nat_local_fn,
		.owner		= THIS_MODULE,
	},
	/*
	 * LOCAL_IN: after packet filtering, change source.  This entry calls
	 * nf_nat_fn directly, without a per-hook wrapper.
	 */
	{
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_NAT_SRC,
		.hook		= nf_nat_fn,
		.owner		= THIS_MODULE,
	},
};
1、nf_nat_in
nf_nat_in钩子函数注册在PRE_ROUTING链上,最终做DNAT的处理函数是nf_nat_fn,这个函数后面再讲。如果做了DNAT后目的地址发生改变,而且数据包没有被丢弃(NF_DROP)或被接管(NF_STOLEN),就调用skb_dst_drop;该函数最终调用dst_release,将skb->dst设置为NULL,并将skb的dst_entry的引用计数减1。
/*
 * PRE_ROUTING hook: run the common NAT code, then drop the skb's dst if
 * the packet's destination address was rewritten.
 *
 * Returns the netfilter verdict produced by nf_nat_fn().
 */
static unsigned int
nf_nat_in(unsigned int hooknum,
	  struct sk_buff *skb,
	  const struct net_device *in,
	  const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	/* Snapshot the destination before NAT gets a chance to rewrite it. */
	__be32 orig_daddr = ip_hdr(skb)->daddr;
	unsigned int verdict = nf_nat_fn(hooknum, skb, in, out, okfn);

	/* Dropped or stolen: leave the skb alone. */
	if (verdict == NF_DROP || verdict == NF_STOLEN)
		return verdict;

	/* The destination changed, so the dst attached to the skb is dropped. */
	if (orig_daddr != ip_hdr(skb)->daddr)
		skb_dst_drop(skb);

	return verdict;
}
2、nf_nat_out
nf_nat_out注册在POST_ROUTING链上,实现的功能是做SNAT,最终处理的函数也是nf_nat_fn。
/*
 * POST_ROUTING hook: run the common NAT code for source NAT.
 *
 * With CONFIG_XFRM, if the source address or the source protocol part of
 * this direction differs from the destination of the opposite direction,
 * ip_xfrm_me_harder() is called and the packet is dropped when it fails.
 *
 * Returns a netfilter verdict.
 */
static unsigned int
nf_nat_out(unsigned int hooknum,
	   struct sk_buff *skb,
	   const struct net_device *in,
	   const struct net_device *out,
	   int (*okfn)(struct sk_buff *))
{
#ifdef CONFIG_XFRM
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
#endif
	unsigned int verdict;

	/* root is playing with raw sockets. */
	if (skb->len < sizeof(struct iphdr))
		return NF_ACCEPT;
	if (ip_hdrlen(skb) < sizeof(struct iphdr))
		return NF_ACCEPT;

	verdict = nf_nat_fn(hooknum, skb, in, out, okfn);

#ifdef CONFIG_XFRM
	if (verdict == NF_DROP || verdict == NF_STOLEN)
		return verdict;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return verdict;

	dir = CTINFO2DIR(ctinfo);
	/* Source address or source proto part was changed by NAT. */
	if (ct->tuplehash[dir].tuple.src.u3.ip !=
	    ct->tuplehash[!dir].tuple.dst.u3.ip ||
	    ct->tuplehash[dir].tuple.src.u.all !=
	    ct->tuplehash[!dir].tuple.dst.u.all)
		return ip_xfrm_me_harder(skb) == 0 ? verdict : NF_DROP;
#endif
	return verdict;
}
3、nf_nat_local_fn
nf_nat_local_fn注册在LOCAL_OUT链上,最终也是调用nf_nat_fn做DNAT。在LOCAL_OUT链之前数据包已经做了路由选择,因为做DNAT后目的地址改变,所以要调用ip_route_me_harder重新选择路由。
/*
 * LOCAL_OUT hook: run the common NAT code for destination NAT on locally
 * generated packets.
 *
 * If the destination address of this direction no longer matches the
 * source address of the opposite direction, the packet is re-routed with
 * ip_route_me_harder().  Otherwise, with CONFIG_XFRM, a difference in the
 * protocol part alone triggers ip_xfrm_me_harder().  Either helper
 * failing turns the verdict into NF_DROP.
 *
 * Returns a netfilter verdict.
 */
static unsigned int
nf_nat_local_fn(unsigned int hooknum,
		struct sk_buff *skb,
		const struct net_device *in,
		const struct net_device *out,
		int (*okfn)(struct sk_buff *))
{
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned int verdict;

	/* root is playing with raw sockets. */
	if (skb->len < sizeof(struct iphdr))
		return NF_ACCEPT;
	if (ip_hdrlen(skb) < sizeof(struct iphdr))
		return NF_ACCEPT;

	verdict = nf_nat_fn(hooknum, skb, in, out, okfn);
	if (verdict == NF_DROP || verdict == NF_STOLEN)
		return verdict;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return verdict;

	dir = CTINFO2DIR(ctinfo);
	if (ct->tuplehash[dir].tuple.dst.u3.ip !=
	    ct->tuplehash[!dir].tuple.src.u3.ip) {
		/* Destination address changed: route the packet again. */
		if (ip_route_me_harder(skb, RTN_UNSPEC))
			verdict = NF_DROP;
	}
#ifdef CONFIG_XFRM
	else if (ct->tuplehash[dir].tuple.dst.u.all !=
		 ct->tuplehash[!dir].tuple.src.u.all) {
		/* Only the protocol part of the destination changed. */
		if (ip_xfrm_me_harder(skb))
			verdict = NF_DROP;
	}
#endif
	return verdict;
}
4、nf_nat_fn
nf_nat_fn对数据包的连接跟踪项的NAT只做一次,后续的数据包根据链接跟踪做NAT。nf_nat_fn主要做以下几件事
(1)判断数据包的链接跟踪是否建立,如果没有建立(或者是不做链接跟踪的数据包)直接返回;如果链接跟踪没有关联nf_conn_nat,且链接跟踪已经确认则返回,否则为其添加NAT扩展(添加失败也返回)
(2)如果数据包状态是一个期望链接或者有reply方向,而且协议是icmp就调用nf_nat_icmp_reply_translation对icmp做nat
(3)如果数据包的状态是IP_CT_NEW,就调用nf_nat_initialized判断该数据包的链接跟踪是否已经做NAT,如果还没有做NAT而且是LOCAL_IN链上的钩子函数,就调用alloc_null_binding修改链接跟踪reply方向
(4)如果还没有做NAT而且不是LOCAL_IN链上的钩子函数,就调用函数nf_nat_rule_find查找nat表;最后由nf_nat_packet根据链接跟踪做nat
/*
 * Common NAT hook body, called from every NAT hook entry point.
 *
 * Steps:
 *  1. Accept packets with no conntrack entry or with the "untracked" one.
 *  2. Ensure the conntrack has a NAT extension; accept if it is missing
 *     and the conntrack is already confirmed, or if adding it fails.
 *  3. RELATED ICMP packets are handled by nf_nat_icmp_reply_translation().
 *  4. For NEW (and non-ICMP RELATED) packets whose conntrack has no NAT
 *     set up yet for this manip type, set it up: a null binding on
 *     LOCAL_IN, a NAT table lookup on every other hook.
 *  5. Rewrite the packet with nf_nat_packet().
 *
 * Returns a netfilter verdict.
 */
static unsigned int
nf_nat_fn(unsigned int hooknum,
	  struct sk_buff *skb,
	  const struct net_device *in,
	  const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
	unsigned int verdict;

	/* We never see fragments: conntrack defrags on pre-routing
	   and local-out, and nf_nat_out protects post-routing. */
	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));

	/* Fetch the conntrack entry and the packet's state relative to it. */
	ct = nf_ct_get(skb, &ctinfo);

	/*
	 * Can't track?  It's not due to stress, or conntrack would have
	 * dropped it.  Hence it's the user's responsibility to packet
	 * filter it out, or implement conntrack/NAT for that protocol.
	 * Packets bound to the untracked conntrack are not NATed either.
	 */
	if (ct == NULL || ct == &nf_conntrack_untracked)
		return NF_ACCEPT;

	nat = nfct_nat(ct);
	if (nat == NULL) {
		/* NAT module was loaded late: a confirmed conntrack is
		   accepted as is, without adding the extension. */
		if (nf_ct_is_confirmed(ct))
			return NF_ACCEPT;

		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
		if (!nat) {
			pr_debug("failed to add NAT extension\n");
			return NF_ACCEPT;
		}
	}

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED+IP_CT_IS_REPLY:
		/* RELATED ICMP gets its own translation routine. */
		if (ip_hdr(skb)->protocol == IPPROTO_ICMP)
			return nf_nat_icmp_reply_translation(ct, ctinfo,
							     hooknum, skb)
				? NF_ACCEPT : NF_DROP;
		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
	case IP_CT_NEW:
		/* Seen it before?  This can happen for loopback, retrans,
		   or local packets.. */
		if (nf_nat_initialized(ct, maniptype)) {
			pr_debug("Already setup manip %s for ct %p\n",
				 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct);
			break;
		}

		if (hooknum == NF_INET_LOCAL_IN)
			/* LOCAL_IN hook doesn't have a chain! */
			verdict = alloc_null_binding(ct, hooknum);
		else
			/* Look the packet up in the NAT table. */
			verdict = nf_nat_rule_find(skb, hooknum, in, out, ct);

		if (verdict != NF_ACCEPT)
			return verdict;
		break;
	default:
		/* ESTABLISHED */
		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
			     ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
	}

	/* Rewrite the packet according to the conntrack tuples. */
	return nf_nat_packet(ct, ctinfo, hooknum, skb);
}
4.1 nf_nat_initialized
nf_nat_initialized判断链接跟踪项是否已经做了NAT:如果做了NAT,那么ct->status就会设置IPS_SRC_NAT_DONE_BIT(SNAT)或IPS_DST_NAT_DONE_BIT(DNAT)。
/*
 * Report whether NAT setup has already been done on @ct for the given
 * manipulation type.
 *
 * Tests IPS_SRC_NAT_DONE_BIT in ct->status for IP_NAT_MANIP_SRC, and
 * IPS_DST_NAT_DONE_BIT for any other value of @manip.
 *
 * Returns the result of test_bit(): non-zero if the bit is set.
 */
static inline int nf_nat_initialized(struct nf_conn *ct,
				     enum nf_nat_manip_type manip)
{
	int done_bit = IPS_DST_NAT_DONE_BIT;

	if (manip == IP_NAT_MANIP_SRC)
		done_bit = IPS_SRC_NAT_DONE_BIT;

	return test_bit(done_bit, &ct->status);
}
4.2 alloc_null_binding
当数据包的状态是IP_CT_NEW并且是在LOCAL_IN链上时,就调用alloc_null_binding对链接跟踪做NAT、修改reply方向。因为LOCAL_IN是netfilter框架的一个出口,如果这时链接跟踪还没做NAT,那么数据包出去就会有问题。
unsigned int
alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
/* Force range to this IP; let proto decide mapping for
per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
Use reply in case it's already been mangled (eg local packet).
*/
__be32 ip
= (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
: ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
struct nf_nat_range range
= { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
/*链接跟踪做nat,修改tuple的reply方向*/
return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}
4.3 nf_nat_rule_find
链接状态是IP_CT_NEW、IP_CT_RELATED、IP_CT_RELATED+IP_CT_IS_REPLY,而且不是在LOCAL_IN上,就调用nf_nat_rule_find查找NAT表匹配规则,找到就调用相应的target函数(ipt_snat_target或者ipt_dnat_target)实现连接跟踪项的转换。如果没有找到,就调用alloc_null_binding做链接跟踪的NAT。alloc_null_binding实际调用的是nf_nat_setup_info,这个函数下一节再分析。
/*
 * Run the packet through the IPv4 NAT table of the conntrack's network
 * namespace.
 *
 * If the table accepts the packet but NAT was not set up on @ct for this
 * hook's manip type, a null binding is allocated instead.
 *
 * Returns the table's verdict, or the verdict of alloc_null_binding()
 * when the null binding path is taken.
 */
int nf_nat_rule_find(struct sk_buff *skb,
		     unsigned int hooknum,
		     const struct net_device *in,
		     const struct net_device *out,
		     struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	int verdict = ipt_do_table(skb, hooknum, in, out,
				   net->ipv4.nat_table);

	/* Any verdict other than ACCEPT is passed back unchanged. */
	if (verdict != NF_ACCEPT)
		return verdict;

	/* NAT was already set up for this manip type: nothing more to do. */
	if (nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
		return verdict;

	/* Accepted but no NAT set up yet: fall back to a NUL mapping. */
	return alloc_null_binding(ct, hooknum);
}
5、nf_nat_packet
当数据包的链接跟踪已经做了NAT,就调用nf_nat_packet根据链接跟踪修改数据包的ip、端口做NAT。这个函数很巧妙,此时链接跟踪已经做了NAT,就取dir的反方向的tuple,然后再对tuple中的源ip、目的ip、源端口、目的端口颠倒,得到target,最后调用manip_pkt做NAT修改数据包的ip地址和端口。
这个地方有点绕,举一个例子:比如一个网关112.112.112.112,它下面的局域网有一个A设备ip是192.168.0.100,这个A设备要访问一个外网服务器,地址是113.113.113.113,这样必须由网关做SNAT。首先链接跟踪做了SNAT后tuple如下(上面一组是orig方向,下面一组是reply方向)
src | dst |
192.168.0.100 | 113.113.113.113 |
src | dst |
113.113.113.113 | 112.112.112.112 |
设备A的数据包访问服务器是orig方向:192.168.0.100 -> 113.113.113.113,调用nf_nat_packet取反也就是reply:113.113.113.113 -> 112.112.112.112,再颠倒过来得到target:112.112.112.112 -> 113.113.113.113,然后按target:112.112.112.112 -> 113.113.113.113修改数据包的源ip、目的ip完成SNAT转换。
当外部服务器有数据包reply:113.113.113.113 -> 112.112.112.112,调用nf_nat_packet取反方向也就是orig:192.168.0.100 -> 113.113.113.113,再颠倒过来得到target:113.113.113.113 -> 192.168.0.100,然后按target:113.113.113.113 -> 192.168.0.100修改数据包的源ip、目的ip。
再比如110.110.110.110的网关地址要做DNAT到内部一个地址192.168.0.200,一个外网地址111.111.111.111访问网关110.110.110.110就会做DNAT到192.168.0.200,链接跟踪做DNAT后tuple如下(上面一组是orig方向,下面一组是reply方向)
src | dst |
111.111.111.111 | 110.110.110.110 |
src | dst |
192.168.0.200 | 111.111.111.111 |
当外网地址访问网关也就是orig方向:111.111.111.111->110.110.110.110,调用nf_nat_packet会取相反方向的tuple也就是reply:192.168.0.200->111.111.111.111,然后颠倒得到target:111.111.111.111->192.168.0.200,然后修改源ip、目的ip完成DNAT转换。
192.168.0.200有回复包也就是reply方向:192.168.0.200->111.111.111.111,调用nf_nat_packet会取相反方向的tuple也就是orig:111.111.111.111->110.110.110.110,然后颠倒得到target:110.110.110.110->111.111.111.111,修改数据包的源地址、目的地地址完成reply。
所以说NAT其实就是基于链接跟踪实现的。
/*
 * Do packet manipulations according to nf_nat_setup_info.
 *
 * Picks the status bit that matches this hook's manip type (IPS_SRC_NAT
 * or IPS_DST_NAT), swapped for reply-direction packets.  If that bit is
 * set in ct->status, the packet is rewritten to look like the inverse of
 * the opposite direction's tuple.
 *
 * Returns NF_DROP if the rewrite fails, NF_ACCEPT otherwise.
 */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
	unsigned long statusbit = (mtype == IP_NAT_MANIP_SRC)
					? IPS_SRC_NAT : IPS_DST_NAT;
	struct nf_conntrack_tuple target;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (!(ct->status & statusbit))
		return NF_ACCEPT;

	/*
	 * We are aiming to look like inverse of other direction: take the
	 * tuple of the opposite direction and invert it to get the target.
	 */
	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

	/* Rewrite the packet to match the target tuple. */
	if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
		return NF_DROP;

	return NF_ACCEPT;
}
6、manip_pkt
manip_pkt主要根据传进来的target和maniptype完成三层、四层的NAT转换。先获取四层的struct nf_nat_protocol结构体实例,然后调用四层协议的manip_pkt完成四层端口的NAT,最后修改IP头中的源地址或目的地址并更新IP头校验和。
/*
 * Rewrite one packet so that it matches @target for the given manip type.
 *
 * The protocol part is rewritten first, through the manip_pkt callback of
 * the nf_nat_protocol looked up for @proto.  Then the IP header's source
 * address (IP_NAT_MANIP_SRC) or destination address (otherwise) is
 * replaced and the IP header checksum is adjusted.
 *
 * @iphdroff is the offset of the IP header from skb->data.
 *
 * Returns false if the skb cannot be made writable or the protocol
 * callback fails, true otherwise.
 */
static bool
manip_pkt(u_int16_t proto,
	  struct sk_buff *skb,
	  unsigned int iphdroff,
	  const struct nf_conntrack_tuple *target,
	  enum nf_nat_manip_type maniptype)
{
	const struct nf_nat_protocol *l4proto;
	struct iphdr *iph;
	__be32 *addr;
	__be32 new_addr;

	if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
		return false;

	/* Manipulate protocol part. */

	/* rcu_read_lock()ed by nf_hook_slow */
	l4proto = __nf_nat_proto_find(proto);
	if (!l4proto->manip_pkt(skb, iphdroff, target, maniptype))
		return false;

	/*
	 * Take the header pointer only after the protocol callback has run.
	 * NOTE(review): presumably the callback may move skb->data - confirm.
	 */
	iph = (void *)skb->data + iphdroff;

	if (maniptype == IP_NAT_MANIP_SRC) {
		/* SNAT: replace the source address. */
		addr = &iph->saddr;
		new_addr = target->src.u3.ip;
	} else {
		/* DNAT: replace the destination address. */
		addr = &iph->daddr;
		new_addr = target->dst.u3.ip;
	}

	/* Fix up the header checksum before storing the new address. */
	csum_replace4(&iph->check, *addr, new_addr);
	*addr = new_addr;

	return true;
}