禁用于任何商业用途。
msn: [email protected]
来源:http://yfydz.cublog.cn
3. 内核空间 内核版本2.6.17.11。 内核空间的代码程序包括net/netfilter/nfnetlink_queue.c和xt_NFQUEUE.c,前者是具体实现,后者是iptables的一个目标,用来指定数据属于哪个队列。 3.1 数据结构 /* include/linux/netfilter/nfnetlink_queue.h */ // nfqueue netlink消息类型 enum nfqnl_msg_types { NFQNL_MSG_PACKET, /* packet from kernel to userspace */ NFQNL_MSG_VERDICT, /* verdict from userspace to kernel */ NFQNL_MSG_CONFIG, /* connect to a particular queue */ NFQNL_MSG_MAX }; // nfqueue netlink消息数据包头 struct nfqnl_msg_packet_hdr { u_int32_t packet_id; /* unique ID of packet in queue */ u_int16_t hw_protocol; /* hw protocol (network order) */ u_int8_t hook; /* netfilter hook */ } __attribute__ ((packed)); // nfqueue netlink消息数据包头硬件部分,MAC地址 struct nfqnl_msg_packet_hw { u_int16_t hw_addrlen; u_int16_t _pad; u_int8_t hw_addr[8]; } __attribute__ ((packed)); // nfqueue netlink消息数据包64位时间戳 struct nfqnl_msg_packet_timestamp { aligned_u64 sec; aligned_u64 usec; } __attribute__ ((packed)); // nfqueue netlink属性类型 enum nfqnl_attr_type { NFQA_UNSPEC, NFQA_PACKET_HDR, NFQA_VERDICT_HDR, /* nfqnl_msg_verdict_hrd */ NFQA_MARK, /* u_int32_t nfmark */ NFQA_TIMESTAMP, /* nfqnl_msg_packet_timestamp */ NFQA_IFINDEX_INDEV, /* u_int32_t ifindex */ NFQA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ NFQA_IFINDEX_PHYSINDEV, /* u_int32_t ifindex */ NFQA_IFINDEX_PHYSOUTDEV, /* u_int32_t ifindex */ NFQA_HWADDR, /* nfqnl_msg_packet_hw */ NFQA_PAYLOAD, /* opaque data payload */ __NFQA_MAX }; #define NFQA_MAX (__NFQA_MAX - 1) // nfqueue netlink消息数据判定头 struct nfqnl_msg_verdict_hdr { u_int32_t verdict; u_int32_t id; } __attribute__ ((packed)); // nfqueue netlink消息配置命令类型 enum nfqnl_msg_config_cmds { NFQNL_CFG_CMD_NONE, NFQNL_CFG_CMD_BIND, NFQNL_CFG_CMD_UNBIND, NFQNL_CFG_CMD_PF_BIND, NFQNL_CFG_CMD_PF_UNBIND, }; // nfqueue netlink消息配置命令结构 struct nfqnl_msg_config_cmd { u_int8_t command; /* nfqnl_msg_config_cmds */ u_int8_t _pad; u_int16_t pf; /* AF_xxx for PF_[UN]BIND */ } __attribute__ ((packed)); // nfqueue netlink消息配置模式 enum nfqnl_config_mode { NFQNL_COPY_NONE, // 不拷贝 
NFQNL_COPY_META, // 只拷贝基本信息 NFQNL_COPY_PACKET, // 拷贝整个数据包 }; // nfqueue netlink消息配置参数结构 struct nfqnl_msg_config_params { u_int32_t copy_range; u_int8_t copy_mode; /* enum nfqnl_config_mode */ } __attribute__ ((packed)); // nfqueue netlink消息配置属性类型 enum nfqnl_attr_config { NFQA_CFG_UNSPEC, NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */ NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */ __NFQA_CFG_MAX }; #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1) /* include/linux/netfilter.h */ struct nf_info { /* The ops struct which sent us to userspace. */ struct nf_hook_ops *elem; /* If we're sent to userspace, this keeps housekeeping info */ int pf; unsigned int hook; struct net_device *indev, *outdev; int (*okfn)(struct sk_buff *); }; /* net/netfilter/nfnetlink_queue.c */ // 队列项结构 struct nfqnl_queue_entry { struct list_head list; struct nf_info *info; struct sk_buff *skb; unsigned int id; }; // 队列实例结构 struct nfqnl_instance { // HASH链表节点 struct hlist_node hlist; /* global list of queues */ atomic_t use; // 应用程序的pid int peer_pid; // 队列最大长度 unsigned int queue_maxlen; // 数据拷贝范围 unsigned int copy_range; // 当前队列元素数 unsigned int queue_total; // 队列丢包数 unsigned int queue_dropped; // 发送到用户空间失败而丢弃的包数 unsigned int queue_user_dropped; // ID序 atomic_t id_sequence; /* 'sequence' of pkt ids */ // 队列号 u_int16_t queue_num; /* number of this queue */ // 拷贝模式 u_int8_t copy_mode; spinlock_t lock; // queue entry队列 struct list_head queue_list; /* packets in queue */ }; 3.2 内核程序流程 3.2.1 系统初始化 /* net/netfilter/nfnetlink_queue.c */ static int __init nfnetlink_queue_init(void) { int i, status = -ENOMEM; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_nfqueue; #endif // 16个HASH链表 for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&instance_table[i]); // 登记netlink通知 netlink_register_notifier(&nfqnl_rtnl_notifier); // 登记nfnetlink子系统 status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; } #ifdef CONFIG_PROC_FS // 
建立/proc/net/netfilter/nfnetlink_queue文件 proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440, proc_net_netfilter); if (!proc_nfqueue) goto cleanup_subsys; proc_nfqueue->proc_fops = &nfqnl_file_ops; #endif // 登记nfqueue netlink设备通知 register_netdevice_notifier(&nfqnl_dev_notifier); return status; #ifdef CONFIG_PROC_FS cleanup_subsys: nfnetlink_subsys_unregister(&nfqnl_subsys); #endif cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); return status; } 3.2.2 // netlink通知,只是定义一个通知回调函数, 在接收到netlink套接字信息时调用 static struct notifier_block nfqnl_rtnl_notifier = { .notifier_call = nfqnl_rcv_nl_event, }; static int nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; // 就只处理释放事件 if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER && n->pid) { int i; /* destroy all instances for this pid */ write_lock_bh(&instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *tmp, *t2; struct nfqnl_instance *inst; struct hlist_head *head = &instance_table[i]; // 释放指定pid的所有子队列信息 hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { if (n->pid == inst->peer_pid) __instance_destroy(inst); } } write_unlock_bh(&instances_lock); } return NOTIFY_DONE; } 以下两个函数实现释放操作,实际是调用同一个函数,一个需要加锁,一个不需要 static inline void instance_destroy(struct nfqnl_instance *inst) { _instance_destroy2(inst, 1); } static inline void __instance_destroy(struct nfqnl_instance *inst) { _instance_destroy2(inst, 0); } static void _instance_destroy2(struct nfqnl_instance *inst, int lock) { /* first pull it out of the global list */ if (lock) write_lock_bh(&instances_lock); QDEBUG("removing instance %p (queuenum=%u) from hash\n", inst, inst->queue_num); // 将队列实例先从链表中移出 hlist_del(&inst->hlist); if (lock) write_unlock_bh(&instances_lock); /* then flush all pending skbs from the queue */ // 将当前队列中所有包的判定都设置DROP nfqnl_flush(inst, NF_DROP); /* and finally put the refcount */ // 释放队列实例本身 instance_put(inst); // 释放模块引用 
module_put(THIS_MODULE); } 3.2.3 子系统 // 子系统定义 static struct nfnetlink_subsystem nfqnl_subsys = { .name = "nf_queue", .subsys_id = NFNL_SUBSYS_QUEUE, // NFQUEUE的ID号为3 .cb_count = NFQNL_MSG_MAX, // 3个控制块 .cb = nfqnl_cb, }; // 子系统回调控制 static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { // 接收数据包,实际没进行定义 [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, .attr_count = NFQA_MAX, }, // 接收判定 [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, .attr_count = NFQA_MAX, }, // 接收配置 [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, .attr_count = NFQA_CFG_MAX, }, }; 3.2.3.1 // 实际没定义 static int nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) { return -ENOTSUPP; } 3.2.3.2 接收判定 该函数接收netlink套接字返回的数据包的判定结果,根据结果对包进行相关处理 static int nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) { struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; unsigned int verdict; struct nfqnl_queue_entry *entry; int err; // 判定数据包大小是否有问题 if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) { QDEBUG("bad attribute size\n"); return -EINVAL; } // 根据队列号找到队列的实例,并增加计数 queue = instance_lookup_get(queue_num); if (!queue) return -ENODEV; // 检查该队列对应的pid是否和netlink数据包中的pid匹配 if (queue->peer_pid != NETLINK_CB(skb).pid) { err = -EPERM; goto err_out_put; } // 检查是否返回了判定结果 if (!nfqa[NFQA_VERDICT_HDR-1]) { err = -EINVAL; goto err_out_put; } // 获取判定结果 vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]); verdict = ntohl(vhdr->verdict); // 低16位为判定结果, 不能超过NF_MAX_VERDICT(5) if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) { err = -EINVAL; goto err_out_put; } // 根据返回包的ID号在队列中找缓存具体的数据包 entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id)); if (entry == NULL) { err = -ENOENT; goto err_out_put; } if (nfqa[NFQA_PAYLOAD-1]) { // 返回了负载内容,说明要进行数据包的修改,如果不修改是不用返回载荷内容的 if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]), 
NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0) // 修改出错,丢弃数据包 verdict = NF_DROP; } // 是否修改数据包的mark值 if (nfqa[NFQA_MARK-1]) entry->skb->nfmark = ntohl(*(u_int32_t *) NFA_DATA(nfqa[NFQA_MARK-1])); // 和ip_queue一样,调用nf_reinject()重新将数据包发回netfilter进行处理 // 然后将该entry的内存释放掉 issue_verdict(entry, verdict); // 减少队列引用计数 instance_put(queue); return 0; err_out_put: instance_put(queue); return err; } 3.2.3.3 接收配置 static int nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) { struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_instance *queue; int ret = 0; QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); // 数据大小检查 if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) { QDEBUG("bad attribute size\n"); return -EINVAL; } // 根据队列号找到队列的实例,并增加计数 queue = instance_lookup_get(queue_num); if (nfqa[NFQA_CFG_CMD-1]) { // 配置命令,由于可能是进行新建queue操作,所以此时的queue值可能为空 // struct nfqnl_msg_config_cmd *cmd; cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]); QDEBUG("found CFG_CMD\n"); switch (cmd->command) { case NFQNL_CFG_CMD_BIND: if (queue) return -EBUSY; // 绑定命令,就是新建一个queue和对应的pid绑定 queue = instance_create(queue_num, NETLINK_CB(skb).pid); if (!queue) return -EINVAL; break; case NFQNL_CFG_CMD_UNBIND: // 取消绑定 if (!queue) return -ENODEV; // 检查pid是否匹配 if (queue->peer_pid != NETLINK_CB(skb).pid) { ret = -EPERM; goto out_put; } // 释放队列实例 instance_destroy(queue); break; case NFQNL_CFG_CMD_PF_BIND: // 绑定协议族, 将nfqueue handler绑定到指定的协议 QDEBUG("registering queue handler for pf=%u\n", ntohs(cmd->pf)); ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh); break; case NFQNL_CFG_CMD_PF_UNBIND: // 取消协议族的绑定 QDEBUG("unregistering queue handler for pf=%u\n", ntohs(cmd->pf)); /* This is a bug and a feature. We can unregister * other handlers(!) 
*/ ret = nf_unregister_queue_handler(ntohs(cmd->pf)); break; default: ret = -EINVAL; break; } } else { // 如果不是配置命令,检查queue是否存在,pid是否匹配 if (!queue) { QDEBUG("no config command, and no instance ENOENT\n"); ret = -ENOENT; goto out_put; } if (queue->peer_pid != NETLINK_CB(skb).pid) { QDEBUG("no config command, and wrong pid\n"); ret = -EPERM; goto out_put; } } if (nfqa[NFQA_CFG_PARAMS-1]) { // 配置参数 struct nfqnl_msg_config_params *params; if (!queue) { ret = -ENOENT; goto out_put; } params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]); // 设置数据拷贝模式 nfqnl_set_mode(queue, params->copy_mode, ntohl(params->copy_range)); } out_put: // 减少引用计数 // 除了初始化函数和释放函数外,所有其他处理函数的计数增加和减少操作都是成对出现的 instance_put(queue); return ret; } 其中队列实例建立函数如下: static struct nfqnl_instance * instance_create(u_int16_t queue_num, int pid) { struct nfqnl_instance *inst; QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid); write_lock_bh(&instances_lock); // // 根据队列号找到队列的实例,这里是不增加计数的 if (__instance_lookup(queue_num)) { // 理论上是不可能进入这里的 inst = NULL; QDEBUG("aborting, instance already exists\n"); goto out_unlock; } // 分配queue实例空间, 初始化参数 inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) goto out_unlock; inst->queue_num = queue_num; inst->peer_pid = pid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; inst->copy_range = 0xfffff; inst->copy_mode = NFQNL_COPY_NONE; atomic_set(&inst->id_sequence, 0); /* needs to be two, since we _put() after creation */ // 初始引用计数为2,因为nfqnl_recv_config()会释放掉一次 atomic_set(&inst->use, 2); spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); if (!try_module_get(THIS_MODULE)) goto out_free; // 将该队列实例添加到总的队列HASH链表中 hlist_add_head(&inst->hlist, &instance_table[instance_hashfn(queue_num)]); write_unlock_bh(&instances_lock); QDEBUG("successfully created new instance\n"); return inst; out_free: kfree(inst); out_unlock: write_unlock_bh(&instances_lock); return NULL; } 其中nf_queue_handler定义如下, 主要是定义数据进入协议队列函数,这个就是数据包进入nf_queue的 进入点: static struct nf_queue_handler nfqh = { .name = 
"nf_queue", .outfn = &nfqnl_enqueue_packet, }; static int nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, unsigned int queuenum, void *data) { int status = -EINVAL; struct sk_buff *nskb; struct nfqnl_instance *queue; struct nfqnl_queue_entry *entry; QDEBUG("entered\n"); // // 根据队列号找到队列的实例,并增加计数 queue = instance_lookup_get(queuenum); if (!queue) { QDEBUG("no queue instance matching\n"); return -EINVAL; } // 如果该子队列拷贝模式是NFQNL_COPY_NONE,出错返回 if (queue->copy_mode == NFQNL_COPY_NONE) { QDEBUG("mode COPY_NONE, aborting\n"); status = -EAGAIN; goto err_out_put; } // 分配一个队列项entry entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) { if (net_ratelimit()) printk(KERN_ERR "nf_queue: OOM in nfqnl_enqueue_packet()\n"); status = -ENOMEM; goto err_out_put; } entry->info = info; entry->skb = skb; // 数据包的ID是顺序增加的 entry->id = atomic_inc_return(&queue->id_sequence); // 构建一个netlink协议的skb包 nskb = nfqnl_build_packet_message(queue, entry, &status); if (nskb == NULL) goto err_out_free; spin_lock_bh(&queue->lock); // pid是否存在,pid为0的进程不存在 if (!queue->peer_pid) goto err_out_free_nskb; // 队列长度是否过长 if (queue->queue_total >= queue->queue_maxlen) { queue->queue_dropped++; status = -ENOSPC; if (net_ratelimit()) printk(KERN_WARNING "ip_queue: full at %d entries, " "dropping packets(s). 
Dropped: %d\n", queue->queue_total, queue->queue_dropped); goto err_out_free_nskb; } /* nfnetlink_unicast will either free the nskb or add it to a socket */ // 将新构造的netlink数据包发送给上层的netlink套接字 status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); if (status < 0) { queue->queue_user_dropped++; goto err_out_unlock; } // 将队列项entry放入队列 __enqueue_entry(queue, entry); spin_unlock_bh(&queue->lock); // 减少队列计数 instance_put(queue); return status; err_out_free_nskb: kfree_skb(nskb); err_out_unlock: spin_unlock_bh(&queue->lock); err_out_free: kfree(entry); err_out_put: instance_put(queue); return status; } // 构造netlink数据包 static struct sk_buff * nfqnl_build_packet_message(struct nfqnl_instance *queue, struct nfqnl_queue_entry *entry, int *errp) { unsigned char *old_tail; size_t size; size_t data_len = 0; struct sk_buff *skb; struct nfqnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; // entry info, 可得到inif,outif,hook等 struct nf_info *entinf = entry->info; // entry skb, 原始skb struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; unsigned int tmp_uint; QDEBUG("entered\n"); /* all macros expand to constant values at compile time */ // 头部固定长度 size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + + NFA_SPACE(sizeof(struct nfqnl_msg_packet_hdr)) + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ #ifdef CONFIG_BRIDGE_NETFILTER + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ #endif + NFA_SPACE(sizeof(u_int32_t)) /* mark */ + NFA_SPACE(sizeof(struct nfqnl_msg_packet_hw)) + NFA_SPACE(sizeof(struct nfqnl_msg_packet_timestamp)); // 数据包出网卡 outdev = entinf->outdev; spin_lock_bh(&queue->lock); switch (queue->copy_mode) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: // 这两种拷贝类型数据长度为0 data_len = 0; break; case NFQNL_COPY_PACKET: // 拷贝整个包 if (entskb->ip_summed == CHECKSUM_HW && (*errp = skb_checksum_help(entskb, outdev == NULL))) { // 校验和检查失败 
spin_unlock_bh(&queue->lock); return NULL; } if (queue->copy_range == 0 // 为0表示不限制拷贝范围长度 || queue->copy_range > entskb->len) // 拷贝限制大于数据包长 // 数据长度为实际数据包长度 data_len = entskb->len; else // 数据长度为限制的拷贝长度限制 data_len = queue->copy_range; // 将data_len对齐后添加包头长度 size += NFA_SPACE(data_len); break; default: *errp = -EINVAL; spin_unlock_bh(&queue->lock); return NULL; } spin_unlock_bh(&queue->lock); // 分配skb skb = alloc_skb(size, GFP_ATOMIC); if (!skb) goto nlmsg_failure; old_tail= skb->tail; // netlink信息头放在skb的tailroom中 nlh = NLMSG_PUT(skb, 0, 0, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, sizeof(struct nfgenmsg)); nfmsg = NLMSG_DATA(nlh); // 协议族 nfmsg->nfgen_family = entinf->pf; // 版本 nfmsg->version = NFNETLINK_V0; // 队列号 nfmsg->res_id = htons(queue->queue_num); // 包ID号 pmsg.packet_id = htonl(entry->id); // 硬件协议 pmsg.hw_protocol = htons(entskb->protocol); // nf的hook点 pmsg.hook = entinf->hook; NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); // 数据进入网卡 indev = entinf->indev; if (indev) { tmp_uint = htonl(indev->ifindex); #ifndef CONFIG_BRIDGE_NETFILTER NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); #else if (entinf->pf == PF_BRIDGE) { // 如果是桥协议族,填入物理网卡和进入网卡参数 /* Case 1: indev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), &tmp_uint); /* this is the bridge group "brX" */ tmp_uint = htonl(indev->br_port->br->dev->ifindex); NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ // 填入输入网卡信息 NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); if (entskb->nf_bridge && entskb->nf_bridge->physindev) { // 如果存在桥信息和物理进入网卡信息,填入 tmp_uint = htonl(entskb->nf_bridge->physindev->ifindex ); NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), &tmp_uint); } } #endif } // 数据包发出网卡 if (outdev) { tmp_uint = htonl(outdev->ifindex); #ifndef 
CONFIG_BRIDGE_NETFILTER // 没定义桥模块时直接填入发出网卡信息 NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); #else if (entinf->pf == PF_BRIDGE) { // 桥协议组, 分别填入物理发出网卡和发出网卡信息 /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint), &tmp_uint); /* this is the bridge group "brX" */ tmp_uint = htonl(outdev->br_port->br->dev->ifindex); NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); } else { /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ // 填入发出网卡信息 NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); if (entskb->nf_bridge && entskb->nf_bridge->physoutdev) { // 如果存在桥信息和物理发出网卡信息,填入 tmp_uint = htonl(entskb->nf_bridge->physoutdev- >ifindex); NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint), &tmp_uint); } } #endif } if (entskb->nfmark) { // 如果数据包MARK值不为0, 填入 tmp_uint = htonl(entskb->nfmark); NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint); } if (indev && entskb->dev && entskb->dev->hard_header_parse) { // 填入输入网卡的硬件信息 struct nfqnl_msg_packet_hw phw; phw.hw_addrlen = entskb->dev->hard_header_parse(entskb, phw.hw_addr); phw.hw_addrlen = htons(phw.hw_addrlen); NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); } if (entskb->tstamp.off_sec) { // 时间戳 struct nfqnl_msg_packet_timestamp ts; ts.sec = cpu_to_be64(entskb->tstamp.off_sec); ts.usec = cpu_to_be64(entskb->tstamp.off_usec); NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); } if (data_len) { // 填入数据包长, 以struct nfattr结构方式 struct nfattr *nfa; int size = NFA_LENGTH(data_len); if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) { printk(KERN_WARNING "nf_queue: no tailroom!\n"); goto nlmsg_failure; } nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); nfa->nfa_type = NFQA_PAYLOAD; nfa->nfa_len = size; if (skb_copy_bits(entskb, 0, NFA_DATA(nfa), data_len)) BUG(); } // netlink信息长度,新tail减老的tail值 nlh->nlmsg_len = skb->tail - 
old_tail; return skb; nlmsg_failure: nfattr_failure: if (skb) kfree_skb(skb); *errp = -EINVAL; if (net_ratelimit()) printk(KERN_ERR "nf_queue: error creating packet message\n"); return NULL; } 3.2.4 登记nfqueue netlink设备通知 static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; // 只处理设备释放事件,如果网卡DOWN了,就会进行相关处理 /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) nfqnl_dev_drop(dev->ifindex); return NOTIFY_DONE; } /* drop all packets with either indev or outdev == ifindex from all queue * instances */ static void nfqnl_dev_drop(int ifindex) { int i; QDEBUG("entering for ifindex %u\n", ifindex); /* this only looks like we have to hold the readlock for a way too long * time, issue_verdict(), nf_reinject(), ... - but we always only * issue NF_DROP, which is processed directly in nf_reinject() */ read_lock_bh(&instances_lock); // 查找所有队列 for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *tmp; struct nfqnl_instance *inst; struct hlist_head *head = &instance_table[i]; hlist_for_each_entry(inst, tmp, head, hlist) { struct nfqnl_queue_entry *entry; while ((entry = find_dequeue_entry(inst, dev_cmp, ifindex)) != NULL) // 一旦数据包的进入或发出网卡是DOWN掉的网卡,就丢弃该数据包 issue_verdict(entry, NF_DROP); } } read_unlock_bh(&instances_lock); } // 比较设备,不论是in还是out的设备,只要和ifindex符合的就匹配成功 static int dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex) { struct nf_info *entinf = entry->info; if (entinf->indev) if (entinf->indev->ifindex == ifindex) return 1; if (entinf->outdev) if (entinf->outdev->ifindex == ifindex) return 1; return 0; } 3.2.5 /proc 就是以前介绍的2.6.*中用于实现/proc只读文件的seq操作 static struct file_operations nfqnl_file_ops = { .owner = THIS_MODULE, .open = nfqnl_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; static int nfqnl_open(struct inode *inode, struct file 
*file) { struct seq_file *seq; struct iter_state *is; int ret; is = kzalloc(sizeof(*is), GFP_KERNEL); if (!is) return -ENOMEM; // 打开nfqueue netlink的顺序操作 // 文件内容就是16个HASH表中的各项的参数,最多65536项 ret = seq_open(file, &nfqnl_seq_ops); if (ret < 0) goto out_free; seq = file->private_data; seq->private = is; return ret; out_free: kfree(is); return ret; } static int seq_show(struct seq_file *s, void *v) { const struct nfqnl_instance *inst = v; // 该/proc文件中最大可能会有65536行, 每行表示一个子queue的信息 return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", inst->queue_num, inst->peer_pid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, atomic_read(&inst->id_sequence), atomic_read(&inst->use)); } 3.3 NFQUEUE目标 该目标很简单,返回一个无符号32位值,该值的生成就是提供一个16位的队列号,然后左移16位作为结果的 高16位,低16位置为NF_QUEUE(3). #define NF_VERDICT_MASK 0x0000ffff #define NF_VERDICT_BITS 16 #define NF_VERDICT_QMASK 0xffff0000 #define NF_VERDICT_QBITS 16 #define NF_QUEUE_NR(x) (((x << NF_VERDICT_QBITS) & NF_VERDICT_QMASK) | NF_QUEUE) static unsigned int target(struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, unsigned int hooknum, const struct xt_target *target, const void *targinfo, void *userinfo) { const struct xt_NFQ_info *tinfo = targinfo; return NF_QUEUE_NR(tinfo->queuenum); } 在iptables命令行就可以将指定的数据包设置为进入指定的子队列,例: iptables -A INPUT -s 1.1.1.1 -d 2.2.2.2 -j NFQUEUE --queue-num 100 将从1.1.1.1到2.2.2.2的包发送到子队列100. 3.4 NFQUEUE包处理 和正常netfilter数据包处理一样, 要进行NFQUEUE的数据包也进入nf_hook_slow()函数处理: /* net/netfilter/core.c */ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int hook_thresh) { ...... // 对于NFQUEUE的包,看verdict的低16位是否为NF_QUEUE } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { NFDEBUG("nf_hook: Verdict = QUEUE.\n"); // 进入nf_queue进行处理 if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn, verdict >> NF_VERDICT_BITS)) goto next_hook; ...... 
/* net/netfilter/nf_queue.c */ // nf_queue()函数和以前2.4基本是相同的,从这里是看不出ip_queue和nf_queue的区别, // 每个协议族还是只有一个QUEUE的handler,但这时挂接的nf_queue的handler // 的处理函数nfqnl_enqueue_packet() /* * Any packet that leaves via this function must come back * through nf_reinject(). */ int nf_queue(struct sk_buff **skb, struct list_head *elem, int pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), unsigned int queuenum) { int status; struct nf_info *info; #ifdef CONFIG_BRIDGE_NETFILTER struct net_device *physindev = NULL; struct net_device *physoutdev = NULL; #endif struct nf_afinfo *afinfo; /* QUEUE == DROP if noone is waiting, to be safe. */ read_lock(&queue_handler_lock); if (!queue_handler[pf]) { read_unlock(&queue_handler_lock); kfree_skb(*skb); return 1; } afinfo = nf_get_afinfo(pf); if (!afinfo) { read_unlock(&queue_handler_lock); kfree_skb(*skb); return 1; } info = kmalloc(sizeof(*info) + afinfo->route_key_size, GFP_ATOMIC); if (!info) { if (net_ratelimit()) printk(KERN_ERR "OOM queueing packet %p\n", *skb); read_unlock(&queue_handler_lock); kfree_skb(*skb); return 1; } *info = (struct nf_info) { (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; /* If it's going away, ignore hook. */ if (!try_module_get(info->elem->owner)) { read_unlock(&queue_handler_lock); kfree(info); return 0; } /* Bump dev refs so they don't vanish while packet is out */ if (indev) dev_hold(indev); if (outdev) dev_hold(outdev); #ifdef CONFIG_BRIDGE_NETFILTER if ((*skb)->nf_bridge) { physindev = (*skb)->nf_bridge->physindev; if (physindev) dev_hold(physindev); physoutdev = (*skb)->nf_bridge->physoutdev; if (physoutdev) dev_hold(physoutdev); } #endif afinfo->saveroute(*skb, info); status = queue_handler[pf]->outfn(*skb, info, queuenum, queue_handler[pf]->data); read_unlock(&queue_handler_lock); if (status < 0) { /* James M doesn't say fuck enough. 
*/ if (indev) dev_put(indev); if (outdev) dev_put(outdev); #ifdef CONFIG_BRIDGE_NETFILTER if (physindev) dev_put(physindev); if (physoutdev) dev_put(physoutdev); #endif module_put(info->elem->owner); kfree(info); kfree_skb(*skb); return 1; } return 1; } 4. 结论 nf_queue扩展了ip_queue的功能,使用类似802.1q VLAN的技术,将数据包打上不同的“标签”使之归到不同的队列,而不再像ip_queue那样只支持一个队列,这样就可以使最多65536个应用程序接收内核数据包,从而分别进行更仔细的分类处理。