第一篇博客我们简单介绍了下dpvs的编译,安装和部署使用,今天我们一起来走读下dpvs的主流程。
一、main函数总体概述
main函数的代码如下
int main(int argc, char *argv[])
{
int err, nports;
portid_t pid;
struct netif_port *dev;
struct timeval tv;
char pql_conf_buf[LCORE_CONF_BUFFER_LEN];
int pql_conf_buf_len = LCORE_CONF_BUFFER_LEN;
uint32_t loop_cnt = 0;
int timer_sched_loop_interval;
/**
* add application agruments parse before EAL ones.
* use it like the following:
* ./dpvs -v
* OR
* ./dpvs -- -n 4 -l 0-11 (if you want to use eal arguments)
*/
err = parse_app_args(argc, argv);
if (err < 0) {
fprintf(stderr, "fail to parse application options\n");
exit(EXIT_FAILURE);
}
argc -= err, argv += err;
/* check if dpvs is running and remove zombie pidfile */
/*检测dpvs是否已经运行*/
if (dpvs_running(DPVS_PIDFILE)) {
fprintf(stderr, "dpvs is already running\n");
exit(EXIT_FAILURE);
}
dpvs_state_set(DPVS_STATE_INIT);
gettimeofday(&tv, NULL);
srandom(tv.tv_sec ^ tv.tv_usec ^ getpid());
/*检测numa架构的socket数是否大于dpvs设置的最大值*/
if (get_numa_nodes() > DPVS_MAX_SOCKET) {
fprintf(stderr, "DPVS_MAX_SOCKET is smaller than system numa nodes!\n");
return -1;
}
if (set_all_thread_affinity() != 0) {
fprintf(stderr, "set_all_thread_affinity failed\n");
exit(EXIT_FAILURE);
}
/*dpdk eal环境的初始化*/
err = rte_eal_init(argc, argv);
if (err < 0)
rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n");
argc -= err, argv += err;
RTE_LOG(INFO, DPVS, "dpvs version: %s, build on %s\n", DPVS_VERSION, DPVS_BUILD_DATE);
/*dpdk 时间子系统初始化*/
rte_timer_subsystem_init();
/*配置文件初始化*/
if ((err = cfgfile_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail init configuration file: %s\n",
dpvs_strerror(err));
/*添加虚拟网络设备*/
if ((err = netif_virtual_devices_add()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail add virtual devices:%s\n",
dpvs_strerror(err));
/*定时器初始化,分为作用在slave核心的定时器和全局定时器*/
if ((err = dpvs_timer_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail init timer on %s\n", dpvs_strerror(err));
/*限速功能初始化*/
if ((err = tc_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init traffic control: %s\n",
dpvs_strerror(err));
/*网卡设备的相关初始化*/
if ((err = netif_init(NULL)) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init netif: %s\n", dpvs_strerror(err));
/* Default lcore conf and port conf are used and may be changed here
* with "netif_port_conf_update" and "netif_lcore_conf_set" */
/*控制平面初始化*/
if ((err = ctrl_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init ctrl plane: %s\n",
dpvs_strerror(err));
/*限速功能的控制面初始化*/
if ((err = tc_ctrl_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init tc control plane: %s\n",
dpvs_strerror(err));
/*vlan初始化*/
if ((err = vlan_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init vlan: %s\n", dpvs_strerror(err));
/*轻量级的ip协议栈,包括ip,arp,icmp,route,inet_addr等*/
if ((err = inet_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init inet: %s\n", dpvs_strerror(err));
/*网卡的Flow Director功能,根据不同的过滤条件,将流分入不同的队列*/
if ((err = sa_pool_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init sa_pool: %s\n", dpvs_strerror(err));
/*ip隧道初始化*/
if ((err = ip_tunnel_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init tunnel: %s\n", dpvs_strerror(err));
/**/
if ((err = dp_vs_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init ipvs: %s\n", dpvs_strerror(err));
if ((err = dpvs_firewall_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init firewall: %s\n", dpvs_strerror(err));
/*netif模块的控制面初始化*/
if ((err = netif_ctrl_init()) != EDPVS_OK)
rte_exit(EXIT_FAILURE, "Fail to init netif_ctrl: %s\n",
dpvs_strerror(err));
/* config and start all available dpdk ports */
nports = rte_eth_dev_count();
for (pid = 0; pid < nports; pid++) {
dev = netif_port_get(pid);
if (!dev) {
RTE_LOG(WARNING, DPVS, "port %d not found\n", pid);
continue;
}
/*配置发送队列,接收队列,将网卡配置ok后,启动*/
err = netif_port_start(dev);
if (err != EDPVS_OK)
RTE_LOG(WARNING, DPVS, "Start %s failed, skipping ...\n",
dev->name);
}
/* print port-queue-lcore relation */
netif_print_lcore_conf(pql_conf_buf, &pql_conf_buf_len, true, 0);
RTE_LOG(INFO, DPVS, "\nport-queue-lcore relation array: \n%s\n",
pql_conf_buf);
/* start data plane threads */
/*启动数据面线程,用于处理从网卡上接收到的数据*/
netif_lcore_start();
/* write pid file */
if (!pidfile_write(DPVS_PIDFILE, getpid()))
goto end;
/*定时器调度的精度*/
timer_sched_loop_interval = dpvs_timer_sched_interval_get();
assert(timer_sched_loop_interval > 0);
dpvs_state_set(DPVS_STATE_NORMAL);
/* start control plane thread */
while (1) {
/* reload configuations if reload flag is set */
try_reload();
/* IPC loop */
sockopt_ctl(NULL);
/* msg loop */
msg_master_process();
/* timer */
loop_cnt++;
if (loop_cnt % timer_sched_loop_interval == 0)
rte_timer_manage();
/* kni 处理kni接口上的数据*/
kni_process_on_master();
/* process mac ring on master */
neigh_process_ring(NULL);
/* increase loop counts */
netif_update_master_loop_cnt();
}
end:
dpvs_state_set(DPVS_STATE_FINISH);
if ((err = netif_ctrl_term()) !=0 )
rte_exit(EXIT_FAILURE, "Fail to term netif_ctrl: %s\n",
dpvs_strerror(err));
if ((err = dp_vs_term()) != EDPVS_OK)
RTE_LOG(ERR, DPVS, "Fail to term ipvs: %s\n", dpvs_strerror(err));
if ((err = ip_tunnel_term()) != EDPVS_OK)
RTE_LOG(ERR, DPVS, "Fail to term tunnel: %s\n", dpvs_strerror(err));
if ((err = sa_pool_term()) != EDPVS_OK)
RTE_LOG(ERR, DPVS, "Fail to term sa_pool: %s\n", dpvs_strerror(err));
if ((err = inet_term()) != EDPVS_OK)
RTE_LOG(ERR, DPVS, "Fail to term inet: %s\n", dpvs_strerror(err));
if ((err = dpvs_timer_term()) != EDPVS_OK)
RTE_LOG(ERR, DPVS, "Fail to term timer: %s\n", dpvs_strerror(err));
if ((err = ctrl_term()) != 0)
RTE_LOG(ERR, DPVS, "Fail to term ctrl plane\n");
if ((err = netif_term()) != 0)
RTE_LOG(ERR, DPVS, "Fail to term route\n");
if ((err = cfgfile_term()) != 0)
RTE_LOG(ERR, DPVS, "Fail to term configuration file: %s\n",
dpvs_strerror(err));
pidfile_rm(DPVS_PIDFILE);
exit(0);
}
parse_app_args():解析应用程序的参数,可自行添加代码
set_all_thread_affinity():将main线程设置成系统最终的状态,即可被调度到每个cpu核心上
rte_eal_init():dpdk eal环境抽象层的初始化
rte_timer_subsystem_init():时间子系统的初始化;要使用dpdk的定时器,必须先调用此函数。
cfgfile_init():配置文件的初始化,从配置文件中读取配置,这个我后续单开一篇博客来讲解dpvs的配置文件的设计
dpvs_timer_init():定时器初始化,分为每个slave核心的定时器 和全局定时器
tc_init():限速功能初始化
netif_init():网卡设备的相关初始化
ctrl_init():控制平面初始化
tc_ctrl_init():限速功能的控制面初始化
vlan_init():vlan初始化
inet_init():轻量级的ip协议栈,包括ip,arp,icmp,route,inet_addr等
sa_pool_init():网卡的Flow Director功能,根据不同的过滤条件,将流分入不同的队列
ip_tunnel_init():ip隧道初始化,ip隧道也是ddos防护的一种方式
dp_vs_init():dpvs内部的初始化,udp,tcp,icmp协议的初始化,连接表connlist的初始化,黑名单的初始化等等。
netif_ctrl_init():netif模块的控制面初始化
/* Walk every DPDK port reported by the ethdev layer and bring it up. */
nports = rte_eth_dev_count();
for (pid = 0; pid < nports; pid++) {
    dev = netif_port_get(pid);
    if (dev == NULL) {
        /* no netif_port registered for this port id: warn and move on */
        RTE_LOG(WARNING, DPVS, "port %d not found\n", pid);
    } else {
        /* set up the tx/rx queues and, once configured, start the port */
        err = netif_port_start(dev);
        if (err != EDPVS_OK) {
            RTE_LOG(WARNING, DPVS, "Start %s failed, skipping ...\n",
                    dev->name);
        }
    }
}
调用rte_eth_dev_count()获取系统可用的端口数,然后对每个端口调用netif_port_start(dev)函数,其流程如下:
1. rte_eth_dev_configure根据rte_eth_conf结构体设置网卡的属性,如发送队列数、接收队列数
2. 创建发送队列,创建接收队列
3. netif_print_port_conf打印conf配置信息,防止手误配置出错
4.build_port_queue_lcore_map函数是建立端口-队列-cpu逻辑核心三者之间的映射关系。
5.rte_eth_dev_start函数来启动设备
6.获取网卡的启动状态。rte_eth_link_get_nowait函数用于获取链路层的状态,非等待版本。
7.如果开启了混杂模式,则调用rte_eth_promiscuous_enable函数开启混杂模式。
将重点介绍下面几个模块的初始化过程
二、netif_init模块初始化
/*
 * Initialise the netif module by running its sub-initialisers in a fixed
 * order.
 *
 * conf: port configuration handed to netif_port_init(); NULL selects the
 *       default port configuration.
 *
 * Always returns EDPVS_OK: none of the helper calls below has its result
 * checked here.
 */
int netif_init(const struct rte_eth_conf *conf)
{
/* cache the timer frequency (cycles per second) */
cycles_per_sec = rte_get_timer_hz();
/* create the mbuf pools, one per NUMA socket, used for received packets */
netif_pktmbuf_pool_init();
/* create the rings used to receive arp packets */
netif_arp_ring_init();
/* initialise pkt_type_tab, the list table holding the packet types */
netif_pkt_type_tab_init();
/* initialise the job list netif_lcore_jobs */
netif_lcore_jobs_init();
/* use default port conf if conf=NULL */
netif_port_init(conf);
/* per-lcore initialisation; the step the surrounding article focuses on */
netif_lcore_init();
return EDPVS_OK;
}
重点讲解下netif_lcore_init函数
首先check_lcore_conf函数,检测下conf配置是否正确
/*
 * Register the built-in lcore loop jobs: packet receive/forward, packet
 * transmit and timer management. All three are NETIF_LCORE_JOB_LOOP jobs
 * without private data.
 */
snprintf(netif_jobs[0].name, sizeof(netif_jobs[0].name) - 1, "%s", "recv_fwd");
netif_jobs[0].func = lcore_job_recv_fwd;
netif_jobs[0].data = NULL;
netif_jobs[0].type = NETIF_LCORE_JOB_LOOP;

snprintf(netif_jobs[1].name, sizeof(netif_jobs[1].name) - 1, "%s", "xmit");
netif_jobs[1].func = lcore_job_xmit;
netif_jobs[1].data = NULL;
netif_jobs[1].type = NETIF_LCORE_JOB_LOOP;

snprintf(netif_jobs[2].name, sizeof(netif_jobs[2].name) - 1, "%s", "timer_manage");
netif_jobs[2].func = lcore_job_timer_manage;
netif_jobs[2].data = NULL;
netif_jobs[2].type = NETIF_LCORE_JOB_LOOP;

for (ii = 0; ii < NETIF_JOB_COUNT; ii++) {
    res = netif_lcore_loop_job_register(&netif_jobs[ii]);
    if (res < 0) {
        /* rte_exit() terminates the process and never returns, so no
         * statement is needed (or reachable) after it */
        rte_exit(EXIT_FAILURE,
                "[%s] Fail to register netif lcore jobs, exiting ...\n", __func__);
    }
}
在每个cpu逻辑核心上注册job任务来执行,关于这块的详细分析请参考链接
https://blog.csdn.net/haolipengzhanshen/article/details/82414350
三、ctrl_init控制模块初始化
/**
 * Initialise the control plane.
 *
 * Sets up the rwlock guarding the multicast wait queue, then the message
 * module, then the sockopt module.
 *
 * @return EDPVS_OK on success, otherwise the negative code returned by the
 *         first sub-module that failed.
 */
int ctrl_init(void)
{
    int rc;

    /* lock protecting the multicast wait queue */
    rte_rwlock_init(&mc_wait_lock);

    /* message resources come first ... */
    rc = msg_init();
    if (unlikely(rc < 0)) {
        RTE_LOG(ERR, MSGMGR, "%s: msg module initialization failed!\n", __func__);
        return rc;
    }

    /* ... followed by the sockopt side of the control plane */
    rc = sockopt_init();
    if (unlikely(rc < 0)) {
        RTE_LOG(ERR, MSGMGR, "%s: sockopt module initialization failed!\n", __func__);
        return rc;
    }

    return EDPVS_OK;
}
转到msg_init函数,看看具体实现
/**
 * Initialise the inter-lcore message module.
 *
 * Steps, in order: record the master lcore and the slave lcore set, reset
 * the per-lcore message-type lists and their locks, reset the multicast wait
 * queue, create one message ring per lcore, register the slave control job,
 * and finally register and log the built-in message types.
 *
 * @return EDPVS_OK on success;
 *         EDPVS_NOTSUPP if DPVS_MAX_LCORE exceeds MSG_MAX_LCORE_SUPPORTED;
 *         EDPVS_INVAL if more than 64 slave lcores are reported;
 *         EDPVS_DPDKAPIFAIL if a ring cannot be created;
 *         otherwise the negative code from the job registration.
 */
static inline int msg_init(void)
{
    int cid, type;
    int rc;
    char ring_name[16];
    char type_table[4096];

    if (DPVS_MAX_LCORE > MSG_MAX_LCORE_SUPPORTED)
        return EDPVS_NOTSUPP;

    /* lcore bookkeeping: start from a clean slate, then ask netif for the
     * number and bitmask of the configured slave lcores */
    slave_lcore_mask = 0;
    slave_lcore_nb = 0;
    master_lcore = rte_get_master_lcore();
    netif_get_slave_lcores(&slave_lcore_nb, &slave_lcore_mask);
    if (slave_lcore_nb > 64) {
        RTE_LOG(ERR, MSGMGR, "%s: only %d lcores supported for ctrl\n", __func__, 64);
        return EDPVS_INVAL;
    }

    /* mt_array[lcore][type] and its companion lock mt_lock[lcore][type] */
    for (cid = 0; cid < DPVS_MAX_LCORE; cid++) {
        for (type = 0; type < DPVS_MSG_LEN; type++) {
            INIT_LIST_HEAD(&mt_array[cid][type]);
            rte_rwlock_init(&mt_lock[cid][type]);
        }
    }

    /* multicast wait queue */
    mc_wait_list.free_cnt = msg_mc_qlen;
    INIT_LIST_HEAD(&mc_wait_list.list);

    /* one message ring per lcore; flags are 0 (RING_F_SC_DEQ is not used) */
    for (cid = 0; cid < DPVS_MAX_LCORE; cid++) {
        snprintf(ring_name, sizeof(ring_name), "msg_ring_%d", cid);
        msg_ring[cid] = rte_ring_create(ring_name, msg_ring_size,
                                        rte_socket_id(), 0);
        if (unlikely(msg_ring[cid] == NULL)) {
            RTE_LOG(ERR, MSGMGR, "Fail to init ctrl !\n");
            return EDPVS_DPDKAPIFAIL;
        }
    }

    /* register the control-plane loop job for the slave lcores */
    snprintf(ctrl_lcore_job.name, sizeof(ctrl_lcore_job.name) - 1, "%s", "slave_ctrl_plane");
    ctrl_lcore_job.func = slave_lcore_loop_func;
    ctrl_lcore_job.data = NULL;
    ctrl_lcore_job.type = NETIF_LCORE_JOB_LOOP;
    rc = netif_lcore_loop_job_register(&ctrl_lcore_job);
    if (rc < 0) {
        RTE_LOG(ERR, MSGMGR, "%s: fail to register ctrl func on slave lcores\n", __func__);
        return rc;
    }

    /* built-in message types, then dump the resulting table to the log */
    register_built_in_msg();
    msg_type_table_print(type_table, sizeof(type_table));
    RTE_LOG(INFO, MSGMGR, "%s: built-in msg registered:\n%s\n", __func__, type_table);

    return EDPVS_OK;
}
1)初始化mt_array二维数组,用于存储不同lcore核心上的不同消息类型的dpvs_msg_type结构体元素,一维下标是cpu逻辑核心lcoreid,二维下标是消息类型。
2)初始化mt_lock二维数组,是给mt_array加锁用的,和mt_array一一对应。
3)初始化多播等待队列mc_wait_list
4)为每个lcore核心创建队列,用于接收消息
5)将ctrl_lcore_job结构体注册到系统中,在netif_loop中注册的回调函数会被调用。
6)register_built_in_msg为注册内置消息类型。消息类型有MSG_TYPE_REG和MSG_TYPE_UNREG,对应的回调函数是msg_type_reg_cb和msg_type_unreg_cb
四、inet_init轻量级ip-stack模块初始化
/**
 * Initialise the lightweight IP stack, one sub-module after another.
 *
 * Order: neighbour (arp), route, ipv4, icmp, inet address. The sequence
 * stops at the first sub-module that reports a non-zero code.
 *
 * @return EDPVS_OK when every sub-module succeeds, otherwise the code
 *         returned by the failing sub-module.
 */
int inet_init(void)
{
    int err;

    /* neighbour / arp */
    err = neigh_init();
    if (err != 0)
        return err;

    /* routing */
    err = route_init();
    if (err != 0)
        return err;

    /* ip protocol */
    err = ipv4_init();
    if (err != 0)
        return err;

    /* icmp protocol */
    err = icmp_init();
    if (err != 0)
        return err;

    /* network addresses */
    err = inet_addr_init();
    if (err != 0)
        return err;

    return EDPVS_OK;
}
看上图,下面分别介绍inet_init中每个模块的初始化
4.1 arp初始化
/**
 * Initialise the neighbour (arp) module.
 *
 * Resets the per-lcore neighbour hash tables, picks the lowest-numbered
 * slave lcore as the one that syncs with master (g_cid), records the
 * calling lcore as master_cid, registers the ARP packet handler and the
 * neighbour sockopts, sets up the neighbour rings and registers
 * neigh_process_ring as a slow lcore job.
 *
 * @return EDPVS_OK on success, otherwise the code returned by the failing
 *         registration call.
 */
static int arp_init(void)
{
    int i, j;
    int err;
    uint64_t lcore_mask = 0;
    lcoreid_t cid;

    for (i = 0; i < DPVS_MAX_LCORE; i++) {
        for (j = 0; j < ARP_TAB_SIZE; j++) {
            INIT_LIST_HEAD(&neigh_table[i][j]);
        }
    }

    /* choose one core to sync master */
    netif_get_slave_lcores(NULL, &lcore_mask);
    /* The mask is 64 bits wide, so test it with an unsigned 64-bit shift
     * and never shift by the mask width or more: a signed "1L << cid" is
     * undefined for bit 63 and for any cid beyond the width of long. */
    for (cid = 0; cid < DPVS_MAX_LCORE && cid < 8 * sizeof(lcore_mask); cid++) {
        if (lcore_mask & ((uint64_t)1 << cid)) {
            g_cid = cid;
            break;
        }
    }

    master_cid = rte_lcore_id();

    arp_pkt_type.type = rte_cpu_to_be_16(ETHER_TYPE_ARP);
    /* register the handler for ARP packets */
    if ((err = netif_register_pkt(&arp_pkt_type)) != EDPVS_OK)
        return err;

    /* register the control-plane get/set callbacks */
    if ((err = sockopt_register(&neigh_sockopts)) != EDPVS_OK)
        return err;

    /* initialise the neighbour rings */
    neigh_ring_init();

    /* get static arp entry from master: neigh_process_ring runs as a slow
     * job, skipping NEIGH_PROCESS_MAC_RING_INTERVAL loops between runs */
    snprintf(neigh_sync_job.name, sizeof(neigh_sync_job.name) - 1, "%s", "neigh_sync");
    neigh_sync_job.func = neigh_process_ring;
    neigh_sync_job.data = NULL;
    neigh_sync_job.type = NETIF_LCORE_JOB_SLOW;
    neigh_sync_job.skip_loops = NEIGH_PROCESS_MAC_RING_INTERVAL;
    err = netif_lcore_loop_job_register(&neigh_sync_job);
    if (err != EDPVS_OK)
        return err;

    return EDPVS_OK;
}
4.2 路由功能初始化
4.3 ip协议初始化
4.4 icmp协议初始化
4.5 网络地址初始化