select系统调用源码分析

select系统调用

主要对select调用过程进行一些总结，后边也会有poll和epoll相关的一系列分析。

本文分析的select系统调用源码版本是2.6.25。

select系统调用的用途是：在一段指定时间内，监听用户感兴趣的文件描述符上的可读、可写和异常等事件。

1. select系统调用的原型如下：

#include <sys/select.h>
int select( int nfds, fd_set* readfds, fd_set* writefds, fd_set* exceptfds, struct timeval* timeout );

主要调用流程如下：

2. select流程

2.1 sys_select

/*
 * sys_select - entry point of the select() system call.
 *
 * @n:    value of nfds passed by the caller
 * @inp:  user-space fd_set watched for readability (may be NULL)
 * @outp: user-space fd_set watched for writability (may be NULL)
 * @exp:  user-space fd_set watched for exceptional conditions (may be NULL)
 * @tvp:  user-space timeout, or NULL for "no timeout supplied"
 *
 * Converts the user timeval into a tick count (units of 1/HZ second),
 * delegates the real work to core_sys_select(), then writes the remaining
 * time back to *tvp.
 *
 * Returns the value produced by core_sys_select(), -EFAULT if the timeout
 * cannot be read from user space, or -EINVAL if it contains a negative field.
 */
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
						   fd_set __user *exp, struct timeval __user *tvp)
{
	s64 timeout = -1;	/* -1 means: wait indefinitely */
	struct timeval tv;
	int ret;

	if (tvp) {	/* a timeout was supplied */
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		if (tv.tv_sec < 0 || tv.tv_usec < 0)	/* invalid time */
			return -EINVAL;

		/* Cast to u64 to make GCC stop complaining */
		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
			timeout = -1;	/* too large to represent: wait indefinitely */
		else {
			/* microseconds are rounded up to whole ticks */
			timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
			timeout += tv.tv_sec * HZ;
		}
	}

	/* The main work is done in core_sys_select() */
	ret = core_sys_select(n, inp, outp, exp, &timeout);

	if (tvp) {	/* a timeout was supplied */
		struct timeval rtv;

		/* Bug-emulation personality flag; not covered in detail here */
		if (current->personality & STICKY_TIMEOUTS)
			goto sticky;

		/*
		 * rtv receives the remaining time: do_div() leaves the quotient
		 * (whole seconds) in timeout and returns the remainder (ticks).
		 */
		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
		rtv.tv_sec = timeout;

		/* Never report more remaining time than was originally requested */
		if (timeval_compare(&rtv, &tv) >= 0)
			rtv = tv;

		/* Copy the updated time back to user space */
		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
sticky:
			/*
			 * Reached when the timeval was not written back (copy
			 * failed, or the sticky path skipped it).
			 * -ERESTARTNOHAND marks an interrupted system call;
			 * it is reported as -EINTR here.
			 */
			if (ret == -ERESTARTNOHAND)
				ret = -EINTR;
		}
	}

	return ret;
}

2.2 core_sys_select

在28行处调用core_sys_select函数，然后处理三个fd_set参数，调用do_select，将返回的结果集，返回到用户空间。

首先，fd_set结构体仅包含一个整型数组，该数组的每一位（bit）标记一个文件描述符。fd_set能容纳的文件描述符数量由FD_SETSIZE指定，这也就限制了select能同时处理的文件描述符的总量。

采用一系列宏来访问fd_set结构体中的位：

#include <sys/select.h>

FD_ZERO ( fd_set *fdset );                    /* 清除fdset的所有位 */
FD_CLR ( int fd, fd_set *fdset);             /* 清除位fd */
FD_SET ( int fd, fd_set *fdset );            /* 设置fdset的位fd*/
int FD_ISSET ( int fd, fd_set *fdset);       /* 测试fdset的位fd是否被设置过*/

然后，来看core_sys_select，

static int  core_sys_select（int  n，fd_set __user * inp，fd_set __user * outp，   
                           fd_set __user * exp，s64 * timeout）  
{  
    fd_set_bits fds;  
    void  * bits;  
    int  ret，max_fds;  
    struct  fdtable * fdt;  
    ······（省略部分）
    / * get_fd_set仅仅调用调用copy_from_user从用户空间拷贝了FD_SET * /  
    if  （（ret = get_fd_set（n，inp，fds.in））||  
        （ret = get_fd_set（n，outp，fds.out））||  
        （ret = get_fd_set（n，exp，fds.ex）））  
         goto out;  
  
    zero_fd_set（n，fds.res_in）;  //清零
    zero_fd_set（n，fds.res_out）;  
    zero_fd_set（n，fds.res_ex）;  
  
    ret = do_select（n，＆fds，timeout）;  
     ......
     ......
    / *把结果集，拷贝回用户空间* /  
    if  （set_fd_set（n，inp，fds.res_in）||  
        set_fd_set（n，outp，fds.res_out）||  
        set_fd_set（n，exp，fds.res_ex））  
        ret = -EFAULT;  
  
out：  
    if（bits！= stack_fds）  
        kfree（bits）; / *对应上面的kmalloc的* /  
out_nofds：  
    return ret;  
}

跟着core_sys_select函数调用树来进行分析，

do_select是select中比较重要的函数，

/*
 * do_select - core polling loop of select().
 *
 * @n:       number of descriptors to examine (replaced below by the value
 *           returned from max_select_fd())
 * @fds:     input bitmaps (in/out/ex) and result bitmaps (res_in/res_out/res_ex)
 * @timeout: in: time to wait; out: time left when the function returns
 *
 * Repeatedly calls the poll method of every descriptor selected in the input
 * bitmaps and records ready descriptors in the result bitmaps. Sleeps between
 * passes until something is ready, the timeout runs out, a signal is pending
 * or the wait table reports an error.
 *
 * Returns the number of ready bits recorded, a negative value from
 * max_select_fd(), or table.error.
 *
 * NOTE(review): the callers shown in this article pass an s64 * while this
 * listing takes long * - the listings appear to come from different kernel
 * versions; confirm against the tree you are reading.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i;
	long __timeout = *timeout;

	spin_lock(&current->files->file_lock);
	retval = max_select_fd(n, fds);
	spin_unlock(&current->files->file_lock);

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	if (!__timeout)		/* zero timeout: never register on wait queues */
		wait = NULL;
	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

		/* Mark the current process as sleeping interruptibly */
		set_current_state(TASK_INTERRUPTIBLE);

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* Walk all fds, one unsigned long of bitmap per outer iteration */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			struct file_operations *f_op = NULL;
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				/*
				 * __NFDBITS is (8 * sizeof(unsigned long)), i.e. the
				 * number of bits in a long. One long covers __NFDBITS
				 * descriptors, so skipping to the next bitmap word
				 * advances i by __NFDBITS.
				 */
				i += __NFDBITS;
				continue;
			}

			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				/*
				 * Look up the file object for descriptor i and take a
				 * reference on it.
				 */
				file = fget(i);
				if (file) {
					f_op = file->f_op;
					mask = DEFAULT_POLLMASK;
					/*
					 * Once something is already ready (retval != 0)
					 * NULL is passed instead of wait.
					 */
					if (f_op && f_op->poll)
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					fput(file);	/* drop the reference taken by fget() */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
				cond_resched();
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}
		/* Every pass after the first calls poll with a NULL wait table */
		wait = NULL;
		if (retval || !__timeout || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}
		/* Put the current process to sleep for __timeout jiffies */
		__timeout = schedule_timeout(__timeout);
	}
	__set_current_state(TASK_RUNNING);	/* back to the running state */

	poll_freewait(&table);	/* clean up the wait queues */
	*timeout = __timeout;
	return retval;
}

其中，

static int max_select_fd(unsigned long n, fd_set_bits *fds);

max_select_fd是返回在fd_set中已经打开的、并且小于用户指定最大值的fd.

poll_initwait(&table)调用，将当前进程放进自己的等待队列table，并将该等待队列加入到该测试表wait中。

/*
 * poll_initwait - put a poll_wqueues into its initial state.
 *
 * @pwq: wait-queue bookkeeping structure to initialise
 *
 * Leaves @pwq with no table, no recorded error, and __pollwait installed
 * in its embedded poll table through init_poll_funcptr().
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	pwq->table = NULL;
	pwq->error = 0;
	init_poll_funcptr(&pwq->pt, __pollwait);
}

其中init_poll_funcptr函数等同于 pwq->pt.qproc = __pollwait;

socket描述符，f_op->poll对应的函数是sock_poll,

函数中第三个参数wait是等待队列，在poll成功之后会将本进程唤醒并执行。

/*
 * sock_poll - poll entry for socket files.
 *
 * @file: file object being polled
 * @wait: poll table forwarded unchanged to the socket's own poll method
 *
 * Returns whatever sock->ops->poll() returns.
 */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
	/*
	 * By convention, file->private_data of a socket file holds the
	 * pointer to the corresponding struct socket.
	 */
	struct socket *sock = file->private_data;

	return sock->ops->poll(file, sock, wait);
}

poll方法是poll、epoll和select这三个系统调用的后端实现。这三个系统调用可用来查询某个或多个文件描述符上的读取或写入是否会被阻塞。

poll方法应该返回一个位掩码，用来指出非阻塞的读取或写入是否可能。并且也会向内核提供将调用进程置于休眠状态直到IO变为可能时的信息。
如果 驱动程序将poll方法定义为NULL，则设备会被认为既可读也可写，并且不会阻塞。

对于do_select函数中cond_resched()调用，如果有必要，就重新调度。

在cond_resched之前，进行set_current_state(TASK_INTERRUPTIBLE)，因为已经进入TASK_INTERRUPTIBLE状态，所以cond_resched会调度其他进程来运行。

这里的cond_resched的目的纯粹是为了增加一个抢占点。被抢占后，由等待队列机制唤醒。

另外，在支持抢占式调度的内核中（定义了CONFIG_PREEMPT），cond_resched是空操作。

signal_pending(current)表示如果进程描述符所表示的进程有非阻塞的挂起信号，就返回1；否则返回0.

该函数只是通过检查进程的TIF_SIGPENDING标志来实现。

static inline int signal_pending(struct task_struct *p) 
/* 表示目前进程有信号需要处理

内部调用的是，

unlikely(test_tsk_thread_flag(p, TIF_SIGPENDING))*/

3. select模型的特点：

（1）可监控的文件描述符个数取决于sizeof(fd_set)的值。

（2）将fd加入select监控集的同时，还要使用一个数据结构array保存放到select监控集中的fd,

一是用于在select返回后，array作为源数据和fd_set进行FD_ISSET判断；（FD_ISSET判断是否有事件发生）

二是select返回后会把之前加入的但无事件发生的fd清空，因此每次开始select前都要重新从array中取出fd逐一加入（先执行FD_ZERO），扫描array的同时取得fd最大值maxfd，用于select的第一个参数。

（3）select模型必须在select前循环array（进行加fd，取maxfd），select返回后循环array。

（4）由于每次调用都返回整个用户注册的事件集合（包含就绪的和未就绪的），所以应用程序索引文件描述符的时间复杂度为O（n）。而epoll则相对高效，会在内核中维护一个事件表，并提供了一个独立系统调用epoll_ctl来控制事件的添加、删除和修改，每次epoll_wait调用都直接从该内核事件表中取得用户注册的事件，而无须反复从用户空间读入这些事件，直接返回就绪的事件，使得应用程序索引就绪文件描述符的时间复杂度为O（1）.（下篇博客中会进行详细说明）

（5）采用轮询方式来检测就绪事件，时间复杂度为O（n）；相对epoll而言，采用回调方式检测就绪事件，时间复杂度为O（1）。

select适合于连接数量少，但活动连接较多的情况；而epoll则适用于连接数量多，但活动连接较少的情况。

4. select的缺点

（1）select的参数类型fd_set没有将文件描述符和事件绑定，仅仅是一个文件描述符集合，不能处理更多的事件；

（2）每次调用select，都需要把fd集合从用户态拷贝到内核态，这个开销在fd很多时会特别大；

（3）同时每次调用select都需要在内核遍历传递进来的所有fd，在fd很多时开销较多；

（4）select支持的文件描述符数量由FD_SETSIZE指定，默认是1024；

（5）由于内核通过对fd_set集合的在线修改，来反馈其中的就绪事件，使得应用程序每次调用select前不得不重置这3个fd_set集合（可读、可写和异常事件）

参考博客：http://zhangyafeikimi.iteye.com/blog/248815

资料书籍：《Linux高性能服务器编程》

select系统调用源码分析

select系统调用

猜你喜欢