select和poll实现及驱动支持

1. 应用层接口

1.1 select

原型：

int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);

参数：
nfds 是readfds、writefds、exceptfds中登记的文件描述符的最大值再加 1。
readfds 是监控文件非阻塞可读的描述符集合，也可以监控文件是否读到末尾（file descriptor is also ready on end-of-file）。
writefds 是监控文件非阻塞可写的描述符集合。
exceptfds 是监控文件异常的描述符集合。
timeout select超时时间，如果timeout里的值为0，则立即返回，如果该参数为NULL，则一直阻塞直到事件出现，如果设置了超时时间，但是在超时之前有事件发生，select返回，那么timeout里的值是剩余的时间。

文件描述符集合可用宏操作：

void FD_SET(int fd, fd_set *set)    将fd设置到set集合中
void FD_CLR(int fd, fd_set *set)    将fd从集合set中清除
void FD_ZERO(fd_set *set)           清空set集合
int FD_ISSET(int fd, fd_set *set)   测试set集合中fd是否被设置来判断是否可读、可写、出现异常

返回值：
当成功时，select返回readfds, writefds, exceptfds三个集合中文件可操作的文件描述符的总数，也就是三个集合中所有位的设置总数。例如readfds和writefds中设置了同一个文件，当该文件同时可读可写时select返回，那么返回值就是2。
当错误时返回-1，并设置errno，根据errno的值进行处理。
当返回值是0时，表示超时。

1.2 poll

原型：

int poll(struct pollfd *fds, nfds_t nfds, int timeout);

参数：
fds 是一个结构体数组，结构体如下：

/* One element of the array passed to poll(): a descriptor plus its requested and returned event masks. */
struct pollfd{
　　int fd;              // file descriptor to monitor
　　short events;        // requested events, built from the flags listed below; the last three (error) flags cannot be requested here
　　short revents;       // returned events, built from the flags listed below; cleared when no event occurred
};
POLLIN     普通或优先级带数据可读
POLLRDNORM 普通数据可读
POLLRDBAND 优先级带数据可读
POLLPRI    高优先级数据可读

POLLOUT    普通或优先级带数据可写
POLLWRNORM 普通数据可写
POLLWRBAND 优先级带数据可写

POLLERR    发生错误
POLLHUP    发生挂起
POLLNVAL   描述字不是一个打开的文件

nfds 是第一个参数fds数组中元素的个数
timeout 超时时间，单位是毫秒，当设置成0，立即返回，设置成-1，会一直阻塞直到有事件发生

返回值：
成功返回正整数，表示所有事件的数量，包括正常的可读写事件和错误事件；
超时返回0；
出错返回-1，检查errno得到错误原因。

2. 内核中实现

2.1 select

select在内核中的入口在fs/select.c中：

/*
 * Kernel entry point of the select() system call.
 *
 * If a timeout pointer is given, the timeval is copied in from user space
 * and converted into an end time; the real work is done by
 * core_sys_select(), and the result is then passed through
 * poll_select_copy_remaining().
 *
 * Returns -EFAULT if the timeval cannot be copied from user space,
 * -EINVAL if poll_select_set_timeout() rejects it, otherwise the value
 * produced by poll_select_copy_remaining().
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
        fd_set __user *, exp, struct timeval __user *, tvp)
{
    struct timespec end_time, *to = NULL;
    struct timeval tv;
    int ret;

    if (tvp) {                                            // a timeout argument was supplied
        if (copy_from_user(&tv, tvp, sizeof(tv)))         // read the struct timeval timeout from user space
            return -EFAULT;

        to = &end_time;
        if (poll_select_set_timeout(to,
                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))    // compute the point in time at which the wait expires and store it in end_time
            return -EINVAL;
    }

    ret = core_sys_select(n, inp, outp, exp, to);    // the first three set arguments map to select()'s fd sets; the last one is the computed expiry time (NULL when no timeout was given)
    ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);  // compute the remaining time

    return ret;
}

core_sys_select(n, inp, outp, exp, to)
    do_select(int n, fd_set_bits *fds, struct timespec *end_time)
        struct poll_wqueues table;
        poll_initwait(&table);          //将__pollwait函数赋值给table.pt._qproc，
        wait = &table.pt;
        for(;;) {
            for (i = 0; i < n; ++rinp, ++routp, ++rexp) {    //外层按long字遍历各集合；内层逐位遍历每个fd的循环（i在其中递增）在此省略
                mask = (*f_op->poll)(file, wait);             //调用文件操作中的poll函数
                if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;           // 文件可读标志位被设置，就把retval加1，从下面的判断可以看出，retval是这三个集合的总和
                        wait->_qproc = NULL;
                    }
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
            }
            ...
            if (retval || timed_out || signal_pending(current))        // 如果有结果，或者超时，或者有信号挂起，那么就返回
                break;
            poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)  //将进程状态设置成TASK_INTERRUPTIBLE，然后休眠
        }

再看上面这个流程中，是怎么将驱动的等待队列头加入等待队列的，又是怎么唤醒进程的。

首先在poll_initwait(&table)函数中设置了table.pt._qproc这个函数指针：

/*
 * Prepare a poll_wqueues for one select/poll call: install __pollwait as
 * the queueing callback, record the current task as the polling task, and
 * reset the remaining bookkeeping fields.
 */
void poll_initwait(struct poll_wqueues *pwq)        // pwq is the `table` variable of the caller
{
    init_poll_funcptr(&pwq->pt, __pollwait);        // sets table.pt._qproc to __pollwait
    pwq->polling_task = current;
    pwq->triggered = 0;
    pwq->error = 0;
    pwq->table = NULL;
    pwq->inline_index = 0;
}
/* Store the queueing callback in the poll_table and set its key to all ones. */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->_qproc = qproc;
    pt->_key   = ~0UL; /* all events enabled */
}

然后在文件操作的poll函数中调用poll_wait这个函数，实际上就是调用了上面设置的__pollwait函数：

/*
 * Helper called from a file's poll method. It forwards to the callback
 * stored in the poll_table, and does nothing when the table, its callback
 * or the wait-queue head is NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)           // wait_address is the wait-queue head defined by the driver; p is the struct poll_table_struct pointer that do_select passed to the poll method, i.e. the address of table.pt in do_select
{
    if (p && p->_qproc && wait_address)
        p->_qproc(filp, wait_address, p);      // the function invoked here is __pollwait
}

再看__pollwait函数做了什么：

/*
 * Queueing callback installed by poll_initwait() and reached through
 * poll_wait(). Obtains a poll_table_entry from the enclosing poll_wqueues,
 * fills it in, and adds its wait-queue item to the wait-queue head passed
 * by the driver. Returns silently when no entry can be obtained.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)       // filp is the file being polled, wait_address is the driver's wait-queue head, p points to table.pt in do_select
{
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);  // recover the struct poll_wqueues that embeds p; pwq therefore points to the `table` in do_select
    struct poll_table_entry *entry = poll_get_entry(pwq);
    if (!entry)
        return;
    get_file(filp);
    entry->filp = filp;
    entry->wait_address = wait_address;
    entry->key = p->_key;
    init_waitqueue_func_entry(&entry->wait, pollwake);  // initialise the wait item with pollwake as its function
    entry->wait.private = pwq;
    add_wait_queue(wait_address, &entry->wait);   // add this process's wait item (table.inline_entries[x].wait) to the wait-queue head list defined in the driver
}

这里将等待队列项加入驱动中定义的等待队列头，但是还没有睡眠，当程序从驱动的poll返回时，如果没有可以返回的条件，这时就会调用到：
poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)
进程执行到这个函数中会休眠：

/*
 * Put the polling task to sleep until it is woken or the timeout expires.
 *
 * The task state is set to `state`; the sleep is skipped when
 * pwq->triggered is already set. Afterwards the state is put back to
 * TASK_RUNNING and pwq->triggered is cleared for the next iteration.
 *
 * Returns -EINTR when the sleep was skipped, otherwise the result of
 * schedule_hrtimeout_range().
 */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
              ktime_t *expires, unsigned long slack)
{
    int rc = -EINTR;

    set_current_state(state);    // the callers shown in this note pass TASK_INTERRUPTIBLE, i.e. the sleep can be interrupted by a signal
    if (!pwq->triggered)
        rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);   // the process goes to sleep here
    __set_current_state(TASK_RUNNING);   // set the process state back to TASK_RUNNING

    /*
     * Prepare for the next iteration.
     *
     * The following set_mb() serves two purposes.  First, it's
     * the counterpart rmb of the wmb in pollwake() such that data
     * written before wake up is always visible after wake up.
     * Second, the full barrier guarantees that triggered clearing
     * doesn't pass event check of the next iteration.  Note that
     * this problem doesn't exist for the first iteration as
     * add_wait_queue() has full barrier semantics.
     */
    set_mb(pwq->triggered, 0);

    return rc;
}

2.2 poll

在内核中的入口在fs/select.c中：

/*
 * Kernel entry point of the poll() system call.
 *
 * A non-negative timeout_msecs is converted into an end time; a negative
 * value leaves `to` NULL (no timeout). The work is done by do_sys_poll().
 * When that returns -EINTR, the arguments (and the end time, if any) are
 * saved in the thread's restart block with do_restart_poll as the restart
 * function, and -ERESTART_RESTARTBLOCK is returned instead.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
        int, timeout_msecs)
{
    struct timespec end_time, *to = NULL;
    int ret;

    if (timeout_msecs >= 0) {
        to = &end_time;
        /* split the millisecond timeout into seconds and nanoseconds */
        poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
            NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
    }

    ret = do_sys_poll(ufds, nfds, to);

    if (ret == -EINTR) {
        struct restart_block *restart_block;

        /* record what is needed to call poll again via do_restart_poll */
        restart_block = &current_thread_info()->restart_block;
        restart_block->fn = do_restart_poll;
        restart_block->poll.ufds = ufds;
        restart_block->poll.nfds = nfds;

        if (timeout_msecs >= 0) {
            /* keep the already computed end time for the restart */
            restart_block->poll.tv_sec = end_time.tv_sec;
            restart_block->poll.tv_nsec = end_time.tv_nsec;
            restart_block->poll.has_timeout = 1;
        } else
            restart_block->poll.has_timeout = 0;

        ret = -ERESTART_RESTARTBLOCK;
    }
    return ret;
}

do_sys_poll(ufds, nfds, to)
    poll_initwait(&table);
    fdcount = do_poll(nfds, head, &table, end_time)
        for (;;) {
            for (walk = list; walk != NULL; walk = walk->next) {
                do_pollfd(pfd, pt)
                    mask = file->f_op->poll(file, pwait);
            }
            poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)
        }

可以看出poll在内核的实现基本是和select一样的，有些不同的就是select的fd_set参数是位掩码，因此fd_set有固定的长度，默认是1024，相对于select而言，poll的fds参数不是固定长度，是可变的。

3. 驱动支持

3.1 select、poll 驱动支持

根据上面的分析，驱动中的poll要做的工作是：先无条件调用poll_wait()，把调用poll的进程登记到该文件驱动定义的等待队列头上（是否真正加入队列由内核传入的poll_table决定）；然后判断该文件当前是否可读、可写或者异常，如果是，返回对应的标志位，如果都不满足，则返回0。
所以驱动要想支持select大概需要这样实现poll函数：

/*
 * Driver state used by the poll example: one readiness flag and one
 * wait-queue head per condition (readable, writable, exceptional).
 * NOTE(review): exec_flag and exce_queue both appear to abbreviate
 * "exception"; exec_flag is what the poll method maps to POLLPRI.
 */
int read_flag, write_flag, exec_flag;
static wait_queue_head_t read_queue;
static wait_queue_head_t write_queue;
static wait_queue_head_t exce_queue;
/*
 * NOTE(review): written as a sketch; presumably this stands for the
 * driver's module init function. It initialises the three wait-queue heads
 * before the poll method can use them.
 */
module_init()
{
    init_waitqueue_head(&read_queue);
    init_waitqueue_head(&write_queue);
    init_waitqueue_head(&exce_queue);
}
/*
 * Poll method of the example driver.
 *
 * Registers the caller on the read, write and exception wait queues via
 * poll_wait(), then reports the conditions whose flags are currently set.
 *
 * Returns a mask built from POLLIN|POLLRDNORM (read_flag set),
 * POLLOUT|POLLWRNORM (write_flag set) and POLLPRI (exec_flag set);
 * 0 when none of the flags is set.
 */
static unsigned int usb_mouse_poll(struct file *file, struct poll_table_struct *table)
{
    unsigned int events;

    /* Queue registration always comes first, whatever the current state. */
    poll_wait(file, &read_queue, table);
    poll_wait(file, &write_queue, table);
    poll_wait(file, &exce_queue, table);

    events  = read_flag  ? (POLLIN | POLLRDNORM)  : 0;   /* readable */
    events |= write_flag ? (POLLOUT | POLLWRNORM) : 0;   /* writable */
    events |= exec_flag  ? POLLPRI                : 0;   /* exceptional condition */

    return events;
}

实现了poll函数后，那么怎么唤醒进程呢，一般使用wake_up()或者wake_up_interruptible()，参数是驱动的等待队列头，调用这两个函数，就会唤醒挂在这个等待队列头上的所有task。