Personal notes on select and epoll

Reference link

https://www.cnblogs.com/lojunren/p/3856290.html

The article above contains a few mistakes: 1. The number of file descriptors a single poll() call can monitor is in fact bounded, because

typedef unsigned long int nfds_t;
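
For reference, a minimal user-space poll() sketch (an assumed example, not taken from the article) showing where nfds_t comes in: it is simply the count of entries in the pollfd array.

#include <poll.h>

/* Hypothetical helper: wait until fd is readable or timeout_ms elapses. */
int wait_readable(int fd, int timeout_ms)
{
    struct pollfd pfd = { .fd = fd, .events = POLLIN };
    int n = poll(&pfd, 1, timeout_ms);        /* the second argument is an nfds_t */

    if (n > 0 && (pfd.revents & POLLIN))
        return 1;                             /* fd is readable */
    return n;                                 /* 0 = timeout, -1 = error */
}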

Reference links for select

https://www.cnblogs.com/jaydenhpj/p/5121030.html

https://blog.csdn.net/diaozhiwa5526/article/details/102152201 (simpler version)

1. The calling thread is blocked inside select

int select(int maxfdp, fd_set *readfds, fd_set *writefds, fd_set *errorfds, struct timeval *timeout);

maxfdp: the highest-numbered file descriptor being monitored, plus 1

2. Is the fd_set you fill in before calling select the same thing as the one you test after it returns?

When adding an fd, a bit in the fd_set's internal array is set;
when checking after return, the readset is compared against that internal array.
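
A minimal user-space sketch (an assumed example, not from the referenced posts) makes the point concrete: the same fd_set is used to register interest before the call (FD_SET) and to read results after the call (FD_ISSET), which is why it has to be rebuilt on every loop iteration.

#include <sys/select.h>

/* Hypothetical helper: wait until fd becomes readable or the timeout expires. */
int wait_readable_select(int fd, long timeout_sec)
{
    fd_set readfds;
    struct timeval tv = { .tv_sec = timeout_sec, .tv_usec = 0 };

    FD_ZERO(&readfds);
    FD_SET(fd, &readfds);                     /* register interest: set fd's bit */

    int n = select(fd + 1, &readfds, NULL, NULL, &tv);   /* maxfdp = fd + 1 */
    if (n > 0 && FD_ISSET(fd, &readfds))      /* the kernel left the bit set => readable */
        return 1;
    return n;                                 /* 0 = timeout, -1 = error */
}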

3. When does select get woken up?

When a file descriptor becomes ready, the thread that called select is woken up from the device's wait queue;

then, back in do_select, the thread checks which file descriptors are ready and sets the corresponding bits in the kernel-side fd_set,

after which sys_select (the asmlinkage entry point shown below) copies the kernel-side fd_set back into user memory.

4. After being woken up, control eventually returns to the user-space select() call.
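
To make steps 3 and 4 concrete, here is a minimal sketch of the driver side this flow relies on. All my_dev_* names are made up for illustration; only the poll_wait() / wake_up_interruptible() pattern is the real kernel convention.

#include <linux/poll.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_dev_readq);  /* hypothetical device wait queue */
static int my_dev_data_ready;                  /* hypothetical "data available" flag */

/* The driver's poll method, reached from do_select via f_op->poll. */
static unsigned int my_dev_poll(struct file *file, poll_table *wait)
{
    unsigned int mask = 0;

    poll_wait(file, &my_dev_readq, wait);      /* hang the caller on the wait queue */
    if (my_dev_data_ready)
        mask |= POLLIN | POLLRDNORM;           /* report "readable" back to do_select */
    return mask;
}

/* Called from the driver's interrupt/receive path when new data arrives:
 * this is what wakes the task sleeping in schedule_timeout() inside do_select. */
static void my_dev_data_arrived(void)
{
    my_dev_data_ready = 1;
    wake_up_interruptible(&my_dev_readq);
}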

Source analysis (Linux 2.6.11):

asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 fd_set __user *exp, struct timeval __user *tvp)
{
    fd_set_bits fds;
    char *bits;
    long timeout;
    int ret, size, max_fdset;

    timeout = MAX_SCHEDULE_TIMEOUT;
    if (tvp) {
        time_t sec, usec;

        if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp)))
            || (ret = __get_user(sec, &tvp->tv_sec))
            || (ret = __get_user(usec, &tvp->tv_usec)))
            goto out_nofds;

        ret = -EINVAL;
        if (sec < 0 || usec < 0)
            goto out_nofds;

        if ((unsigned long) sec < MAX_SELECT_SECONDS) {
            timeout = ROUND_UP(usec, 1000000/HZ);
            timeout += sec * (unsigned long) HZ;
        }
    }

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fdset can increase, so grab it once to avoid race */
    max_fdset = current->files->max_fdset;
    if (n > max_fdset)
        n = max_fdset;

    ret = -ENOMEM;
    size = FDS_BYTES(n);
    bits = select_bits_alloc(size);
    if (!bits)
        goto out_nofds;
    fds.in      = (unsigned long *)  bits;
    fds.out     = (unsigned long *) (bits +   size);
    fds.ex      = (unsigned long *) (bits + 2*size);
    fds.res_in  = (unsigned long *) (bits + 3*size);
    fds.res_out = (unsigned long *) (bits + 4*size);
    fds.res_ex  = (unsigned long *) (bits + 5*size);

    /* Copy the read/write/exception bits for all fds of interest from user space to kernel space */
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    /* The core function */
    ret = do_select(n, &fds, &timeout);

    if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
        time_t sec = 0, usec = 0;
        if (timeout) {
            sec = timeout / HZ;
            usec = timeout % HZ;
            usec *= (1000000/HZ);
        }
        put_user(sec, &tvp->tv_sec);
        put_user(usec, &tvp->tv_usec);
    }

    if (ret < 0)
        goto out;
    if (!ret) {
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }

    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    select_bits_free(bits, size);
out_nofds:
    return ret;
}

sys_select is responsible for copying the user-space data into kernel space,

and after do_select returns it copies the kernel-side results back to user space.

do_select

int do_select(int n, fd_set_bits *fds, long *timeout)
{
    struct poll_wqueues table;
    poll_table *wait;
    int retval, i;
    long __timeout = *timeout;

    spin_lock(&current->files->file_lock);
    retval = max_select_fd(n, fds);
    spin_unlock(&current->files->file_lock);

    if (retval < 0)
        return retval;
    n = retval;

    poll_initwait(&table);
    wait = &table.pt;
    if (!__timeout)
        wait = NULL;
    retval = 0;
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

        /* Set the current task to TASK_INTERRUPTIBLE; it has not actually been scheduled out yet */
        set_current_state(TASK_INTERRUPTIBLE);

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            struct file_operations *f_op = NULL;
            struct file *file = NULL;

            /* Skip bit positions nobody asked about; this scanning is wasted time */
            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            if (all_bits == 0) {
                i += __NFDBITS;
                continue;
            }

            /* Loop over every bit of interest in this word */
            for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                if (i >= n)
                    break;
                if (!(bit & all_bits))
                    continue;
                file = fget(i);
                if (file) {
                    f_op = file->f_op;
                    mask = DEFAULT_POLLMASK;
                    if (f_op && f_op->poll)
                        /* Call the file's poll method: it hangs the current
                           task on the device's wait queue and registers a
                           wake-up function (the driver calls it when data
                           arrives), and it returns the fd's current readable/
                           writable/exception status (for sockets, sock_poll is
                           installed via socket_file_ops). */
                        mask = (*f_op->poll)(file, retval ? NULL : wait);
                    fput(file);
                    /* readable */
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                    }
                    /* writable */
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                    }
                    /* exception */
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                    }
                }
                /*
                 * Reschedule the task if necessary.
                 */
                cond_resched();
            }
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;
        }
        /* After the scan, check retval: if anything was readable/writable/exceptional, retval is non-zero and we leave the loop */
        wait = NULL;
        if (retval || !__timeout || signal_pending(current))
            break;
        if(table.error) {
            retval = table.error;
            break;
        }
        /* Nothing of interest was ready in the scan above. schedule_timeout()
           schedules the current task out (sleeping indefinitely if select was
           called with an infinite timeout). When the driver receives data it
           invokes the wake-up function registered via poll, the task wakes up,
           and the bits of interest are re-checked on the next loop iteration. */
        __timeout = schedule_timeout(__timeout);
    } // this brace closes the whole for (;;) loop
    __set_current_state(TASK_RUNNING);

    poll_freewait(&table);

    /*
     * Up-to-date the caller timeout.
     */
    *timeout = __timeout;
    return retval;
}

If the first pass finds that a file descriptor is not ready, the process calling select is put onto that device's wait queue;

if none of them are ready, the process blocks.

When any one of them becomes ready, the process is woken up and re-checks all of the file descriptors, then sets the result bits for those that are ready.

epoll source code:

int epoll_create(int size);

https://blog.csdn.net/justlinux2010/article/details/8506940

The size parameter really is ignored (it just has to be greater than zero)...

epoll_create allocates an eventpoll structure and binds it to a file.

struct eventpoll
{
    spinlock_t lock;             // protects access to this structure
    struct mutex mtx;            // prevents the structure from being removed while in use
    wait_queue_head_t wq;        // wait queue used by sys_epoll_wait()
    wait_queue_head_t poll_wait; // wait queue used by file->poll()
    struct list_head rdllist;    // doubly linked list of items whose events are ready
    struct rb_root rbr;          // red-black tree of all monitored fds
    struct epitem *ovflist;      // chains ready fds that arrive while events are being sent to user space
};

epoll_ctl:

https://blog.csdn.net/Mr_H9527/article/details/99745659

int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

Insert, delete, and modify operations all go through a lookup in the eventpoll structure's red-black tree.
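
As a quick user-space reminder of how the three calls fit together (an assumed example, not from the article): epoll_create builds the eventpoll instance, EPOLL_CTL_ADD inserts the fd into its red-black tree, and epoll_wait blocks until rdllist has entries.

#include <sys/epoll.h>

/* Hypothetical skeleton of an epoll event loop watching a single fd. */
int epoll_loop(int fd)
{
    int epfd = epoll_create(1);                    /* size is ignored, must be > 0 */
    if (epfd < 0)
        return -1;

    struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };
    if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0)   /* ep_insert: add to the rbtree */
        return -1;

    struct epoll_event ready[64];
    for (;;) {
        int n = epoll_wait(epfd, ready, 64, -1);   /* sleeps until rdllist is non-empty */
        if (n < 0)
            break;
        for (int i = 0; i < n; i++) {
            /* handle ready[i].data.fd here */
        }
    }
    return 0;
}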

The per-event data structure:

struct epitem
{
    struct rb_node rbn;          // node in the red-black tree of the owning eventpoll
    struct list_head rdllink;    // link in the ready-event list (rdllist)
    struct epitem *next;         // link used by the owning eventpoll's ovflist
    struct epoll_filefd ffd;     // the (file, fd) pair this item was created for
    int nwait;                   // number of wait queues attached when polling the fd
    struct list_head pwqlist;    // list of poll wait queue entries
    struct eventpoll *ep;        // the eventpoll this item belongs to
    struct list_head fllink;     // link in the file's list of epoll items
    struct epoll_event event;    // registered events of interest, i.e. the user-space epoll_event
};

The key part is the insert function below.

static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
             struct file *tfile, int fd)
{
    int error, revents, pwake = 0;
    unsigned long flags;
    struct epitem *epi;
    struct ep_pqueue epq;

    error = -ENOMEM;
    // Allocate an epitem to hold the fd being added
    if (!(epi = kmem_cache_alloc(epi_cache, SLAB_KERNEL)))
        goto eexit_1;

    /* Item initialization follow here ... */
    // Initialize the structure
    ep_rb_initnode(&epi->rbn);
    INIT_LIST_HEAD(&epi->rdllink);
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->txlink);
    INIT_LIST_HEAD(&epi->pwqlist);
    epi->ep = ep;
    ep_set_ffd(&epi->ffd, tfile, fd);
    epi->event = *event;
    atomic_set(&epi->usecnt, 1);
    epi->nwait = 0;

    /* Initialize the poll table using the queue callback */
    epq.epi = epi;


    // Install the poll queue callback; this line is extremely important
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

    /*
     * Attach the item to the poll hooks and get current event bits.
     * We can safely use the file* here because its usage count has
     * been increased by the caller of this function.
     */
    // Attach this item to the poll hooks and get the current event bits
    revents = tfile->f_op->poll(tfile, &epq.pt);

    /*
     * We have to check if something went wrong during the poll wait queue
     * install process. Namely an allocation for a wait queue failed due
     * high memory pressure.
     */
    if (epi->nwait < 0)
        goto eexit_2;

    /* Add the current item to the list of active epoll hook for this file */
    spin_lock(&tfile->f_ep_lock);
    list_add_tail(&epi->fllink, &tfile->f_ep_links);
    spin_unlock(&tfile->f_ep_lock);

    /* We have to drop the new item inside our item list to keep track of it */
    write_lock_irqsave(&ep->lock, flags);

    /* Add the current item to the rb-tree */
    ep_rbtree_insert(ep, epi);

    /* If the file is already "ready" we drop it inside the ready list */
    if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);

        /* Notify waiting tasks that events are available */
        if (waitqueue_active(&ep->wq))
            wake_up(&ep->wq);
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }

    write_unlock_irqrestore(&ep->lock, flags);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&psw, &ep->poll_wait);

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
             current, ep, tfile, fd));

    return 0;

eexit_2:
    ep_unregister_pollwait(ep, epi);

    /*
     * We need to do this because an event could have been arrived on some
     * allocated wait queue.
     */
    write_lock_irqsave(&ep->lock, flags);
    if (ep_is_linked(&epi->rdllink))
        ep_list_del(&epi->rdllink);
    write_unlock_irqrestore(&ep->lock, flags);

    kmem_cache_free(epi_cache, epi);
eexit_1:
    return error;
}

The most important line is this one:

    // Install the poll queue callback; this line is extremely important
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

ep_ptable_queue_proc registers ep_poll_callback as the wake-up callback on the device's wait queue. When hardware data arrives and the interrupt handler wakes the tasks waiting on that queue, ep_poll_callback is invoked; this function adds the event to rdllist.
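
Roughly, the two callbacks look like this. This is a simplified sketch reconstructed from the 2.6-era source for illustration, not a verbatim copy; field and helper names may differ slightly.

/* Called from tfile->f_op->poll() via poll_wait(): hook this epitem onto the
 * device's wait queue, with ep_poll_callback as its wake-up function. */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
    struct epitem *epi = container_of(pt, struct ep_pqueue, pt)->epi;
    struct eppoll_entry *pwq = kmem_cache_alloc(pwq_cache, SLAB_KERNEL);

    init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);  /* the wake-up function */
    pwq->whead = whead;
    pwq->base = epi;
    add_wait_queue(whead, &pwq->wait);            /* hang on the device's wait queue */
    list_add_tail(&pwq->llink, &epi->pwqlist);
    epi->nwait++;
}

/* Called by the device's wake_up() when data arrives: move the epitem onto
 * ep->rdllist and wake up anyone sleeping in epoll_wait(). */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct epitem *epi = container_of(wait, struct eppoll_entry, wait)->base;
    struct eventpoll *ep = epi->ep;

    if (!ep_is_linked(&epi->rdllink))
        list_add_tail(&epi->rdllink, &ep->rdllist);   /* mark this fd as ready */
    if (waitqueue_active(&ep->wq))
        wake_up(&ep->wq);                             /* wake the epoll_wait sleeper */
    return 1;
}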

https://blog.csdn.net/zhaobryant/article/details/80557262 (this post covers essentially every question about epoll)

Now let's look at epoll_wait:

asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
                   int maxevents, int timeout)
{
    int error;
    struct file *file;
    struct eventpoll *ep;

    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
             current, epfd, events, maxevents, timeout));

    /* The maximum number of event must be greater than zero */
    if (maxevents <= 0 || maxevents > MAX_EVENTS) // validate the maxevents argument
        return -EINVAL;

    /* Verify that the area passed by the user is writeable */
    // Check that the events buffer passed in from user space is writable
    if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
        error = -EFAULT;
        goto eexit_1;
    }

    /* Get the "struct file *" for the eventpoll file */
    error = -EBADF;
    file = fget(epfd); // get the file instance of the eventpoll file behind epfd (created in epoll_create)
    if (!file)
        goto eexit_1;

    /*
     * We have to check that the file structure underneath the fd
     * the user passed to us _is_ an eventpoll file.
     */
    error = -EINVAL;
    if (!is_file_epoll(file))
        goto eexit_2;

    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    ep = file->private_data;

    /* Time to fish for events ... */
    // The core processing function
    error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
    fput(file);
eexit_1:
    DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
             current, epfd, events, maxevents, timeout, error));

    return error;
}

Here is ep_poll:

static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
           int maxevents, long timeout)
{
    int res, eavail;
    unsigned long flags;
    long jtimeout;
    wait_queue_t wait;

    /*
     * Calculate the timeout by checking for the "infinite" value ( -1 )
     * and the overflow condition. The passed timeout is in milliseconds,
     * that why (t * HZ) / 1000.
     */
    jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
        MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
    write_lock_irqsave(&ep->lock, flags);

    res = 0;
    // If ep->rdllist is empty, block the current process
    if (list_empty(&ep->rdllist)) {
        /*
         * We don't have any available event to return to the caller.
         * We need to sleep here, and we will be wake up by
         * ep_poll_callback() when events will become available.
         */
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&ep->wq, &wait);

        for (;;) {
            /*
             * We don't want to sleep if the ep_poll_callback() sends us
             * a wakeup in between. That's why we set the task state
             * to TASK_INTERRUPTIBLE before doing the checks.
             */
            set_current_state(TASK_INTERRUPTIBLE);
            if (!list_empty(&ep->rdllist) || !jtimeout)
                break;
            if (signal_pending(current)) {
                res = -EINTR;
                break;
            }

            write_unlock_irqrestore(&ep->lock, flags);
            jtimeout = schedule_timeout(jtimeout);
            write_lock_irqsave(&ep->lock, flags);
        }
        remove_wait_queue(&ep->wq, &wait);

        set_current_state(TASK_RUNNING);
    }

    /* Is it worth to try to dig for events ? */
    eavail = !list_empty(&ep->rdllist);

    write_unlock_irqrestore(&ep->lock, flags);

    /*
     * Try to transfer events to user space. In case we get 0 events and
     * there's still timeout left over, we go trying again in search of
     * more luck.
     */
    if (!res && eavail &&
        !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
        goto retry;

    return res;
}
  1. epoll_wait calls ep_poll; when rdllist is empty (no fd is ready) the current process is put to sleep, and it is only woken once rdllist becomes non-empty.
  2. A change in an fd's state (its buffer goes from unreadable to readable, or from unwritable to writable) causes that fd's callback ep_poll_callback() to be invoked.
  3. ep_poll_callback adds the fd's epitem to rdllist; rdllist is now non-empty, the process is woken, and epoll_wait can continue.
  4. ep_events_transfer copies the epitems on rdllist to txlist and empties rdllist (see the sketch after this list).
  5. ep_send_events (the crucial step) scans every epitem on txlist and calls the poll method of its associated fd. This poll call only fetches the fd's latest events (in case they changed since the item was queued); those events and the fd are then sent to user space (packed into struct epoll_event and returned from epoll_wait). (https://blog.csdn.net/zhaobryant/article/details/80557262)
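
A rough sketch of step 4's ep_events_transfer in this era of the kernel (reconstructed for illustration; exact helper names and locking may differ from the real 2.6.11 source):

static int ep_events_transfer(struct eventpoll *ep,
                              struct epoll_event __user *events, int maxevents)
{
    int eventcnt = 0;
    struct list_head txlist;

    INIT_LIST_HEAD(&txlist);
    /* (locking omitted in this sketch) */

    /* Move up to maxevents ready epitems from ep->rdllist onto txlist */
    if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
        /* Re-poll each fd and copy the fresh events to user space */
        eventcnt = ep_send_events(ep, &txlist, events);
        /* LT items that are still ready are put back onto rdllist here */
        ep_reinject_items(ep, &txlist);
    }

    return eventcnt;
}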

What is the difference between ET and LT?

https://blog.csdn.net/eyucham/article/details/86502186

The essence is that LT takes the "blue line" path (in the diagram of the post linked above) every time, putting the fd back onto rdllist, whereas ET does not put it back.
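
The re-injection step behind this difference looks roughly like the following (based on the 2.6-era ep_reinject_items logic; illustrative, not verbatim source):

static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
{
    struct epitem *epi, *tmp;

    list_for_each_entry_safe(epi, tmp, txlist, txlink) {
        list_del(&epi->txlink);
        if (!(epi->event.events & EPOLLET) &&           /* level-triggered item...       */
            (epi->revents & epi->event.events) &&       /* ...whose events are still set */
            !ep_is_linked(&epi->rdllink))
            list_add_tail(&epi->rdllink, &ep->rdllist); /* put it back: the next epoll_wait
                                                           will report it again */
        /* ET items are never re-added here; they only return to rdllist
         * when ep_poll_callback fires on a new event. */
    }
}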

Reposted from blog.csdn.net/m0_37313888/article/details/105183172