<Linux Kernel>eventepoll3

现在来看看 epoll_wait 这个大家公认最应该阻塞的函数。从这里开始，我会通过源码来分析一些大家对 epoll 理解比较模糊的地方。

/*
 * epoll_wait system call entry point.
 *
 * Validates the arguments, resolves @epfd to its struct file, checks that
 * the file really is an eventpoll file and then delegates to ep_poll().
 *
 * @epfd:      file descriptor expected to refer to an eventpoll file
 * @events:    user-space array that receives the ready events
 * @maxevents: capacity of @events; must be in the range 1..EP_MAX_EVENTS
 * @timeout:   wait limit, passed through unchanged to ep_poll()
 *
 * Returns whatever ep_poll() returns, or -EINVAL for a bad @maxevents or a
 * non-eventpoll file, -EFAULT if @events is not writable, -EBADF if @epfd
 * does not resolve to an open file.
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;

	/* The number of requested events must be positive and bounded. */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* The whole user buffer must be writable before any work is done. */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
		error = -EFAULT;
		goto error_return;
	}

	/* Take a reference on the struct file behind the descriptor. */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto error_return;

	/*
	 * Make sure the file behind the descriptor is an eventpoll file
	 * before its private_data is interpreted as a struct eventpoll.
	 */
	error = -EINVAL;
	if (!is_file_epoll(file))
		goto error_fput;

	/* The eventpoll instance is stored in the file's private_data. */
	ep = file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

error_fput:
	/* Drop the reference taken by fget() on every path that obtained it. */
	fput(file);
error_return:

	return error;
}

好了，现在来看看 ep_poll(ep, events, maxevents, timeout) 这个亲切的函数吧。

/*
 * ep_poll - wait for ready events on an eventpoll instance and deliver them.
 *
 * @ep:        the eventpoll instance to wait on
 * @events:    user-space array that receives the ready events
 * @maxevents: capacity of @events
 * @timeout:   wait limit; a negative value, or one at or above
 *             EP_MAX_MSTIMEO, selects MAX_SCHEDULE_TIMEOUT
 *
 * Returns the result of ep_send_events() when events were available,
 * -EINTR when a signal was pending while waiting, or 0 when the wait ended
 * with nothing to report.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Convert the timeout to scheduler ticks, rounding up (the + 999
	 * before dividing by 1000). Negative or too-large values are mapped
	 * to MAX_SCHEDULE_TIMEOUT.
	 */
	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
	/* ep->lock protects the ready list while it is inspected. */
	spin_lock_irqsave(&ep->lock, flags);

	res = 0;

	/* Nothing is ready yet. */
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		init_waitqueue_entry(&wait, current);
		wait.flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue(&ep->wq, &wait);

		for (;;) {
			/*
			 * Mark the task as sleeping *before* re-checking the
			 * conditions below, so that a wake-up arriving in
			 * between the check and the sleep is not lost.
			 */
			set_current_state(TASK_INTERRUPTIBLE);

			/* Stop waiting once something is ready or no time is left. */
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;

			/* A pending signal aborts the wait with -EINTR. */
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/*
			 * Release the lock (restoring the interrupt state), let
			 * the scheduler run and keep the value returned by
			 * schedule_timeout() as the new remaining timeout.
			 */
			spin_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);

			/* Re-take the lock before looking at the ready list again. */
			spin_lock_irqsave(&ep->lock, flags);
		}

		/* Done waiting: leave the wait queue and become runnable again. */
		__remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/*
	 * While still holding the lock, decide whether it is worth digging
	 * for events: either the ready list is non-empty or ovflist is active.
	 */
	eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. If that yields 0 events and
	 * there is still timeout left, go back and wait again.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}

接下来就看看这个 ep_send_events 到底干了什么

/*
 * ep_send_events - package the caller's buffer description and run the
 * ready-list scan with ep_send_events_proc() as the per-scan callback.
 *
 * @ep:        the eventpoll instance whose ready list is scanned
 * @events:    user-space array that receives the ready events
 * @maxevents: capacity of @events
 *
 * The on-stack struct ep_send_events_data is handed to the callback through
 * the opaque "priv" argument of ep_scan_ready_list().
 *
 * Returns the value returned by ep_scan_ready_list().
 */
static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data send_args;

	send_args.events = events;
	send_args.maxevents = maxevents;

	return ep_scan_ready_list(ep, ep_send_events_proc, &send_args);
}

/*
 * ep_scan_ready_list - hand the current ready list to a callback and merge
 * back whatever is left, plus anything that became ready in the meantime.
 *
 * @ep:    the eventpoll instance
 * @sproc: callback invoked with @ep, the detached ready list and @priv
 * @priv:  opaque pointer passed through to @sproc
 *
 * Returns the value returned by @sproc.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv)
{
	int error, pwake = 0;
	unsigned long flags;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	/*
	 * Hold ep->mtx for the whole scan so that it does not run
	 * concurrently with eventpoll_release_file() and epoll_ctl().
	 */
	mutex_lock(&ep->mtx);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	/* Move everything from ep->rdllist onto the private txlist. */
	list_splice_init(&ep->rdllist, &txlist);
	/* NULL (as opposed to EP_UNACTIVE_PTR) marks ovflist as in use. */
	ep->ovflist = NULL;
	spin_unlock_irqrestore(&ep->lock, flags);

	/* Run the callback on the detached list, without ep->lock held. */
	error = (*sproc)(ep, &txlist, priv);

	spin_lock_irqsave(&ep->lock, flags);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/* Queue the item on the ready list unless it is already linked. */
		if (!ep_is_linked(&epi->rdllink))
			list_add_tail(&epi->rdllink, &ep->rdllist);
	}
	/* The overflow list is no longer needed; mark it inactive again. */
	ep->ovflist = EP_UNACTIVE_PTR;

	/* Put back whatever the callback left on txlist. */
	list_splice(&txlist, &ep->rdllist);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Wake up waiters on ep->wq right away (the lock is held, hence
		 * wake_up_locked()). For ep->poll_wait only record that a
		 * wake-up is needed; it is issued after the locks are dropped.
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	mutex_unlock(&ep->mtx);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return error;
}

/*
 * ep_send_events_proc - callback that copies ready events to user space.
 *
 * @ep:   the eventpoll instance
 * @head: list of ready items to process (the detached list that
 *        ep_scan_ready_list() passes to its callback)
 * @priv: points to the struct ep_send_events_data set up by
 *        ep_send_events(), carrying the user buffer and its capacity
 *
 * On a failed copy to user space it returns the number of events already
 * delivered, or -EFAULT if none were delivered.
 */
static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
       void *priv)

{
struct ep_send_events_data *esed = priv;
int eventcnt;
unsigned int revents;
struct epitem *epi;
struct epoll_event __user *uevent;
 

/* This runs from ep_scan_ready_list(), i.e. with ep->mtx held by the caller. */
/*
 * uevent walks the user-space struct epoll_event array handed down from
 * epoll_wait; head is the list detached from ep->rdllist. The loop never
 * delivers more than esed->maxevents events.
 */
for (eventcnt = 0, uevent = esed->events;
     !list_empty(head) && eventcnt < esed->maxevents;) {
/* Take the first struct epitem on the list. */
epi = list_first_entry(head, struct epitem, rdllink);
/* Unlink it from the list now that it is being processed. */
list_del_init(&epi->rdllink);
/* Ask the target file's poll method for its current events and keep only the ones this item is registered for. */
revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
epi->event.events;
 
/* Only items that actually have events to report are copied to user space. */
if (revents) {
if (__put_user(revents, &uevent->events) ||
    __put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
uevent++;
/* For EPOLLONESHOT items, strip the event mask down to the bits in EP_PRIVATE_BITS. */

if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
/*
 * Note: an item that is not edge-triggered (no EPOLLET) is put back on
 * ep->rdllist so it is examined again on the next pass.
 */
else if (!(epi->event.events & EPOLLET)) {
list_add_tail(&epi->rdllink, &ep->rdllist);

}
}
/* NOTE(review): the listing is cut off here; the end of the for loop, the final return and the function's closing brace are not shown. */

<Linux Kernel>eventepoll3

猜你喜欢