Linux2.6.9内核源码分析--eventpoll

Linux2.6.9内核源码分析–eventpoll

eventpoll

epoll分为三个系统调用:
long sys_epoll_create(int size);
long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event);
long sys_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout);

long sys_epoll_create(int size)

参数说明:
输入参数: int size

It opens an eventpoll file descriptor by suggesting a storage of “size”
file descriptors. The size parameter is just an hint about how to size
data structures. It won’t prevent the user to store more than “size”
file descriptors inside the epoll interface. It is the kernel part of
the userspace epoll_create(2)

返回值:long

当前进程中的一个文件描述符,该文件属于event poll文件系统

该系统调用分为下列步骤:

  1. int ep_getfd(int *efd, struct inode **einode, struct file **efile)
/*
 * ep_getfd - excerpt (parts elided with dotted lines) of the helper that
 * builds the file descriptor, inode and struct file for a new epoll instance.
 *
 * On the visible success path it stores the new descriptor in *efd, the inode
 * in *einode and the file in *efile, then returns 0. The eexit_* error labels
 * live in the elided parts of the function.
 */
static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
{
..........
	/* 1. Get an empty struct file. Per the original annotation this comes
	      from the slab via kmem_cache_alloc(filp_cachep, GFP_KERNEL). */
	file = get_empty_filp();
	if (!file)
		goto eexit_1;

	/* 2. Allocate an inode on the epoll filesystem (eventpoll_mnt).
	      error is loaded before the IS_ERR() test so the failure path
	      already carries the error code. */
	inode = ep_eventpoll_inode();
	error = PTR_ERR(inode);
	if (IS_ERR(inode))
		goto eexit_2;

	/* 3. Reserve an unused descriptor number; per the original annotation
	      this is a free index in the current task's current->files table.
	      A negative return value is an error code. */
	error = get_unused_fd();
	if (error < 0)
		goto eexit_3;
	fd = error;
..........
..........
	/*
	 * 4. Allocate a dentry under the root of eventpoll_mnt, bind dentry
	 *    and inode together with d_add(dentry, inode), then fill in the
	 *    struct file fields (dentry, mount, mapping, ...). The assignment
	 *    to pay attention to is:
	 *        file->f_op = &eventpoll_fops;
	 *        static struct file_operations eventpoll_fops = {
	 *              .release	= ep_eventpoll_close,
	 *              .poll		= ep_eventpoll_poll
	 *        };
	 *    NOTE(review): the original annotation says poll_wait ends up
	 *    calling ep_eventpoll_poll - confirm against the full source.
	 */
	dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
	if (!dentry)
		goto eexit_4;
	dentry->d_op = &eventpollfs_dentry_operations;
	d_add(dentry, inode);
	file->f_vfsmnt = mntget(eventpoll_mnt);
	file->f_dentry = dentry;
	file->f_mapping = inode->i_mapping;

	file->f_pos = 0;
	file->f_flags = O_RDONLY;
	file->f_op = &eventpoll_fops;
	file->f_mode = FMODE_READ;
	file->f_version = 0;
	/* private_data starts out NULL here. */
	file->private_data = NULL;

	/* Publish the struct file in the current task's descriptor table:
	   files->fd[fd] = file */
	fd_install(fd, file);
..........
	/* Hand the three freshly created objects back to the caller. */
	*efd = fd;
	*einode = inode;
	*efile = file;
	return 0;
	.........
}
  1. int ep_file_init(struct file *file)
/*
 * Allocate and initialise the struct eventpoll that backs an epoll file and
 * attach it to @file through file->private_data, so that it can later be
 * reached from the file descriptor via its struct file.
 *
 * Returns 0 on success, or -ENOMEM when the allocation fails.
 */
static int ep_file_init(struct file *file)
{
	struct eventpoll *evp = kmalloc(sizeof(*evp), GFP_KERNEL);

	if (!evp)
		return -ENOMEM;

	/* Start from an all-zero object before setting up individual members. */
	memset(evp, 0, sizeof(struct eventpoll));

	/* Item tree root and ready list. */
	evp->rbr = RB_ROOT;
	INIT_LIST_HEAD(&evp->rdllist);

	/* The two wait queue heads. */
	init_waitqueue_head(&evp->wq);
	init_waitqueue_head(&evp->poll_wait);

	/* Locks: the rw-semaphore and the rwlock. */
	init_rwsem(&evp->sem);
	rwlock_init(&evp->lock);

	file->private_data = evp;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
		     current, evp));
	return 0;
}
  1. 返回文件描述符fd
return fd;

long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)

The following function implements the controller interface for
the eventpoll file that enables the insertion/removal/change of
file descriptors inside the interest set

参数说明:
int epfd: epoll文件描述符
int op: insertion/removal/change
int fd: 要监控的文件描述符,如socket对应的文件描述符
struct epoll_event __user *event:用户传下来要监控的event类型

/* Event bits that can appear in an event mask (standard poll(2) meanings). */
#define POLLIN 0x0001	/* data is available to read */
#define POLLPRI 0x0002	/* urgent/priority data is available to read */
#define POLLOUT 0x0004	/* writing is possible */
#define POLLERR 0x0008	/* an error condition is pending */
#define POLLHUP 0x0010	/* hang-up */
#define POLLNVAL 0x0020	/* invalid request */

返回值:返回错误代码

epoll_ctl主要涉及插入/删除/修改 三个动作:

	/*
	 * Dispatch on the requested operation. epi is tested for NULL in every
	 * case; presumably it is the item already registered for the target
	 * file/fd, looked up before this switch (lookup not shown in this
	 * excerpt). An op that matches none of the cases leaves error untouched.
	 */
	switch (op) {
	case EPOLL_CTL_ADD:
		/* Insert only when no item exists yet, otherwise -EEXIST. */
		if (!epi) {
			/* POLLERR and POLLHUP are always added to the requested mask. */
			epds.events |= POLLERR | POLLHUP;

			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		/* Removing something that was never added yields -ENOENT. */
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		/* Modify needs an existing item; the mask again gets POLLERR | POLLHUP. */
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}

其中我们来重点追踪插入动作:

int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)

参数说明:
struct eventpoll *ep:通过输入参数epfd在当前进程描述符中找到struct file,再通过file->private_data字段获取到该对象
struct epoll_event *event:用户传入的要监控的event
struct file *tfile:输入参数fd所对应的struct file对象
int fd:要监控的文件描述符
返回值:返回错误代码

详细流程见代码注释:

/*
 * ep_insert - excerpt (parts elided with dotted lines) of the routine that
 * registers the target file @tfile / descriptor @fd with the eventpoll @ep,
 * using the event mask and data in @event.
 *
 * Visible steps: allocate and initialise an epitem, hook it onto the target
 * file's wait queue(s) by calling the file's poll method with a poll table
 * whose queueing callback is ep_ptable_queue_proc, link it into the file's
 * f_ep_links list and into ep's tree, and, if the file is already ready, put
 * it on ep's ready list and wake up waiters.
 *
 * Returns 0 on the visible success path; the eexit_* error paths are elided.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
...............
    /* Allocate a struct epitem, then initialise its embedded list heads */
	if (!(epi = EPI_MEM_ALLOC()))
		goto eexit_1;

	/* Item initialization follow here ... */
	/* RB-Tree node used to link this structure 
	   to the eventpoll rb-tree */
	EP_RB_INITNODE(&epi->rbn);
	/* List header used to link this structure 
	    to the eventpoll ready list */
	INIT_LIST_HEAD(&epi->rdllink);
	/* List header used to link this item
	   to the "struct file" items list */
	INIT_LIST_HEAD(&epi->fllink);
	/* List header used to link the item
	    to the transfer list */
	INIT_LIST_HEAD(&epi->txlink);
	/* List containing poll wait queues */
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;// link this epi back to the ep it belongs to
	EP_SET_FFD(&epi->ffd, tfile, fd);
	epi->event = *event;
	atomic_set(&epi->usecnt, 1);
	epi->nwait = 0;

	/* Install ep_ptable_queue_proc as the poll table's queueing callback
	   (i.e. epq.pt.qproc = ep_ptable_queue_proc) */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/* Taking a local (unix) socket as the example, the poll call below
	   unfolds as follows (walk-through by the article author; the quoted
	   snippets are abbreviated):
    1. The struct file created for a socket gets its f_op set with:
      file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops;
    2. static struct file_operations socket_file_ops = {
            ..........
         .poll =		sock_poll,
            ......... };
    3. static unsigned int sock_poll(struct file *file, poll_table * wait)
         {......
        sock = SOCKET_I(file->f_dentry->d_inode);
        //sock->ops = &unix_stream_ops;
        return sock->ops->poll(file, sock, wait);};
    4. static unsigned int unix_poll(struct file * file,
              struct socket  *sock, poll_table *wait) {
             ......
           poll_wait(file, sk->sk_sleep, wait);
             .......};
    5. static inline void poll_wait(struct file * filp, 
                                      wait_queue_head_t * wait_address,
                                      poll_table *p)  {
            if (p && wait_address)
	              p->qproc(filp, wait_address, p);};
    6. static void ep_ptable_queue_proc(struct file *file,
	                                   wait_queue_head_t *whead,
			                           poll_table *pt)  {
	       ........
			if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) {
				init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
				pwq->whead = whead;
				pwq->base = epi;
				add_wait_queue(whead, &pwq->wait);
				list_add_tail(&pwq->llink, &epi->pwqlist);
				epi->nwait++;
			} ........
       }
    7. So the wait queue entry pwq->wait (inside the eppoll_entry *pwq)
       ends up queued on sk->sk_sleep, and its wake-up function is
       ep_poll_callback. When does ep_poll_callback get invoked?
         kernel/sched.c  
         static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
			     int nr_exclusive, int sync, void *key)  {
                   ......
				if (curr->func(curr, mode, sync, key) &&
				    (flags & WQ_FLAG_EXCLUSIVE) &&
				    !--nr_exclusive)
					break;
			 }
         } 
       In that function, func is the epoll callback ep_poll_callback.
       This wake-up path is taken whenever the socket is read from,
       written to, closed or hits an error, which is how the monitoring
       is ultimately achieved.
    8. static int ep_poll_callback(wait_queue_t *wait, 
                                      unsigned mode, int sync,
                                       void *key)  {
              ........
	          list_add_tail(&epi->rdllink, &ep->rdllist);
	          ........
           } 
       Judging from this callback, it appends the current epi (one file
       descriptor object that was added to epoll) to the ep object's ready
       list; epoll_wait later returns the events of the epis found on ep's
       ready list to the user.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	if (epi->nwait < 0)
		goto eexit_2;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_ep_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_ep_lock);

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irqsave(&ep->lock, flags);

	/* Insert the epi into ep's red-black tree for fast lookup: e.g. when a
	   new file is inserted the tree is searched to see whether that file is
	   already present, and the remove and modify operations also start
	   with a tree lookup */
	ep_rbtree_insert(ep, epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		/* Only count the pending poll_wait wake-up here; it is delivered
		   after the lock has been released */
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
     write_unlock_irqrestore(&ep->lock, flags);
	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
		     current, ep, tfile, fd));

	return 0;
..........................
}

long sys_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, int timeout)

 /*
  * sys_epoll_wait - excerpt (parts elided with dotted lines) of the wait
  * system call. The visible part fetches the struct eventpoll stored in
  * file->private_data and delegates the actual waiting and event delivery to
  * ep_poll(), passing events, maxevents and timeout through unchanged.
  */
 long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
    ..........
	/* The eventpoll object hangs off the file's private_data. */
	ep = file->private_data;
	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);
    ........
}

1.如果ep->rdllist为空,就将当前进程挂起进入睡眠状态,直到ep_poll_callback把就绪的epi加入rdllist并唤醒进程,或者超时、收到信号为止;
2.如果醒来后rdllist非空,就将rdllist中的event转移到txlist中,再将txlist中的event上报给用户

/*
 * ep_poll - wait until @ep has ready items (or the timeout/a signal ends the
 * wait) and then transfer them to the user buffer.
 *
 * @ep:        the eventpoll object to wait on
 * @events:    user-space buffer handed on to ep_events_transfer()
 * @maxevents: handed on to ep_events_transfer()
 * @timeout:   timeout in milliseconds; -1 means wait without limit
 *
 * Returns -EINTR when a signal is pending while waiting, otherwise the result
 * of ep_events_transfer() if the ready list was non-empty, or 0 when nothing
 * was available.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value ( -1 )
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * that why (t * HZ) / 1000.
	 * The "+ 999" rounds the conversion up to a whole tick.
	 * NOTE(review): only -1 is mapped to MAX_SCHEDULE_TIMEOUT; any other
	 * negative value goes through the arithmetic branch - confirm that
	 * callers never pass one.
	 */
	jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
		MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;

retry:
	write_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		init_waitqueue_entry(&wait, current);
		add_wait_queue(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			/* Stop waiting once something is ready or the timeout is used up. */
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/*
			 * Drop the lock while sleeping; schedule_timeout() hands
			 * back the remaining timeout, which drives the next pass.
			 */
			write_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);
			write_lock_irqsave(&ep->lock, flags);
		}
		remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Is it worth to try to dig for events ? */
	eavail = !list_empty(&ep->rdllist);

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}

/*
 * Move ready items of @ep onto a private transfer list, report them to the
 * user buffer @events, then hand the transfer list to ep_reinject_items().
 * @maxevents is forwarded to ep_collect_ready_items().
 *
 * Returns the value produced by ep_send_events(), or 0 when no ready item
 * was collected.
 */
static int ep_events_transfer(struct eventpoll *ep,
			      struct epoll_event __user *events, int maxevents)
{
	struct list_head xfer;
	int sent = 0;

	INIT_LIST_HEAD(&xfer);

	/*
	 * Hold ep->sem for reading for the whole transfer: it guards against
	 * a concurrent eventpoll_release_file() or epoll_ctl(EPOLL_CTL_DEL).
	 */
	down_read(&ep->sem);

	/* Nothing collected: skip straight to releasing the semaphore. */
	if (ep_collect_ready_items(ep, &xfer, maxevents) <= 0)
		goto out_unlock;

	/* Build the result set in user space ... */
	sent = ep_send_events(ep, &xfer, events);

	/* ... and reinject the collected items into the ready list. */
	ep_reinject_items(ep, &xfer);

out_unlock:
	up_read(&ep->sem);
	return sent;
}

Linux2.6.9内核源码分析--eventpollLinux2.6.9内核源码分析--eventpoll 年轻态程序猿 发布了1 篇原创文章 · 获赞 0 · 访问量 15 私信 关注
上一篇:PXE无人值守安装


下一篇:df、du、fdisk、lsblk区别