/*
 * fs/eventpoll.c (Efficient event retrieval implementation)
 * Copyright (C) 2001,...,2009 Davide Libenzi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Davide Libenzi <[email protected]>
 *
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <asm/atomic.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epmutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (spinlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a spinlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * Then we also need a global mutex to serialize eventpoll_release_file()
 * and ep_free().
 * This mutex is acquired by ep_free() during the epoll file
 * cleanup path and it is also acquired by eventpoll_release_file()
 * if a file has been pushed inside an epoll set and it is then
 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
 * It is also acquired when inserting an epoll fd onto another epoll
 * fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock.
 * We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * about to do so.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epmutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
 * Events that require holding "epmutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * better scalability.
 */
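
/*
 * Illustrative sketch (not part of the original source): when all three
 * locks end up nested, e.g. while inserting one epoll fd into another,
 * the ordering described above looks roughly like this:
 *
 *	mutex_lock(&epmutex);			1) global: loop checks, release
 *	mutex_lock(&ep->mtx);			2) per-instance, may sleep
 *	spin_lock_irqsave(&ep->lock, flags);	3) innermost, IRQ safe
 *	...
 *	spin_unlock_irqrestore(&ep->lock, flags);
 *	mutex_unlock(&ep->mtx);
 *	mutex_unlock(&epmutex);
 */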

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)

/* Maximum number of nesting levels allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

struct epoll_filefd {
	struct file *file;
	int fd;
};

/*
 * Structure used to track possible nested calls, for too deep recursions
 * and loop cycles.
 */
struct nested_call_node {
	struct list_head llink;
	void *cookie;
	void *ctx;
};

/*
 * This structure is used as a collector for nested calls, to check for
 * maximum recursion depth and loop cycles.
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 */
struct epitem {
	/* RB tree node used to link this structure to the eventpoll RB tree */
	struct rb_node rbn;

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * singly linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/* Protect the access to this structure */
	spinlock_t lock;

	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* RB tree root used to store monitored fd structs */
	struct rb_root rbr;

	/*
	 * This is a singly linked list that chains all the "struct epitem"
	 * that had events occur while ready events were being transferred
	 * to userspace, without holding ->lock.
	 */
	struct epitem *ovflist;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;
};

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_t wait;

	/* The wait queue head that links the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static DEFINE_MUTEX(epmutex);

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;

/* Used to call file's f_op->poll() under the nested calls boundaries */
static struct nested_calls poll_readywalk_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long zero;
static long long_max = LONG_MAX;

ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif /* CONFIG_SYSCTL */

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1 :
		(p1->file < p2->file ? -1 : p1->fd - p2->fd));
}
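
/*
 * For illustration (not part of the original source): the comparison above
 * orders epitems first by the struct file pointer and then by fd, so two
 * watches on the same file, say { file = F, fd = 3 } and { file = F, fd = 7 },
 * end up as distinct, ordered keys in the RB tree.
 */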

/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct list_head *p)
{
	return !list_empty(p);
}

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
{
	return container_of(p, struct eppoll_entry, wait)->base;
}

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
	return container_of(p, struct ep_pqueue, pt)->epi;
}

/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
	return op != EPOLL_CTL_DEL;
}

/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
	INIT_LIST_HEAD(&ncalls->tasks_call_list);
	spin_lock_init(&ncalls->lock);
}

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Returns: Returns a value different from zero if ready events are available,
 *          or zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}

/**
 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 *                  that the recursion limit is not exceeded, and that
 *                  the same nested call (identified by its cookie) is
 *                  not re-entered.
 *
 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 * @max_nests: Maximum number of allowed nesting calls.
 * @nproc: Nested call core function pointer.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
 * @ctx: This instance context.
 *
 * Returns: Returns the code returned by the @nproc callback, or -1 if
 *          the maximum recursion limit has been exceeded.
 */
static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
			  int (*nproc)(void *, void *, int), void *priv,
			  void *cookie, void *ctx)
{
	int error, call_nests = 0;
	unsigned long flags;
	struct list_head *lsthead = &ncalls->tasks_call_list;
	struct nested_call_node *tncur;
	struct nested_call_node tnode;

	spin_lock_irqsave(&ncalls->lock, flags);

	/*
	 * Try to see if the current task is already inside this wakeup call.
	 * We use a list here, since the population inside this set is always
	 * very much limited.
	 */
	list_for_each_entry(tncur, lsthead, llink) {
		if (tncur->ctx == ctx &&
		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
			/*
			 * Oops ... loop detected or maximum nest level reached.
			 * We abort this wake by breaking the cycle itself.
			 */
			error = -1;
			goto out_unlock;
		}
	}

	/* Add the current task and cookie to the list */
	tnode.ctx = ctx;
	tnode.cookie = cookie;
	list_add(&tnode.llink, lsthead);

	spin_unlock_irqrestore(&ncalls->lock, flags);

	/* Call the nested function */
	error = (*nproc)(priv, cookie, call_nests);

	/* Remove the current task from the list */
	spin_lock_irqsave(&ncalls->lock, flags);
	list_del(&tnode.llink);
out_unlock:
	spin_unlock_irqrestore(&ncalls->lock, flags);

	return error;
}
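
/*
 * Usage sketch (for illustration only, the names below are made up):
 * a caller bounds a possibly re-entrant operation like this:
 *
 *	static int my_proc(void *priv, void *cookie, int call_nests)
 *	{
 *		// here it is safe to assume call_nests <= max_nests
 *		return 0;
 *	}
 *
 *	err = ep_call_nested(&my_ncalls, EP_MAX_NESTS, my_proc,
 *			     priv, cookie, ctx);
 *
 * "err" is -1 if the cookie is already on the current call chain for this
 * ctx (a loop), or if more than EP_MAX_NESTS calls are already nested;
 * otherwise it is whatever my_proc() returned.
 */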

#ifdef CONFIG_DEBUG_LOCK_ALLOC
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	unsigned long flags;

	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
	wake_up_locked_poll(wqueue, events);
	spin_unlock_irqrestore(&wqueue->lock, flags);
}
#else
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	wake_up_poll(wqueue, events);
}
#endif

static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
{
	ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
			  1 + call_nests);
	return 0;
}

/*
 * Perform a safe wake up of the poll wait list. The problem is that
 * with the new callback'd wake up system, it is possible that the
 * poll callback is reentered from inside the call to wake_up() done
 * on the poll wait queue head. The rule is that we cannot reenter the
 * wake up code from the same task more than EP_MAX_NESTS times,
 * and we cannot reenter the same wait queue head at all. This allows
 * a hierarchy of epoll file descriptors no more than EP_MAX_NESTS deep.
 */
static void ep_poll_safewake(wait_queue_head_t *wq)
{
	int this_cpu = get_cpu();

	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
		       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);

	put_cpu();
}

/*
 * This function unregisters poll callbacks from the associated file
 * descriptor. Must be called with "mtx" held (or "epmutex" if called from
 * ep_free).
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	struct list_head *lsthead = &epi->pwqlist;
	struct eppoll_entry *pwq;

	while (!list_empty(lsthead)) {
		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);

		list_del(&pwq->llink);
		remove_wait_queue(pwq->whead, &pwq->wait);
		kmem_cache_free(pwq_cache, pwq);
	}
}

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes it possible
 *                      for the scan code to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 *
 * Returns: The same integer error code returned by the @sproc callback.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv)
{
	int error, pwake = 0;
	unsigned long flags;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */
	mutex_lock(&ep->mtx);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while we loop without locks are not lost.
	 * We cannot let the poll callback queue items directly on
	 * ep->rdllist, because we want the "sproc" callback to be able
	 * to do its work in a lockless way.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	list_splice_init(&ep->rdllist, &txlist);
	ep->ovflist = NULL;
	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Now call the callback function.
	 */
	error = (*sproc)(ep, &txlist, priv);

	spin_lock_irqsave(&ep->lock, flags);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(&epi->rdllink))
			list_add_tail(&epi->rdllink, &ep->rdllist);
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	ep->ovflist = EP_UNACTIVE_PTR;

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Wake up (if active) both the eventpoll wait list and
		 * the ->poll() wait list (delayed after we release the lock).
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	mutex_unlock(&ep->mtx);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return error;
}
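
/*
 * Illustrative timeline of the ovflist handling in ep_scan_ready_list()
 * (a sketch, not part of the original source):
 *
 *	1) rdllist is spliced onto the private "txlist"; ovflist = NULL
 *	2) sproc() walks txlist without holding ep->lock; any concurrent
 *	   ep_poll_callback() sees ovflist != EP_UNACTIVE_PTR and chains
 *	   its epitem there instead of touching rdllist
 *	3) ovflist is drained back onto rdllist, ovflist is reset to
 *	   EP_UNACTIVE_PTR, and whatever sproc() left on txlist is
 *	   re-spliced onto rdllist
 */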

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	unsigned long flags;
	struct file *file = epi->ffd.file;

	/*
	 * Removes poll wait queue hooks. We _have_ to do this without holding
	 * the "ep->lock" otherwise a deadlock might occur. This is because of
	 * the lock acquisition order: here we take "ep->lock" and then the
	 * wait queue head lock when unregistering the wait queue. The wakeup
	 * callback runs while holding the wait queue head lock and calls our
	 * callback, which in turn tries to take "ep->lock".
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	if (ep_is_linked(&epi->fllink))
		list_del_init(&epi->fllink);
	spin_unlock(&file->f_lock);

	rb_erase(&epi->rbn, &ep->rbr);

	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	/* At this point it is safe to free the eventpoll item */
	kmem_cache_free(epi_cache, epi);

	atomic_long_dec(&ep->user->epoll_watches);

	return 0;
}

static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* We need to release all tasks waiting on this file */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(&ep->poll_wait);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
	 * We do not need to hold "ep->mtx" here because the epoll file
	 * is on the way to be removed and no one has references to it
	 * anymore. The only hit might come from eventpoll_release_file() but
	 * holding "epmutex" is sufficient here.
	 */
	mutex_lock(&epmutex);

	/*
	 * Walks through the whole tree, unregistering poll callbacks.
	 */
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
	}

	/*
	 * Walks through the whole tree, freeing each "struct epitem". At this
	 * point we are sure no poll callbacks will be lingering around, and
	 * also by holding "epmutex" we can be sure that no file cleanup code
	 * will hit us during this operation. So we can avoid taking "ep->lock".
	 */
	while ((rbp = rb_first(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
	}

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	kfree(ep);
}

static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (ep)
		ep_free(ep);

	return 0;
}

static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct epitem *epi, *tmp;

	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
		    epi->event.events)
			return POLLIN | POLLRDNORM;
		else {
			/*
			 * Item has been dropped into the ready list by the poll
			 * callback, but it's not actually ready, as far as the
			 * caller-requested events go. We can remove it here.
			 */
			list_del_init(&epi->rdllink);
		}
	}

	return 0;
}

static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
{
	return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
}

static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	int pollflags;
	struct eventpoll *ep = file->private_data;

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list.
	 * This needs to be done under ep_call_nested() supervision, since
	 * the call to f_op->poll() done on listed files could re-enter here.
	 */
	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
				   ep_poll_readyevents_proc, ep, ep, current);

	return pollflags != -1 ? pollflags : 0;
}

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};

/* Fast test to see if the file is an eventpoll file */
static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to correctly clean up files that
 * are closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct list_head *lsthead = &file->f_ep_links;
	struct eventpoll *ep;
	struct epitem *epi;

	/*
	 * We don't want to get "file->f_lock" because it is not
	 * necessary. It is not necessary because we're in the "struct file"
	 * cleanup path, and this means that no one is using this file anymore.
	 * So, for example, epoll_ctl() cannot hit here since if we reach this
	 * point, the file counter already went to zero and fget() would fail.
	 * The only hit might come from ep_free(), but holding the mutex
	 * correctly serializes the operation. We do need to acquire
	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
	 * from anywhere but ep_free().
	 *
	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
	 */
	mutex_lock(&epmutex);

	while (!list_empty(lsthead)) {
		epi = list_first_entry(lsthead, struct epitem, fllink);

		ep = epi->ep;
		list_del_init(&epi->fllink);
		mutex_lock(&ep->mtx);
		ep_remove(ep, epi);
		mutex_unlock(&ep->mtx);
	}

	mutex_unlock(&epmutex);
}

static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	ep_set_ffd(&ffd, file, fd);
	for (rbp = ep->rbr.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;
			break;
		}
	}

	return epir;
}

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is
	 * received, until the next EPOLL_CTL_MOD is issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
		}
		goto out_unlock;
	}

	/* If this file is already in the ready list we exit soon */
	if (!ep_is_linked(&epi->rdllink))
		list_add_tail(&epi->rdllink, &ep->rdllist);

	/*
	 * Wake up (if active) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 1;
}

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}
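
/*
 * Illustrative call chain (a sketch, not part of the original source): the
 * queue proc above is wired up when ep_insert(), below, polls the target
 * file through the ep_pqueue wrapper:
 *
 *	ep_insert()
 *	  tfile->f_op->poll(tfile, &epq.pt)
 *	    poll_wait(tfile, whead, &epq.pt)
 *	      ep_ptable_queue_proc(tfile, whead, &epq.pt)
 *	        add_wait_queue(whead, &pwq->wait)
 *
 * From that point on, every wake_up() on "whead" invokes
 * ep_poll_callback() for this epitem.
 */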

static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
	int kcmp;
	struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
	struct epitem *epic;

	while (*p) {
		parent = *p;
		epic = rb_entry(parent, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
		if (kcmp > 0)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&epi->rbn, parent, p);
	rb_insert_color(&epi->rbn, &ep->rbr);
}

/*
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follows here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * to high memory pressure.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hooks for this file */
	spin_lock(&tfile->f_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);

	return error;
}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
	int pwake = 0;
	unsigned int revents;

	/*
	 * Set the new event interest mask before calling f_op->poll();
	 * otherwise we might miss an event that happens between the
	 * f_op->poll() call and the new event set registering.
	 */
	epi->event.events = event->events;
	epi->event.data = event->data; /* protected by mtx */

	/*
	 * Get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function.
	 */
	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);

	/*
	 * If the item is "hot" and it is not registered inside the ready
	 * list, push it inside.
	 */
	if (revents & event->events) {
		spin_lock_irq(&ep->lock);
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up_locked(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		spin_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;
}
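
/*
 * Illustrative interleaving (a sketch, not from the original source) showing
 * why ep_modify() stores the new mask before calling f_op->poll():
 *
 *	CPU0 (ep_modify)		CPU1 (event source)
 *	epi->event.events = new mask
 *					wake_up() -> ep_poll_callback()
 *					key checked against the new mask,
 *					item queued on rdllist
 *	f_op->poll() re-checks state
 *
 * If the mask were updated only after f_op->poll(), the callback on CPU1
 * would test the event against the stale mask and the event could be
 * missed until the next one arrives.
 */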

static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv;
	int eventcnt;
	unsigned int revents;
	struct epitem *epi;
	struct epoll_event __user *uevent;

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop because ep_scan_ready_list() is
	 * holding "mtx" during this call.
	 */
	for (eventcnt = 0, uevent = esed->events;
	     !list_empty(head) && eventcnt < esed->maxevents;) {
		epi = list_first_entry(head, struct epitem, rdllink);

		list_del_init(&epi->rdllink);

		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
			epi->event.events;

		/*
		 * If the event mask intersects the caller-requested one,
		 * deliver the event to userspace. Again, ep_scan_ready_list()
		 * is holding "mtx", so no operations coming from userspace
		 * can change the item.
		 */
		if (revents) {
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				list_add(&epi->rdllink, head);
				return eventcnt ? eventcnt : -EFAULT;
			}
			eventcnt++;
			uevent++;
			if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * If this file has been added with Level
				 * Trigger mode, we need to insert it back
				 * inside the ready list, so that the next
				 * call to epoll_wait() will check the events'
				 * availability again. At this point, no one
				 * can insert into ep->rdllist besides us. The
				 * epoll_ctl() callers are locked out by
				 * ep_scan_ready_list() holding "mtx" and the
				 * poll callback will queue them in ep->ovflist.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
			}
		}
	}

	return eventcnt;
}

static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data esed;

	esed.maxevents = maxevents;
	esed.events = events;

	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
}
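
/*
 * For illustration (not part of the original source), the re-insertion in
 * ep_send_events_proc() is what produces level-triggered behaviour: a
 * descriptor added without EPOLLET is queued back on rdllist right after
 * being reported, so the next epoll_wait() polls it again (and silently
 * drops it if it is no longer ready), while an EPOLLET descriptor is not
 * re-queued until the next wakeup and an EPOLLONESHOT one stays disarmed
 * until EPOLL_CTL_MOD re-enables it.
 */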

static inline struct timespec ep_set_mstimeout(long ms)
{
	struct timespec now, ts = {
		.tv_sec = ms / MSEC_PER_SEC,
		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
	};

	ktime_get_ts(&now);
	return timespec_add_safe(now, ts);
}

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           milliseconds. If the @timeout is zero, the function will not block,
 *           while if the @timeout is less than zero, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Returns: Returns the number of ready events which have been fetched, or an
 *          error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	unsigned long flags;
	long slack = 0;
	wait_queue_t wait;
	ktime_t expires, *to = NULL;

	if (timeout > 0) {
		struct timespec end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
		spin_lock_irqsave(&ep->lock, flags);
		goto check_events;
	}

fetch_events:
	spin_lock_irqsave(&ep->lock, flags);

	if (!ep_events_available(ep)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be woken up by
		 * ep_poll_callback() when events become available.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (ep_events_available(ep) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			spin_unlock_irqrestore(&ep->lock, flags);
			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}
check_events:
	/* Is it worth to try to dig for events ? */
	eavail = ep_events_available(ep);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there is still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}

/**
 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
 *                      API, to verify that adding an epoll file inside another
 *                      epoll structure, does not violate the constraints, in
 *                      terms of closed loops, or too deep chains (which can
 *                      result in excessive stack usage).
 *
 * @priv: Pointer to the epoll file to be currently checked.
 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
 *          data structure pointer.
 * @call_nests: Current depth of the @ep_call_nested() call stack.
 *
 * Returns: Returns zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or -1 otherwise.
 */
static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
{
	int error = 0;
	struct file *file = priv;
	struct eventpoll *ep = file->private_data;
	struct rb_node *rbp;
	struct epitem *epi;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (unlikely(is_file_epoll(epi->ffd.file))) {
			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
					       ep_loop_check_proc, epi->ffd.file,
					       epi->ffd.file->private_data, current);
			if (error != 0)
				break;
		}
	}
	mutex_unlock(&ep->mtx);

	return error;
}

/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
 *                 inside another epoll file (represented by @ep) does not
 *                 create closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll private data structure.
 * @file: Pointer to the epoll file to be checked.
 *
 * Returns: Returns zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or -1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
	return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
			      ep_loop_check_proc, file, ep, current);
}
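
/*
 * Illustrative scenario (not part of the original source): with two epoll
 * fds A and B, EPOLL_CTL_ADD of B into A runs ep_loop_check(A, B), which
 * walks B's tree. If B (directly, or through further nested epoll fds)
 * already watches A, the walk reaches A's own eventpoll, ep_call_nested()
 * sees the repeated cookie and returns -1, and epoll_ctl() fails with
 * -ELOOP instead of creating a cycle. Chains deeper than EP_MAX_NESTS are
 * rejected the same way.
 */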

/*
 * Open an eventpoll file descriptor.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error;
	struct eventpoll *ep = NULL;

	/* Check the EPOLL_* constant for consistency. */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to set up an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (error < 0)
		ep_free(ep);

	return error;
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	return sys_epoll_create1(0);
}

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int did_lock_epmutex = 0;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths.
	 *
	 * We hold epmutex across the loop check and the insert in this case, in
	 * order to prevent two separate inserts from racing and each doing the
	 * insert "at the same time" such that ep_loop_check passes on both
	 * before either one does the insert, thereby creating a cycle.
	 */
	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
		mutex_lock(&epmutex);
		did_lock_epmutex = 1;
		error = -ELOOP;
		if (ep_loop_check(ep, tfile) != 0)
			goto error_tgt_fput;
	}

	mutex_lock(&ep->mtx);

	/*
	 * Try to look up the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tfile, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (unlikely(did_lock_epmutex))
		mutex_unlock(&epmutex);

	fput(tfile);
error_fput:
	fput(file);
error_return:

	return error;
}
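
/*
 * Userspace usage sketch of the interface implemented above and below
 * (for illustration only; error handling omitted, "handle_io" is a
 * hypothetical helper):
 *
 *	int epfd = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = sock };
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev);
 *
 *	struct epoll_event ready[64];
 *	int n = epoll_wait(epfd, ready, 64, -1);	<- blocks until ready
 *	for (int i = 0; i < n; i++)
 *		handle_io(ready[i].data.fd);
 */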
It is the kernel1459* part of the user space epoll_wait(2).1460*/1461SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,1462int, maxevents, int, timeout)1463{1464int error;1465struct file *file;1466struct eventpoll *ep;14671468/* The maximum number of event must be greater than zero */1469if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)1470return -EINVAL;14711472/* Verify that the area passed by the user is writeable */1473if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {1474error = -EFAULT;1475goto error_return;1476}14771478/* Get the "struct file *" for the eventpoll file */1479error = -EBADF;1480file = fget(epfd);1481if (!file)1482goto error_return;14831484/*1485* We have to check that the file structure underneath the fd1486* the user passed to us _is_ an eventpoll file.1487*/1488error = -EINVAL;1489if (!is_file_epoll(file))1490goto error_fput;14911492/*1493* At this point it is safe to assume that the "private_data" contains1494* our own data structure.1495*/1496ep = file->private_data;14971498/* Time to fish for events ... */1499error = ep_poll(ep, events, maxevents, timeout);15001501error_fput:1502fput(file);1503error_return:15041505return error;1506}15071508#ifdef HAVE_SET_RESTORE_SIGMASK15091510/*1511* Implement the event wait interface for the eventpoll file. It is the kernel1512* part of the user space epoll_pwait(2).1513*/1514SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,1515int, maxevents, int, timeout, const sigset_t __user *, sigmask,1516size_t, sigsetsize)1517{1518int error;1519sigset_t ksigmask, sigsaved;15201521/*1522* If the caller wants a certain signal mask to be set during the wait,1523* we apply it here.1524*/1525if (sigmask) {1526if (sigsetsize != sizeof(sigset_t))1527return -EINVAL;1528if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))1529return -EFAULT;1530sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));1531sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);1532}15331534error = sys_epoll_wait(epfd, events, maxevents, timeout);15351536/*1537* If we changed the signal mask, we need to restore the original one.1538* In case we've got a signal while waiting, we do not restore the1539* signal mask yet, and we allow do_signal() to deliver the signal on1540* the way back to userspace, before the signal mask is restored.1541*/1542if (sigmask) {1543if (error == -EINTR) {1544memcpy(¤t->saved_sigmask, &sigsaved,1545sizeof(sigsaved));1546set_restore_sigmask();1547} else1548sigprocmask(SIG_SETMASK, &sigsaved, NULL);1549}15501551return error;1552}15531554#endif /* HAVE_SET_RESTORE_SIGMASK */15551556static int __init eventpoll_init(void)1557{1558struct sysinfo si;15591560si_meminfo(&si);1561/*1562* Allows top 4% of lomem to be allocated for epoll watches (per user).1563*/1564max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /1565EP_ITEM_COST;1566BUG_ON(max_user_watches < 0);15671568/*1569* Initialize the structure used to perform epoll file descriptor1570* inclusion loops checks.1571*/1572ep_nested_calls_init(&poll_loop_ncalls);15731574/* Initialize the structure used to perform safe poll wait head wake ups */1575ep_nested_calls_init(&poll_safewake_ncalls);15761577/* Initialize the structure used to perform file's f_op->poll() calls */1578ep_nested_calls_init(&poll_readywalk_ncalls);15791580/* Allocates slab cache used to allocate "struct epitem" items */1581epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),15820, SLAB_HWCACHE_ALIGN | 

static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allows the top 4% of low memory to be allocated for epoll watches
	 * (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loop checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls);

	/* Initialize the structure used to perform safe poll wait head wake ups */
	ep_nested_calls_init(&poll_safewake_ncalls);

	/* Initialize the structure used to perform file's f_op->poll() calls */
	ep_nested_calls_init(&poll_readywalk_ncalls);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
				      0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
				      sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);

	return 0;
}
fs_initcall(eventpoll_init);