// SPDX-License-Identifier: GPL-2.0-or-later1/*2* fs/eventpoll.c (Efficient event retrieval implementation)3* Copyright (C) 2001,...,2009 Davide Libenzi4*5* Davide Libenzi <[email protected]>6*/78#include <linux/init.h>9#include <linux/kernel.h>10#include <linux/sched/signal.h>11#include <linux/fs.h>12#include <linux/file.h>13#include <linux/signal.h>14#include <linux/errno.h>15#include <linux/mm.h>16#include <linux/slab.h>17#include <linux/poll.h>18#include <linux/string.h>19#include <linux/list.h>20#include <linux/hash.h>21#include <linux/spinlock.h>22#include <linux/syscalls.h>23#include <linux/rbtree.h>24#include <linux/wait.h>25#include <linux/eventpoll.h>26#include <linux/mount.h>27#include <linux/bitops.h>28#include <linux/mutex.h>29#include <linux/anon_inodes.h>30#include <linux/device.h>31#include <linux/uaccess.h>32#include <asm/io.h>33#include <asm/mman.h>34#include <linux/atomic.h>35#include <linux/proc_fs.h>36#include <linux/seq_file.h>37#include <linux/compat.h>38#include <linux/rculist.h>39#include <linux/capability.h>40#include <net/busy_poll.h>4142/*43* LOCKING:44* There are three level of locking required by epoll :45*46* 1) epnested_mutex (mutex)47* 2) ep->mtx (mutex)48* 3) ep->lock (rwlock)49*50* The acquire order is the one listed above, from 1 to 3.51* We need a rwlock (ep->lock) because we manipulate objects52* from inside the poll callback, that might be triggered from53* a wake_up() that in turn might be called from IRQ context.54* So we can't sleep inside the poll callback and hence we need55* a spinlock. During the event transfer loop (from kernel to56* user space) we could end up sleeping due a copy_to_user(), so57* we need a lock that will allow us to sleep. This lock is a58* mutex (ep->mtx). It is acquired during the event transfer loop,59* during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().60* The epnested_mutex is acquired when inserting an epoll fd onto another61* epoll fd. We do this so that we walk the epoll tree and ensure that this62* insertion does not create a cycle of epoll file descriptors, which63* could lead to deadlock. We need a global mutex to prevent two64* simultaneous inserts (A into B and B into A) from racing and65* constructing a cycle without either insert observing that it is66* going to.67* It is necessary to acquire multiple "ep->mtx"es at once in the68* case when one epoll fd is added to another. In this case, we69* always acquire the locks in the order of nesting (i.e. after70* epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired71* before e2->mtx). Since we disallow cycles of epoll file72* descriptors, this ensures that the mutexes are well-ordered. 
In73* order to communicate this nesting to lockdep, when walking a tree74* of epoll file descriptors, we use the current recursion depth as75* the lockdep subkey.76* It is possible to drop the "ep->mtx" and to use the global77* mutex "epnested_mutex" (together with "ep->lock") to have it working,78* but having "ep->mtx" will make the interface more scalable.79* Events that require holding "epnested_mutex" are very rare, while for80* normal operations the epoll private "ep->mtx" will guarantee81* a better scalability.82*/8384/* Epoll private bits inside the event mask */85#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)8687#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)8889#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \90EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)9192/* Maximum number of nesting allowed inside epoll sets */93#define EP_MAX_NESTS 49495#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))9697#define EP_UNACTIVE_PTR ((void *) -1L)9899#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))100101struct epoll_filefd {102struct file *file;103int fd;104} __packed;105106/* Wait structure used by the poll hooks */107struct eppoll_entry {108/* List header used to link this structure to the "struct epitem" */109struct eppoll_entry *next;110111/* The "base" pointer is set to the container "struct epitem" */112struct epitem *base;113114/*115* Wait queue item that will be linked to the target file wait116* queue head.117*/118wait_queue_entry_t wait;119120/* The wait queue head that linked the "wait" wait queue item */121wait_queue_head_t *whead;122};123124/*125* Each file descriptor added to the eventpoll interface will126* have an entry of this type linked to the "rbr" RB tree.127* Avoid increasing the size of this struct, there can be many thousands128* of these on a server and we do not want this to take another cache line.129*/130struct epitem {131union {132/* RB tree node links this structure to the eventpoll RB tree */133struct rb_node rbn;134/* Used to free the struct epitem */135struct rcu_head rcu;136};137138/* List header used to link this structure to the eventpoll ready list */139struct list_head rdllink;140141/*142* Works together "struct eventpoll"->ovflist in keeping the143* single linked chain of items.144*/145struct epitem *next;146147/* The file descriptor information this item refers to */148struct epoll_filefd ffd;149150/*151* Protected by file->f_lock, true for to-be-released epitem already152* removed from the "struct file" items list; together with153* eventpoll->refcount orchestrates "struct eventpoll" disposal154*/155bool dying;156157/* List containing poll wait queues */158struct eppoll_entry *pwqlist;159160/* The "container" of this item */161struct eventpoll *ep;162163/* List header used to link this item to the "struct file" items list */164struct hlist_node fllink;165166/* wakeup_source used when EPOLLWAKEUP is set */167struct wakeup_source __rcu *ws;168169/* The structure that describe the interested events and the source fd */170struct epoll_event event;171};172173/*174* This structure is stored inside the "private_data" member of the file175* structure and represents the main data structure for the eventpoll176* interface.177*/178struct eventpoll {179/*180* This mutex is used to ensure that files are not removed181* while epoll is using them. 
This is held during the event182* collection loop, the file cleanup path, the epoll file exit183* code and the ctl operations.184*/185struct mutex mtx;186187/* Wait queue used by sys_epoll_wait() */188wait_queue_head_t wq;189190/* Wait queue used by file->poll() */191wait_queue_head_t poll_wait;192193/* List of ready file descriptors */194struct list_head rdllist;195196/* Lock which protects rdllist and ovflist */197rwlock_t lock;198199/* RB tree root used to store monitored fd structs */200struct rb_root_cached rbr;201202/*203* This is a single linked list that chains all the "struct epitem" that204* happened while transferring ready events to userspace w/out205* holding ->lock.206*/207struct epitem *ovflist;208209/* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */210struct wakeup_source *ws;211212/* The user that created the eventpoll descriptor */213struct user_struct *user;214215struct file *file;216217/* used to optimize loop detection check */218u64 gen;219struct hlist_head refs;220u8 loop_check_depth;221222/*223* usage count, used together with epitem->dying to224* orchestrate the disposal of this struct225*/226refcount_t refcount;227228#ifdef CONFIG_NET_RX_BUSY_POLL229/* used to track busy poll napi_id */230unsigned int napi_id;231/* busy poll timeout */232u32 busy_poll_usecs;233/* busy poll packet budget */234u16 busy_poll_budget;235bool prefer_busy_poll;236#endif237238#ifdef CONFIG_DEBUG_LOCK_ALLOC239/* tracks wakeup nests for lockdep validation */240u8 nests;241#endif242};243244/* Wrapper struct used by poll queueing */245struct ep_pqueue {246poll_table pt;247struct epitem *epi;248};249250/*251* Configuration options available inside /proc/sys/fs/epoll/252*/253/* Maximum number of epoll watched descriptors, per user */254static long max_user_watches __read_mostly;255256/* Used for cycles detection */257static DEFINE_MUTEX(epnested_mutex);258259static u64 loop_check_gen = 0;260261/* Used to check for epoll file descriptor inclusion loops */262static struct eventpoll *inserting_into;263264/* Slab cache used to allocate "struct epitem" */265static struct kmem_cache *epi_cache __ro_after_init;266267/* Slab cache used to allocate "struct eppoll_entry" */268static struct kmem_cache *pwq_cache __ro_after_init;269270/*271* List of files with newly added links, where we may need to limit the number272* of emanating paths. 
Protected by the epnested_mutex.273*/274struct epitems_head {275struct hlist_head epitems;276struct epitems_head *next;277};278static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;279280static struct kmem_cache *ephead_cache __ro_after_init;281282static inline void free_ephead(struct epitems_head *head)283{284if (head)285kmem_cache_free(ephead_cache, head);286}287288static void list_file(struct file *file)289{290struct epitems_head *head;291292head = container_of(file->f_ep, struct epitems_head, epitems);293if (!head->next) {294head->next = tfile_check_list;295tfile_check_list = head;296}297}298299static void unlist_file(struct epitems_head *head)300{301struct epitems_head *to_free = head;302struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));303if (p) {304struct epitem *epi= container_of(p, struct epitem, fllink);305spin_lock(&epi->ffd.file->f_lock);306if (!hlist_empty(&head->epitems))307to_free = NULL;308head->next = NULL;309spin_unlock(&epi->ffd.file->f_lock);310}311free_ephead(to_free);312}313314#ifdef CONFIG_SYSCTL315316#include <linux/sysctl.h>317318static long long_zero;319static long long_max = LONG_MAX;320321static const struct ctl_table epoll_table[] = {322{323.procname = "max_user_watches",324.data = &max_user_watches,325.maxlen = sizeof(max_user_watches),326.mode = 0644,327.proc_handler = proc_doulongvec_minmax,328.extra1 = &long_zero,329.extra2 = &long_max,330},331};332333static void __init epoll_sysctls_init(void)334{335register_sysctl("fs/epoll", epoll_table);336}337#else338#define epoll_sysctls_init() do { } while (0)339#endif /* CONFIG_SYSCTL */340341static const struct file_operations eventpoll_fops;342343static inline int is_file_epoll(struct file *f)344{345return f->f_op == &eventpoll_fops;346}347348/* Setup the structure that is used as key for the RB tree */349static inline void ep_set_ffd(struct epoll_filefd *ffd,350struct file *file, int fd)351{352ffd->file = file;353ffd->fd = fd;354}355356/* Compare RB tree keys */357static inline int ep_cmp_ffd(struct epoll_filefd *p1,358struct epoll_filefd *p2)359{360return (p1->file > p2->file ? +1:361(p1->file < p2->file ? -1 : p1->fd - p2->fd));362}363364/* Tells us if the item is currently linked */365static inline int ep_is_linked(struct epitem *epi)366{367return !list_empty(&epi->rdllink);368}369370static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)371{372return container_of(p, struct eppoll_entry, wait);373}374375/* Get the "struct epitem" from a wait queue pointer */376static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)377{378return container_of(p, struct eppoll_entry, wait)->base;379}380381/**382* ep_events_available - Checks if ready events might be available.383*384* @ep: Pointer to the eventpoll context.385*386* Return: a value different than %zero if ready events are available,387* or %zero otherwise.388*/389static inline int ep_events_available(struct eventpoll *ep)390{391return !list_empty_careful(&ep->rdllist) ||392READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;393}394395#ifdef CONFIG_NET_RX_BUSY_POLL396/**397* busy_loop_ep_timeout - check if busy poll has timed out. 
The timeout value398* from the epoll instance ep is preferred, but if it is not set fallback to399* the system-wide global via busy_loop_timeout.400*401* @start_time: The start time used to compute the remaining time until timeout.402* @ep: Pointer to the eventpoll context.403*404* Return: true if the timeout has expired, false otherwise.405*/406static bool busy_loop_ep_timeout(unsigned long start_time,407struct eventpoll *ep)408{409unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);410411if (bp_usec) {412unsigned long end_time = start_time + bp_usec;413unsigned long now = busy_loop_current_time();414415return time_after(now, end_time);416} else {417return busy_loop_timeout(start_time);418}419}420421static bool ep_busy_loop_on(struct eventpoll *ep)422{423return !!READ_ONCE(ep->busy_poll_usecs) ||424READ_ONCE(ep->prefer_busy_poll) ||425net_busy_loop_on();426}427428static bool ep_busy_loop_end(void *p, unsigned long start_time)429{430struct eventpoll *ep = p;431432return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);433}434435/*436* Busy poll if globally on and supporting sockets found && no events,437* busy loop will return if need_resched or ep_events_available.438*439* we must do our busy polling with irqs enabled440*/441static bool ep_busy_loop(struct eventpoll *ep)442{443unsigned int napi_id = READ_ONCE(ep->napi_id);444u16 budget = READ_ONCE(ep->busy_poll_budget);445bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);446447if (!budget)448budget = BUSY_POLL_BUDGET;449450if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) {451napi_busy_loop(napi_id, ep_busy_loop_end,452ep, prefer_busy_poll, budget);453if (ep_events_available(ep))454return true;455/*456* Busy poll timed out. Drop NAPI ID for now, we can add457* it back in when we have moved a socket with a valid NAPI458* ID onto the ready list.459*/460if (prefer_busy_poll)461napi_resume_irqs(napi_id);462ep->napi_id = 0;463return false;464}465return false;466}467468/*469* Set epoll busy poll NAPI ID from sk.470*/471static inline void ep_set_busy_poll_napi_id(struct epitem *epi)472{473struct eventpoll *ep = epi->ep;474unsigned int napi_id;475struct socket *sock;476struct sock *sk;477478if (!ep_busy_loop_on(ep))479return;480481sock = sock_from_file(epi->ffd.file);482if (!sock)483return;484485sk = sock->sk;486if (!sk)487return;488489napi_id = READ_ONCE(sk->sk_napi_id);490491/* Non-NAPI IDs can be rejected492* or493* Nothing to do if we already have this ID494*/495if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)496return;497498/* record NAPI ID for use in next busy poll */499ep->napi_id = napi_id;500}501502static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,503unsigned long arg)504{505struct eventpoll *ep = file->private_data;506void __user *uarg = (void __user *)arg;507struct epoll_params epoll_params;508509switch (cmd) {510case EPIOCSPARAMS:511if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))512return -EFAULT;513514/* pad byte must be zero */515if (epoll_params.__pad)516return -EINVAL;517518if (epoll_params.busy_poll_usecs > S32_MAX)519return -EINVAL;520521if (epoll_params.prefer_busy_poll > 1)522return -EINVAL;523524if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&525!capable(CAP_NET_ADMIN))526return -EPERM;527528WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);529WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);530WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);531return 0;532case EPIOCGPARAMS:533memset(&epoll_params, 0, 
sizeof(epoll_params));534epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);535epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);536epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);537if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))538return -EFAULT;539return 0;540default:541return -ENOIOCTLCMD;542}543}544545static void ep_suspend_napi_irqs(struct eventpoll *ep)546{547unsigned int napi_id = READ_ONCE(ep->napi_id);548549if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))550napi_suspend_irqs(napi_id);551}552553static void ep_resume_napi_irqs(struct eventpoll *ep)554{555unsigned int napi_id = READ_ONCE(ep->napi_id);556557if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))558napi_resume_irqs(napi_id);559}560561#else562563static inline bool ep_busy_loop(struct eventpoll *ep)564{565return false;566}567568static inline void ep_set_busy_poll_napi_id(struct epitem *epi)569{570}571572static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,573unsigned long arg)574{575return -EOPNOTSUPP;576}577578static void ep_suspend_napi_irqs(struct eventpoll *ep)579{580}581582static void ep_resume_napi_irqs(struct eventpoll *ep)583{584}585586#endif /* CONFIG_NET_RX_BUSY_POLL */587588/*589* As described in commit 0ccf831cb lockdep: annotate epoll590* the use of wait queues used by epoll is done in a very controlled591* manner. Wake ups can nest inside each other, but are never done592* with the same locking. For example:593*594* dfd = socket(...);595* efd1 = epoll_create();596* efd2 = epoll_create();597* epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);598* epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);599*600* When a packet arrives to the device underneath "dfd", the net code will601* issue a wake_up() on its poll wake list. Epoll (efd1) has installed a602* callback wakeup entry on that queue, and the wake_up() performed by the603* "dfd" net code will end up in ep_poll_callback(). At this point epoll604* (efd1) notices that it may have some event ready, so it needs to wake up605* the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()606* that ends up in another wake_up(), after having checked about the607* recursion constraints. That are, no more than EP_MAX_NESTS, to avoid608* stack blasting.609*610* When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle611* this special case of epoll.612*/613#ifdef CONFIG_DEBUG_LOCK_ALLOC614615static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,616unsigned pollflags)617{618struct eventpoll *ep_src;619unsigned long flags;620u8 nests = 0;621622/*623* To set the subclass or nesting level for spin_lock_irqsave_nested()624* it might be natural to create a per-cpu nest count. However, since625* we can recurse on ep->poll_wait.lock, and a non-raw spinlock can626* schedule() in the -rt kernel, the per-cpu variable are no longer627* protected. Thus, we are introducing a per eventpoll nest field.628* If we are not being call from ep_poll_callback(), epi is NULL and629* we are at the first level of nesting, 0. Otherwise, we are being630* called from ep_poll_callback() and if a previous wakeup source is631* not an epoll file itself, we are at depth 1 since the wakeup source632* is depth 0. If the wakeup source is a previous epoll file in the633* wakeup chain then we use its nests value and record ours as634* nests + 1. 
The previous epoll file nests value is stable since its635* already holding its own poll_wait.lock.636*/637if (epi) {638if ((is_file_epoll(epi->ffd.file))) {639ep_src = epi->ffd.file->private_data;640nests = ep_src->nests;641} else {642nests = 1;643}644}645spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);646ep->nests = nests + 1;647wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);648ep->nests = 0;649spin_unlock_irqrestore(&ep->poll_wait.lock, flags);650}651652#else653654static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,655__poll_t pollflags)656{657wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);658}659660#endif661662static void ep_remove_wait_queue(struct eppoll_entry *pwq)663{664wait_queue_head_t *whead;665666rcu_read_lock();667/*668* If it is cleared by POLLFREE, it should be rcu-safe.669* If we read NULL we need a barrier paired with670* smp_store_release() in ep_poll_callback(), otherwise671* we rely on whead->lock.672*/673whead = smp_load_acquire(&pwq->whead);674if (whead)675remove_wait_queue(whead, &pwq->wait);676rcu_read_unlock();677}678679/*680* This function unregisters poll callbacks from the associated file681* descriptor. Must be called with "mtx" held.682*/683static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)684{685struct eppoll_entry **p = &epi->pwqlist;686struct eppoll_entry *pwq;687688while ((pwq = *p) != NULL) {689*p = pwq->next;690ep_remove_wait_queue(pwq);691kmem_cache_free(pwq_cache, pwq);692}693}694695/* call only when ep->mtx is held */696static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)697{698return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));699}700701/* call only when ep->mtx is held */702static inline void ep_pm_stay_awake(struct epitem *epi)703{704struct wakeup_source *ws = ep_wakeup_source(epi);705706if (ws)707__pm_stay_awake(ws);708}709710static inline bool ep_has_wakeup_source(struct epitem *epi)711{712return rcu_access_pointer(epi->ws) ? true : false;713}714715/* call when ep->mtx cannot be held (ep_poll_callback) */716static inline void ep_pm_stay_awake_rcu(struct epitem *epi)717{718struct wakeup_source *ws;719720rcu_read_lock();721ws = rcu_dereference(epi->ws);722if (ws)723__pm_stay_awake(ws);724rcu_read_unlock();725}726727728/*729* ep->mutex needs to be held because we could be hit by730* eventpoll_release_file() and epoll_ctl().731*/732static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)733{734/*735* Steal the ready list, and re-init the original one to the736* empty list. Also, set ep->ovflist to NULL so that events737* happening while looping w/out locks, are not lost. 
We cannot738* have the poll callback to queue directly on ep->rdllist,739* because we want the "sproc" callback to be able to do it740* in a lockless way.741*/742lockdep_assert_irqs_enabled();743write_lock_irq(&ep->lock);744list_splice_init(&ep->rdllist, txlist);745WRITE_ONCE(ep->ovflist, NULL);746write_unlock_irq(&ep->lock);747}748749static void ep_done_scan(struct eventpoll *ep,750struct list_head *txlist)751{752struct epitem *epi, *nepi;753754write_lock_irq(&ep->lock);755/*756* During the time we spent inside the "sproc" callback, some757* other events might have been queued by the poll callback.758* We re-insert them inside the main ready-list here.759*/760for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;761nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {762/*763* We need to check if the item is already in the list.764* During the "sproc" callback execution time, items are765* queued into ->ovflist but the "txlist" might already766* contain them, and the list_splice() below takes care of them.767*/768if (!ep_is_linked(epi)) {769/*770* ->ovflist is LIFO, so we have to reverse it in order771* to keep in FIFO.772*/773list_add(&epi->rdllink, &ep->rdllist);774ep_pm_stay_awake(epi);775}776}777/*778* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after779* releasing the lock, events will be queued in the normal way inside780* ep->rdllist.781*/782WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);783784/*785* Quickly re-inject items left on "txlist".786*/787list_splice(txlist, &ep->rdllist);788__pm_relax(ep->ws);789790if (!list_empty(&ep->rdllist)) {791if (waitqueue_active(&ep->wq))792wake_up(&ep->wq);793}794795write_unlock_irq(&ep->lock);796}797798static void ep_get(struct eventpoll *ep)799{800refcount_inc(&ep->refcount);801}802803/*804* Returns true if the event poll can be disposed805*/806static bool ep_refcount_dec_and_test(struct eventpoll *ep)807{808if (!refcount_dec_and_test(&ep->refcount))809return false;810811WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));812return true;813}814815static void ep_free(struct eventpoll *ep)816{817ep_resume_napi_irqs(ep);818mutex_destroy(&ep->mtx);819free_uid(ep->user);820wakeup_source_unregister(ep->ws);821kfree(ep);822}823824/*825* Removes a "struct epitem" from the eventpoll RB tree and deallocates826* all the associated resources. Must be called with "mtx" held.827* If the dying flag is set, do the removal only if force is true.828* This prevents ep_clear_and_put() from dropping all the ep references829* while running concurrently with eventpoll_release_file().830* Returns true if the eventpoll can be disposed.831*/832static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)833{834struct file *file = epi->ffd.file;835struct epitems_head *to_free;836struct hlist_head *head;837838lockdep_assert_irqs_enabled();839840/*841* Removes poll wait queue hooks.842*/843ep_unregister_pollwait(ep, epi);844845/* Remove the current item from the list of epoll hooks */846spin_lock(&file->f_lock);847if (epi->dying && !force) {848spin_unlock(&file->f_lock);849return false;850}851852to_free = NULL;853head = file->f_ep;854if (head->first == &epi->fllink && !epi->fllink.next) {855/* See eventpoll_release() for details. 
*/856WRITE_ONCE(file->f_ep, NULL);857if (!is_file_epoll(file)) {858struct epitems_head *v;859v = container_of(head, struct epitems_head, epitems);860if (!smp_load_acquire(&v->next))861to_free = v;862}863}864hlist_del_rcu(&epi->fllink);865spin_unlock(&file->f_lock);866free_ephead(to_free);867868rb_erase_cached(&epi->rbn, &ep->rbr);869870write_lock_irq(&ep->lock);871if (ep_is_linked(epi))872list_del_init(&epi->rdllink);873write_unlock_irq(&ep->lock);874875wakeup_source_unregister(ep_wakeup_source(epi));876/*877* At this point it is safe to free the eventpoll item. Use the union878* field epi->rcu, since we are trying to minimize the size of879* 'struct epitem'. The 'rbn' field is no longer in use. Protected by880* ep->mtx. The rcu read side, reverse_path_check_proc(), does not make881* use of the rbn field.882*/883kfree_rcu(epi, rcu);884885percpu_counter_dec(&ep->user->epoll_watches);886return true;887}888889/*890* ep_remove variant for callers owing an additional reference to the ep891*/892static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)893{894if (__ep_remove(ep, epi, false))895WARN_ON_ONCE(ep_refcount_dec_and_test(ep));896}897898static void ep_clear_and_put(struct eventpoll *ep)899{900struct rb_node *rbp, *next;901struct epitem *epi;902903/* We need to release all tasks waiting for these file */904if (waitqueue_active(&ep->poll_wait))905ep_poll_safewake(ep, NULL, 0);906907mutex_lock(&ep->mtx);908909/*910* Walks through the whole tree by unregistering poll callbacks.911*/912for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {913epi = rb_entry(rbp, struct epitem, rbn);914915ep_unregister_pollwait(ep, epi);916cond_resched();917}918919/*920* Walks through the whole tree and try to free each "struct epitem".921* Note that ep_remove_safe() will not remove the epitem in case of a922* racing eventpoll_release_file(); the latter will do the removal.923* At this point we are sure no poll callbacks will be lingering around.924* Since we still own a reference to the eventpoll struct, the loop can't925* dispose it.926*/927for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {928next = rb_next(rbp);929epi = rb_entry(rbp, struct epitem, rbn);930ep_remove_safe(ep, epi);931cond_resched();932}933934mutex_unlock(&ep->mtx);935if (ep_refcount_dec_and_test(ep))936ep_free(ep);937}938939static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,940unsigned long arg)941{942int ret;943944if (!is_file_epoll(file))945return -EINVAL;946947switch (cmd) {948case EPIOCSPARAMS:949case EPIOCGPARAMS:950ret = ep_eventpoll_bp_ioctl(file, cmd, arg);951break;952default:953ret = -EINVAL;954break;955}956957return ret;958}959960static int ep_eventpoll_release(struct inode *inode, struct file *file)961{962struct eventpoll *ep = file->private_data;963964if (ep)965ep_clear_and_put(ep);966967return 0;968}969970static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);971972static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)973{974struct eventpoll *ep = file->private_data;975LIST_HEAD(txlist);976struct epitem *epi, *tmp;977poll_table pt;978__poll_t res = 0;979980init_poll_funcptr(&pt, NULL);981982/* Insert inside our poll wait queue */983poll_wait(file, &ep->poll_wait, wait);984985/*986* Proceed to find out if wanted events are really available inside987* the ready list.988*/989mutex_lock_nested(&ep->mtx, depth);990ep_start_scan(ep, &txlist);991list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {992if (ep_item_poll(epi, &pt, depth + 1)) 
{993res = EPOLLIN | EPOLLRDNORM;994break;995} else {996/*997* Item has been dropped into the ready list by the poll998* callback, but it's not actually ready, as far as999* caller requested events goes. We can remove it here.1000*/1001__pm_relax(ep_wakeup_source(epi));1002list_del_init(&epi->rdllink);1003}1004}1005ep_done_scan(ep, &txlist);1006mutex_unlock(&ep->mtx);1007return res;1008}10091010/*1011* The ffd.file pointer may be in the process of being torn down due to1012* being closed, but we may not have finished eventpoll_release() yet.1013*1014* Normally, even with the atomic_long_inc_not_zero, the file may have1015* been free'd and then gotten re-allocated to something else (since1016* files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).1017*1018* But for epoll, users hold the ep->mtx mutex, and as such any file in1019* the process of being free'd will block in eventpoll_release_file()1020* and thus the underlying file allocation will not be free'd, and the1021* file re-use cannot happen.1022*1023* For the same reason we can avoid a rcu_read_lock() around the1024* operation - 'ffd.file' cannot go away even if the refcount has1025* reached zero (but we must still not call out to ->poll() functions1026* etc).1027*/1028static struct file *epi_fget(const struct epitem *epi)1029{1030struct file *file;10311032file = epi->ffd.file;1033if (!file_ref_get(&file->f_ref))1034file = NULL;1035return file;1036}10371038/*1039* Differs from ep_eventpoll_poll() in that internal callers already have1040* the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()1041* is correctly annotated.1042*/1043static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,1044int depth)1045{1046struct file *file = epi_fget(epi);1047__poll_t res;10481049/*1050* We could return EPOLLERR | EPOLLHUP or something, but let's1051* treat this more as "file doesn't exist, poll didn't happen".1052*/1053if (!file)1054return 0;10551056pt->_key = epi->event.events;1057if (!is_file_epoll(file))1058res = vfs_poll(file, pt);1059else1060res = __ep_eventpoll_poll(file, pt, depth);1061fput(file);1062return res & epi->event.events;1063}10641065static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)1066{1067return __ep_eventpoll_poll(file, wait, 0);1068}10691070#ifdef CONFIG_PROC_FS1071static void ep_show_fdinfo(struct seq_file *m, struct file *f)1072{1073struct eventpoll *ep = f->private_data;1074struct rb_node *rbp;10751076mutex_lock(&ep->mtx);1077for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {1078struct epitem *epi = rb_entry(rbp, struct epitem, rbn);1079struct inode *inode = file_inode(epi->ffd.file);10801081seq_printf(m, "tfd: %8d events: %8x data: %16llx "1082" pos:%lli ino:%lx sdev:%x\n",1083epi->ffd.fd, epi->event.events,1084(long long)epi->event.data,1085(long long)epi->ffd.file->f_pos,1086inode->i_ino, inode->i_sb->s_dev);1087if (seq_has_overflowed(m))1088break;1089}1090mutex_unlock(&ep->mtx);1091}1092#endif10931094/* File callbacks that implement the eventpoll file behaviour */1095static const struct file_operations eventpoll_fops = {1096#ifdef CONFIG_PROC_FS1097.show_fdinfo = ep_show_fdinfo,1098#endif1099.release = ep_eventpoll_release,1100.poll = ep_eventpoll_poll,1101.llseek = noop_llseek,1102.unlocked_ioctl = ep_eventpoll_ioctl,1103.compat_ioctl = compat_ptr_ioctl,1104};11051106/*1107* This is called from eventpoll_release() to unlink files from the eventpoll1108* interface. 
We need to have this facility to cleanup correctly files that are1109* closed without being removed from the eventpoll interface.1110*/1111void eventpoll_release_file(struct file *file)1112{1113struct eventpoll *ep;1114struct epitem *epi;1115bool dispose;11161117/*1118* Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from1119* touching the epitems list before eventpoll_release_file() can access1120* the ep->mtx.1121*/1122again:1123spin_lock(&file->f_lock);1124if (file->f_ep && file->f_ep->first) {1125epi = hlist_entry(file->f_ep->first, struct epitem, fllink);1126epi->dying = true;1127spin_unlock(&file->f_lock);11281129/*1130* ep access is safe as we still own a reference to the ep1131* struct1132*/1133ep = epi->ep;1134mutex_lock(&ep->mtx);1135dispose = __ep_remove(ep, epi, true);1136mutex_unlock(&ep->mtx);11371138if (dispose && ep_refcount_dec_and_test(ep))1139ep_free(ep);1140goto again;1141}1142spin_unlock(&file->f_lock);1143}11441145static int ep_alloc(struct eventpoll **pep)1146{1147struct eventpoll *ep;11481149ep = kzalloc(sizeof(*ep), GFP_KERNEL);1150if (unlikely(!ep))1151return -ENOMEM;11521153mutex_init(&ep->mtx);1154rwlock_init(&ep->lock);1155init_waitqueue_head(&ep->wq);1156init_waitqueue_head(&ep->poll_wait);1157INIT_LIST_HEAD(&ep->rdllist);1158ep->rbr = RB_ROOT_CACHED;1159ep->ovflist = EP_UNACTIVE_PTR;1160ep->user = get_current_user();1161refcount_set(&ep->refcount, 1);11621163*pep = ep;11641165return 0;1166}11671168/*1169* Search the file inside the eventpoll tree. The RB tree operations1170* are protected by the "mtx" mutex, and ep_find() must be called with1171* "mtx" held.1172*/1173static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)1174{1175int kcmp;1176struct rb_node *rbp;1177struct epitem *epi, *epir = NULL;1178struct epoll_filefd ffd;11791180ep_set_ffd(&ffd, file, fd);1181for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {1182epi = rb_entry(rbp, struct epitem, rbn);1183kcmp = ep_cmp_ffd(&ffd, &epi->ffd);1184if (kcmp > 0)1185rbp = rbp->rb_right;1186else if (kcmp < 0)1187rbp = rbp->rb_left;1188else {1189epir = epi;1190break;1191}1192}11931194return epir;1195}11961197#ifdef CONFIG_KCMP1198static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)1199{1200struct rb_node *rbp;1201struct epitem *epi;12021203for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {1204epi = rb_entry(rbp, struct epitem, rbn);1205if (epi->ffd.fd == tfd) {1206if (toff == 0)1207return epi;1208else1209toff--;1210}1211cond_resched();1212}12131214return NULL;1215}12161217struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,1218unsigned long toff)1219{1220struct file *file_raw;1221struct eventpoll *ep;1222struct epitem *epi;12231224if (!is_file_epoll(file))1225return ERR_PTR(-EINVAL);12261227ep = file->private_data;12281229mutex_lock(&ep->mtx);1230epi = ep_find_tfd(ep, tfd, toff);1231if (epi)1232file_raw = epi->ffd.file;1233else1234file_raw = ERR_PTR(-ENOENT);1235mutex_unlock(&ep->mtx);12361237return file_raw;1238}1239#endif /* CONFIG_KCMP */12401241/*1242* Adds a new entry to the tail of the list in a lockless way, i.e.1243* multiple CPUs are allowed to call this function concurrently.1244*1245* Beware: it is necessary to prevent any other modifications of the1246* existing list until all changes are completed, in other words1247* concurrent list_add_tail_lockless() calls should be protected1248* with a read lock, where write lock acts as a barrier which1249* makes sure all list_add_tail_lockless() calls are fully1250* 
completed.1251*1252* Also an element can be locklessly added to the list only in one1253* direction i.e. either to the tail or to the head, otherwise1254* concurrent access will corrupt the list.1255*1256* Return: %false if element has been already added to the list, %true1257* otherwise.1258*/1259static inline bool list_add_tail_lockless(struct list_head *new,1260struct list_head *head)1261{1262struct list_head *prev;12631264/*1265* This is simple 'new->next = head' operation, but cmpxchg()1266* is used in order to detect that same element has been just1267* added to the list from another CPU: the winner observes1268* new->next == new.1269*/1270if (!try_cmpxchg(&new->next, &new, head))1271return false;12721273/*1274* Initially ->next of a new element must be updated with the head1275* (we are inserting to the tail) and only then pointers are atomically1276* exchanged. XCHG guarantees memory ordering, thus ->next should be1277* updated before pointers are actually swapped and pointers are1278* swapped before prev->next is updated.1279*/12801281prev = xchg(&head->prev, new);12821283/*1284* It is safe to modify prev->next and new->prev, because a new element1285* is added only to the tail and new->next is updated before XCHG.1286*/12871288prev->next = new;1289new->prev = prev;12901291return true;1292}12931294/*1295* Chains a new epi entry to the tail of the ep->ovflist in a lockless way,1296* i.e. multiple CPUs are allowed to call this function concurrently.1297*1298* Return: %false if epi element has been already chained, %true otherwise.1299*/1300static inline bool chain_epi_lockless(struct epitem *epi)1301{1302struct eventpoll *ep = epi->ep;13031304/* Fast preliminary check */1305if (epi->next != EP_UNACTIVE_PTR)1306return false;13071308/* Check that the same epi has not been just chained from another CPU */1309if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)1310return false;13111312/* Atomically exchange tail */1313epi->next = xchg(&ep->ovflist, epi);13141315return true;1316}13171318/*1319* This is the callback that is passed to the wait queue wakeup1320* mechanism. It is called by the stored file descriptors when they1321* have events to report.1322*1323* This callback takes a read lock in order not to contend with concurrent1324* events from another file descriptor, thus all modifications to ->rdllist1325* or ->ovflist are lockless. Read lock is paired with the write lock from1326* ep_start/done_scan(), which stops all list modifications and guarantees1327* that lists state is seen correctly.1328*1329* Another thing worth to mention is that ep_poll_callback() can be called1330* concurrently for the same @epi from different CPUs if poll table was inited1331* with several wait queues entries. Plural wakeup from different CPUs of a1332* single wait queue is serialized by wq.lock, but the case when multiple wait1333* queues are used should be detected accordingly. This is detected using1334* cmpxchg() operation.1335*/1336static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)1337{1338int pwake = 0;1339struct epitem *epi = ep_item_from_wait(wait);1340struct eventpoll *ep = epi->ep;1341__poll_t pollflags = key_to_poll(key);1342unsigned long flags;1343int ewake = 0;13441345read_lock_irqsave(&ep->lock, flags);13461347ep_set_busy_poll_napi_id(epi);13481349/*1350* If the event mask does not contain any poll(2) event, we consider the1351* descriptor to be disabled. 
This condition is likely the effect of the1352* EPOLLONESHOT bit that disables the descriptor when an event is received,1353* until the next EPOLL_CTL_MOD will be issued.1354*/1355if (!(epi->event.events & ~EP_PRIVATE_BITS))1356goto out_unlock;13571358/*1359* Check the events coming with the callback. At this stage, not1360* every device reports the events in the "key" parameter of the1361* callback. We need to be able to handle both cases here, hence the1362* test for "key" != NULL before the event match test.1363*/1364if (pollflags && !(pollflags & epi->event.events))1365goto out_unlock;13661367/*1368* If we are transferring events to userspace, we can hold no locks1369* (because we're accessing user memory, and because of linux f_op->poll()1370* semantics). All the events that happen during that period of time are1371* chained in ep->ovflist and requeued later on.1372*/1373if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {1374if (chain_epi_lockless(epi))1375ep_pm_stay_awake_rcu(epi);1376} else if (!ep_is_linked(epi)) {1377/* In the usual case, add event to ready list. */1378if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))1379ep_pm_stay_awake_rcu(epi);1380}13811382/*1383* Wake up ( if active ) both the eventpoll wait list and the ->poll()1384* wait list.1385*/1386if (waitqueue_active(&ep->wq)) {1387if ((epi->event.events & EPOLLEXCLUSIVE) &&1388!(pollflags & POLLFREE)) {1389switch (pollflags & EPOLLINOUT_BITS) {1390case EPOLLIN:1391if (epi->event.events & EPOLLIN)1392ewake = 1;1393break;1394case EPOLLOUT:1395if (epi->event.events & EPOLLOUT)1396ewake = 1;1397break;1398case 0:1399ewake = 1;1400break;1401}1402}1403if (sync)1404wake_up_sync(&ep->wq);1405else1406wake_up(&ep->wq);1407}1408if (waitqueue_active(&ep->poll_wait))1409pwake++;14101411out_unlock:1412read_unlock_irqrestore(&ep->lock, flags);14131414/* We have to call this outside the lock */1415if (pwake)1416ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);14171418if (!(epi->event.events & EPOLLEXCLUSIVE))1419ewake = 1;14201421if (pollflags & POLLFREE) {1422/*1423* If we race with ep_remove_wait_queue() it can miss1424* ->whead = NULL and do another remove_wait_queue() after1425* us, so we can't use __remove_wait_queue().1426*/1427list_del_init(&wait->entry);1428/*1429* ->whead != NULL protects us from the race with1430* ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()1431* takes whead->lock held by the caller. 
Once we nullify it,1432* nothing protects ep/epi or even wait.1433*/1434smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);1435}14361437return ewake;1438}14391440/*1441* This is the callback that is used to add our wait queue to the1442* target file wakeup lists.1443*/1444static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,1445poll_table *pt)1446{1447struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);1448struct epitem *epi = epq->epi;1449struct eppoll_entry *pwq;14501451if (unlikely(!epi)) // an earlier allocation has failed1452return;14531454pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);1455if (unlikely(!pwq)) {1456epq->epi = NULL;1457return;1458}14591460init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);1461pwq->whead = whead;1462pwq->base = epi;1463if (epi->event.events & EPOLLEXCLUSIVE)1464add_wait_queue_exclusive(whead, &pwq->wait);1465else1466add_wait_queue(whead, &pwq->wait);1467pwq->next = epi->pwqlist;1468epi->pwqlist = pwq;1469}14701471static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)1472{1473int kcmp;1474struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;1475struct epitem *epic;1476bool leftmost = true;14771478while (*p) {1479parent = *p;1480epic = rb_entry(parent, struct epitem, rbn);1481kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);1482if (kcmp > 0) {1483p = &parent->rb_right;1484leftmost = false;1485} else1486p = &parent->rb_left;1487}1488rb_link_node(&epi->rbn, parent, p);1489rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);1490}1491149214931494#define PATH_ARR_SIZE 51495/*1496* These are the number paths of length 1 to 5, that we are allowing to emanate1497* from a single file of interest. For example, we allow 1000 paths of length1498* 1, to emanate from each file of interest. This essentially represents the1499* potential wakeup paths, which need to be limited in order to avoid massive1500* uncontrolled wakeup storms. The common use case should be a single ep which1501* is connected to n file sources. In this case each file source has 1 path1502* of length 1. Thus, the numbers below should be more than sufficient. These1503* path limits are enforced during an EPOLL_CTL_ADD operation, since a modify1504* and delete can't add additional paths. Protected by the epnested_mutex.1505*/1506static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };1507static int path_count[PATH_ARR_SIZE];15081509static int path_count_inc(int nests)1510{1511/* Allow an arbitrary number of depth 1 paths */1512if (nests == 0)1513return 0;15141515if (++path_count[nests] > path_limits[nests])1516return -1;1517return 0;1518}15191520static void path_count_init(void)1521{1522int i;15231524for (i = 0; i < PATH_ARR_SIZE; i++)1525path_count[i] = 0;1526}15271528static int reverse_path_check_proc(struct hlist_head *refs, int depth)1529{1530int error = 0;1531struct epitem *epi;15321533if (depth > EP_MAX_NESTS) /* too deep nesting */1534return -1;15351536/* CTL_DEL can remove links here, but that can't increase our count */1537hlist_for_each_entry_rcu(epi, refs, fllink) {1538struct hlist_head *refs = &epi->ep->refs;1539if (hlist_empty(refs))1540error = path_count_inc(depth);1541else1542error = reverse_path_check_proc(refs, depth + 1);1543if (error != 0)1544break;1545}1546return error;1547}15481549/**1550* reverse_path_check - The tfile_check_list is list of epitem_head, which have1551* links that are proposed to be newly added. 
We need to1552* make sure that those added links don't add too many1553* paths such that we will spend all our time waking up1554* eventpoll objects.1555*1556* Return: %zero if the proposed links don't create too many paths,1557* %-1 otherwise.1558*/1559static int reverse_path_check(void)1560{1561struct epitems_head *p;15621563for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {1564int error;1565path_count_init();1566rcu_read_lock();1567error = reverse_path_check_proc(&p->epitems, 0);1568rcu_read_unlock();1569if (error)1570return error;1571}1572return 0;1573}15741575static int ep_create_wakeup_source(struct epitem *epi)1576{1577struct name_snapshot n;1578struct wakeup_source *ws;15791580if (!epi->ep->ws) {1581epi->ep->ws = wakeup_source_register(NULL, "eventpoll");1582if (!epi->ep->ws)1583return -ENOMEM;1584}15851586take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);1587ws = wakeup_source_register(NULL, n.name.name);1588release_dentry_name_snapshot(&n);15891590if (!ws)1591return -ENOMEM;1592rcu_assign_pointer(epi->ws, ws);15931594return 0;1595}15961597/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */1598static noinline void ep_destroy_wakeup_source(struct epitem *epi)1599{1600struct wakeup_source *ws = ep_wakeup_source(epi);16011602RCU_INIT_POINTER(epi->ws, NULL);16031604/*1605* wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is1606* used internally by wakeup_source_remove, too (called by1607* wakeup_source_unregister), so we cannot use call_rcu1608*/1609synchronize_rcu();1610wakeup_source_unregister(ws);1611}16121613static int attach_epitem(struct file *file, struct epitem *epi)1614{1615struct epitems_head *to_free = NULL;1616struct hlist_head *head = NULL;1617struct eventpoll *ep = NULL;16181619if (is_file_epoll(file))1620ep = file->private_data;16211622if (ep) {1623head = &ep->refs;1624} else if (!READ_ONCE(file->f_ep)) {1625allocate:1626to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);1627if (!to_free)1628return -ENOMEM;1629head = &to_free->epitems;1630}1631spin_lock(&file->f_lock);1632if (!file->f_ep) {1633if (unlikely(!head)) {1634spin_unlock(&file->f_lock);1635goto allocate;1636}1637/* See eventpoll_release() for details. */1638WRITE_ONCE(file->f_ep, head);1639to_free = NULL;1640}1641hlist_add_head_rcu(&epi->fllink, file->f_ep);1642spin_unlock(&file->f_lock);1643free_ephead(to_free);1644return 0;1645}16461647/*1648* Must be called with "mtx" held.1649*/1650static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,1651struct file *tfile, int fd, int full_check)1652{1653int error, pwake = 0;1654__poll_t revents;1655struct epitem *epi;1656struct ep_pqueue epq;1657struct eventpoll *tep = NULL;16581659if (is_file_epoll(tfile))1660tep = tfile->private_data;16611662lockdep_assert_irqs_enabled();16631664if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,1665max_user_watches) >= 0))1666return -ENOSPC;1667percpu_counter_inc(&ep->user->epoll_watches);16681669if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {1670percpu_counter_dec(&ep->user->epoll_watches);1671return -ENOMEM;1672}16731674/* Item initialization follow here ... 
*/1675INIT_LIST_HEAD(&epi->rdllink);1676epi->ep = ep;1677ep_set_ffd(&epi->ffd, tfile, fd);1678epi->event = *event;1679epi->next = EP_UNACTIVE_PTR;16801681if (tep)1682mutex_lock_nested(&tep->mtx, 1);1683/* Add the current item to the list of active epoll hook for this file */1684if (unlikely(attach_epitem(tfile, epi) < 0)) {1685if (tep)1686mutex_unlock(&tep->mtx);1687kmem_cache_free(epi_cache, epi);1688percpu_counter_dec(&ep->user->epoll_watches);1689return -ENOMEM;1690}16911692if (full_check && !tep)1693list_file(tfile);16941695/*1696* Add the current item to the RB tree. All RB tree operations are1697* protected by "mtx", and ep_insert() is called with "mtx" held.1698*/1699ep_rbtree_insert(ep, epi);1700if (tep)1701mutex_unlock(&tep->mtx);17021703/*1704* ep_remove_safe() calls in the later error paths can't lead to1705* ep_free() as the ep file itself still holds an ep reference.1706*/1707ep_get(ep);17081709/* now check if we've created too many backpaths */1710if (unlikely(full_check && reverse_path_check())) {1711ep_remove_safe(ep, epi);1712return -EINVAL;1713}17141715if (epi->event.events & EPOLLWAKEUP) {1716error = ep_create_wakeup_source(epi);1717if (error) {1718ep_remove_safe(ep, epi);1719return error;1720}1721}17221723/* Initialize the poll table using the queue callback */1724epq.epi = epi;1725init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);17261727/*1728* Attach the item to the poll hooks and get current event bits.1729* We can safely use the file* here because its usage count has1730* been increased by the caller of this function. Note that after1731* this operation completes, the poll callback can start hitting1732* the new item.1733*/1734revents = ep_item_poll(epi, &epq.pt, 1);17351736/*1737* We have to check if something went wrong during the poll wait queue1738* install process. Namely an allocation for a wait queue failed due1739* high memory pressure.1740*/1741if (unlikely(!epq.epi)) {1742ep_remove_safe(ep, epi);1743return -ENOMEM;1744}17451746/* We have to drop the new item inside our item list to keep track of it */1747write_lock_irq(&ep->lock);17481749/* record NAPI ID of new item if present */1750ep_set_busy_poll_napi_id(epi);17511752/* If the file is already "ready" we drop it inside the ready list */1753if (revents && !ep_is_linked(epi)) {1754list_add_tail(&epi->rdllink, &ep->rdllist);1755ep_pm_stay_awake(epi);17561757/* Notify waiting tasks that events are available */1758if (waitqueue_active(&ep->wq))1759wake_up(&ep->wq);1760if (waitqueue_active(&ep->poll_wait))1761pwake++;1762}17631764write_unlock_irq(&ep->lock);17651766/* We have to call this outside the lock */1767if (pwake)1768ep_poll_safewake(ep, NULL, 0);17691770return 0;1771}17721773/*1774* Modify the interest event mask by dropping an event if the new mask1775* has a match in the current file status. 
Must be called with "mtx" held.1776*/1777static int ep_modify(struct eventpoll *ep, struct epitem *epi,1778const struct epoll_event *event)1779{1780int pwake = 0;1781poll_table pt;17821783lockdep_assert_irqs_enabled();17841785init_poll_funcptr(&pt, NULL);17861787/*1788* Set the new event interest mask before calling f_op->poll();1789* otherwise we might miss an event that happens between the1790* f_op->poll() call and the new event set registering.1791*/1792epi->event.events = event->events; /* need barrier below */1793epi->event.data = event->data; /* protected by mtx */1794if (epi->event.events & EPOLLWAKEUP) {1795if (!ep_has_wakeup_source(epi))1796ep_create_wakeup_source(epi);1797} else if (ep_has_wakeup_source(epi)) {1798ep_destroy_wakeup_source(epi);1799}18001801/*1802* The following barrier has two effects:1803*1804* 1) Flush epi changes above to other CPUs. This ensures1805* we do not miss events from ep_poll_callback if an1806* event occurs immediately after we call f_op->poll().1807* We need this because we did not take ep->lock while1808* changing epi above (but ep_poll_callback does take1809* ep->lock).1810*1811* 2) We also need to ensure we do not miss _past_ events1812* when calling f_op->poll(). This barrier also1813* pairs with the barrier in wq_has_sleeper (see1814* comments for wq_has_sleeper).1815*1816* This barrier will now guarantee ep_poll_callback or f_op->poll1817* (or both) will notice the readiness of an item.1818*/1819smp_mb();18201821/*1822* Get current event bits. We can safely use the file* here because1823* its usage count has been increased by the caller of this function.1824* If the item is "hot" and it is not registered inside the ready1825* list, push it inside.1826*/1827if (ep_item_poll(epi, &pt, 1)) {1828write_lock_irq(&ep->lock);1829if (!ep_is_linked(epi)) {1830list_add_tail(&epi->rdllink, &ep->rdllist);1831ep_pm_stay_awake(epi);18321833/* Notify waiting tasks that events are available */1834if (waitqueue_active(&ep->wq))1835wake_up(&ep->wq);1836if (waitqueue_active(&ep->poll_wait))1837pwake++;1838}1839write_unlock_irq(&ep->lock);1840}18411842/* We have to call this outside the lock */1843if (pwake)1844ep_poll_safewake(ep, NULL, 0);18451846return 0;1847}18481849static int ep_send_events(struct eventpoll *ep,1850struct epoll_event __user *events, int maxevents)1851{1852struct epitem *epi, *tmp;1853LIST_HEAD(txlist);1854poll_table pt;1855int res = 0;18561857/*1858* Always short-circuit for fatal signals to allow threads to make a1859* timely exit without the chance of finding more events available and1860* fetching repeatedly.1861*/1862if (fatal_signal_pending(current))1863return -EINTR;18641865init_poll_funcptr(&pt, NULL);18661867mutex_lock(&ep->mtx);1868ep_start_scan(ep, &txlist);18691870/*1871* We can loop without lock because we are passed a task private list.1872* Items cannot vanish during the loop we are holding ep->mtx.1873*/1874list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {1875struct wakeup_source *ws;1876__poll_t revents;18771878if (res >= maxevents)1879break;18801881/*1882* Activate ep->ws before deactivating epi->ws to prevent1883* triggering auto-suspend here (in case we reactive epi->ws1884* below).1885*1886* This could be rearranged to delay the deactivation of epi->ws1887* instead, but then epi->ws would temporarily be out of sync1888* with ep_is_linked().1889*/1890ws = ep_wakeup_source(epi);1891if (ws) {1892if (ws->active)1893__pm_stay_awake(ep->ws);1894__pm_relax(ws);1895}18961897list_del_init(&epi->rdllink);18981899/*1900* If 
the event mask intersect the caller-requested one,1901* deliver the event to userspace. Again, we are holding ep->mtx,1902* so no operations coming from userspace can change the item.1903*/1904revents = ep_item_poll(epi, &pt, 1);1905if (!revents)1906continue;19071908events = epoll_put_uevent(revents, epi->event.data, events);1909if (!events) {1910list_add(&epi->rdllink, &txlist);1911ep_pm_stay_awake(epi);1912if (!res)1913res = -EFAULT;1914break;1915}1916res++;1917if (epi->event.events & EPOLLONESHOT)1918epi->event.events &= EP_PRIVATE_BITS;1919else if (!(epi->event.events & EPOLLET)) {1920/*1921* If this file has been added with Level1922* Trigger mode, we need to insert back inside1923* the ready list, so that the next call to1924* epoll_wait() will check again the events1925* availability. At this point, no one can insert1926* into ep->rdllist besides us. The epoll_ctl()1927* callers are locked out by1928* ep_send_events() holding "mtx" and the1929* poll callback will queue them in ep->ovflist.1930*/1931list_add_tail(&epi->rdllink, &ep->rdllist);1932ep_pm_stay_awake(epi);1933}1934}1935ep_done_scan(ep, &txlist);1936mutex_unlock(&ep->mtx);19371938return res;1939}19401941static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)1942{1943struct timespec64 now;19441945if (ms < 0)1946return NULL;19471948if (!ms) {1949to->tv_sec = 0;1950to->tv_nsec = 0;1951return to;1952}19531954to->tv_sec = ms / MSEC_PER_SEC;1955to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);19561957ktime_get_ts64(&now);1958*to = timespec64_add_safe(now, *to);1959return to;1960}19611962/*1963* autoremove_wake_function, but remove even on failure to wake up, because we1964* know that default_wake_function/ttwu will only fail if the thread is already1965* woken, and in that case the ep_poll loop will remove the entry anyways, not1966* try to reuse it.1967*/1968static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,1969unsigned int mode, int sync, void *key)1970{1971int ret = default_wake_function(wq_entry, mode, sync, key);19721973/*1974* Pairs with list_empty_careful in ep_poll, and ensures future loop1975* iterations see the cause of this wakeup.1976*/1977list_del_init_careful(&wq_entry->entry);1978return ret;1979}19801981static int ep_try_send_events(struct eventpoll *ep,1982struct epoll_event __user *events, int maxevents)1983{1984int res;19851986/*1987* Try to transfer events to user space. In case we get 0 events and1988* there's still timeout left over, we go trying again in search of1989* more luck.1990*/1991res = ep_send_events(ep, events, maxevents);1992if (res > 0)1993ep_suspend_napi_irqs(ep);1994return res;1995}19961997static int ep_schedule_timeout(ktime_t *to)1998{1999if (to)2000return ktime_after(*to, ktime_get());2001else2002return 1;2003}20042005/**2006* ep_poll - Retrieves ready events, and delivers them to the caller-supplied2007* event buffer.2008*2009* @ep: Pointer to the eventpoll context.2010* @events: Pointer to the userspace buffer where the ready events should be2011* stored.2012* @maxevents: Size (in terms of number of events) of the caller event buffer.2013* @timeout: Maximum timeout for the ready events fetch operation, in2014* timespec. 
/*
 * autoremove_wake_function, but remove even on failure to wake up, because we
 * know that default_wake_function/ttwu will only fail if the thread is already
 * woken, and in that case the ep_poll loop will remove the entry anyways, not
 * try to reuse it.
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
				       unsigned int mode, int sync, void *key)
{
	int ret = default_wake_function(wq_entry, mode, sync, key);

	/*
	 * Pairs with list_empty_careful in ep_poll, and ensures future loop
	 * iterations see the cause of this wakeup.
	 */
	list_del_init_careful(&wq_entry->entry);
	return ret;
}

static int ep_try_send_events(struct eventpoll *ep,
			      struct epoll_event __user *events, int maxevents)
{
	int res;

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	res = ep_send_events(ep, events, maxevents);
	if (res > 0)
		ep_suspend_napi_irqs(ep);
	return res;
}

static int ep_schedule_timeout(ktime_t *to)
{
	if (to)
		return ktime_after(*to, ktime_get());
	else
		return 1;
}

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           timespec. If the timeout is zero, the function will not block,
 *           while if the @timeout ptr is NULL, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Return: the number of ready events which have been fetched, or an
 *         error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, struct timespec64 *timeout)
{
	int res, eavail, timed_out = 0;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	lockdep_assert_irqs_enabled();

	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
		slack = select_estimate_accuracy(timeout);
		to = &expires;
		*to = timespec64_to_ktime(*timeout);
	} else if (timeout) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non-blocking operation.
		 */
		timed_out = 1;
	}

	/*
	 * This call is racy: We may or may not see events that are being added
	 * to the ready list under the lock (e.g., in IRQ callbacks). For cases
	 * with a non-zero timeout, this thread will check the ready list under
	 * lock and will add to the wait queue. For cases with a zero
	 * timeout, the user by definition should not care and will have to
	 * recheck again.
	 */
	eavail = ep_events_available(ep);

	while (1) {
		if (eavail) {
			res = ep_try_send_events(ep, events, maxevents);
			if (res)
				return res;
		}

		if (timed_out)
			return 0;

		eavail = ep_busy_loop(ep);
		if (eavail)
			continue;

		if (signal_pending(current))
			return -EINTR;

		/*
		 * Internally init_wait() uses autoremove_wake_function(),
		 * thus the wait entry is removed from the wait queue on each
		 * wakeup. Why is that important? In case of several waiters
		 * each new wakeup will hit the next waiter, giving it the
		 * chance to harvest new events. Otherwise wakeups can be
		 * lost. This is also good performance-wise, because on the
		 * normal wakeup path there is no need to call
		 * __remove_wait_queue() explicitly, thus ep->lock is not
		 * taken, which would otherwise stall event delivery.
		 *
		 * In fact, we now use an even more aggressive function that
		 * unconditionally removes, because we don't reuse the wait
		 * entry between loop iterations. This lets us also avoid the
		 * performance issue if a process is killed, causing all of its
		 * threads to wake up without being removed normally.
		 */
		init_wait(&wait);
		wait.func = ep_autoremove_wake_function;

		write_lock_irq(&ep->lock);
		/*
		 * Barrierless variant, waitqueue_active() is called under
		 * the same lock on wakeup ep_poll_callback() side, so it
		 * is safe to avoid an explicit barrier.
		 */
		__set_current_state(TASK_INTERRUPTIBLE);

		/*
		 * Do the final check under the lock. ep_start/done_scan()
		 * plays with two lists (->rdllist and ->ovflist) and there
		 * is always a race when both lists are empty for a short
		 * period of time although events are pending, so the lock is
		 * important.
		 */
		eavail = ep_events_available(ep);
		if (!eavail)
			__add_wait_queue_exclusive(&ep->wq, &wait);

		write_unlock_irq(&ep->lock);

		if (!eavail)
			timed_out = !ep_schedule_timeout(to) ||
				!schedule_hrtimeout_range(to, slack,
							  HRTIMER_MODE_ABS);
		__set_current_state(TASK_RUNNING);

		/*
		 * We were woken up, thus go and try to harvest some events.
		 * If timed out and still on the wait queue, recheck eavail
		 * carefully under lock, below.
		 */
		eavail = 1;

		if (!list_empty_careful(&wait.entry)) {
			write_lock_irq(&ep->lock);
			/*
			 * If the thread timed out and is not on the wait queue,
			 * it means that the thread was woken up after its
			 * timeout expired before it could reacquire the lock.
			 * Thus, when wait.entry is empty, it needs to harvest
			 * events.
			 */
			if (timed_out)
				eavail = list_empty(&wait.entry);
			__remove_wait_queue(&ep->wq, &wait);
			write_unlock_irq(&ep->lock);
		}
	}
}
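/*
 * Illustrative, hypothetical userspace sketch of how the @timeout
 * conventions above map onto epoll_wait(2) (epfd and evs are made up):
 *
 *	struct epoll_event evs[64];
 *
 *	epoll_wait(epfd, evs, 64, 0);	// zero timespec: poll and return
 *	epoll_wait(epfd, evs, 64, -1);	// NULL timespec: block until an
 *					// event or a signal arrives
 *	epoll_wait(epfd, evs, 64, 250);	// absolute expiry ~250ms from now
 */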
/**
 * ep_loop_check_proc - verify that adding an epoll file @ep inside another
 *                      epoll file does not create closed loops, and
 *                      determine the depth of the subtree starting at @ep
 *
 * @ep: the &struct eventpoll to be currently checked.
 * @depth: Current depth of the path being checked.
 *
 * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
 */
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
	int result = 0;
	struct rb_node *rbp;
	struct epitem *epi;

	if (ep->gen == loop_check_gen)
		return ep->loop_check_depth;

	mutex_lock_nested(&ep->mtx, depth + 1);
	ep->gen = loop_check_gen;
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (unlikely(is_file_epoll(epi->ffd.file))) {
			struct eventpoll *ep_tovisit;
			ep_tovisit = epi->ffd.file->private_data;
			if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
				result = INT_MAX;
			else
				result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
			if (result > EP_MAX_NESTS)
				break;
		} else {
			/*
			 * If we've reached a file that is not associated with
			 * an ep, then we need to check if the newly added
			 * links are going to add too many wakeup paths. We do
			 * this by adding it to the tfile_check_list, if it's
			 * not already there, and calling reverse_path_check()
			 * during ep_insert().
			 */
			list_file(epi->ffd.file);
		}
	}
	ep->loop_check_depth = result;
	mutex_unlock(&ep->mtx);

	return result;
}

/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
{
	int result = 0;
	struct epitem *epi;

	if (ep->gen == loop_check_gen)
		return ep->loop_check_depth;
	hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
		result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
	ep->gen = loop_check_gen;
	ep->loop_check_depth = result;
	return result;
}

/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
 *                 into another epoll file (represented by @ep) does not create
 *                 closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll we are inserting into.
 * @to: Pointer to the epoll to be inserted.
 *
 * Return: %zero if adding the epoll @to inside the epoll @ep
 *         does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
	int depth, upwards_depth;

	inserting_into = ep;
	/*
	 * Check how deep down we can get from @to, and whether it is possible
	 * to loop up to @ep.
	 */
	depth = ep_loop_check_proc(to, 0);
	if (depth > EP_MAX_NESTS)
		return -1;
	/* Check how far up we can go from @ep. */
	rcu_read_lock();
	upwards_depth = ep_get_upwards_depth_proc(ep, 0);
	rcu_read_unlock();

	return (depth + 1 + upwards_depth > EP_MAX_NESTS) ? -1 : 0;
}
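/*
 * A worked example of the depth arithmetic above (the chain of epoll
 * fds is hypothetical): with EP_MAX_NESTS == 4, suppose five epoll fds
 * are already chained as A -> B -> C -> D -> E (A watches B, B watches
 * C, and so on) and userspace tries to add a sixth epoll fd F into E.
 * Then @to == F with a downward depth of 0, @ep == E with an upwards
 * depth of 4, and 0 + 1 + 4 > 4, so the insert is rejected
 * (do_epoll_ctl() reports this as -ELOOP). Trying to add A into E
 * instead is caught by ep_loop_check_proc() reaching inserting_into
 * and returning INT_MAX.
 */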
static void clear_tfile_check_list(void)
{
	rcu_read_lock();
	while (tfile_check_list != EP_UNACTIVE_PTR) {
		struct epitems_head *head = tfile_check_list;
		tfile_check_list = head->next;
		unlist_file(head);
	}
	rcu_read_unlock();
}

/*
 * Open an eventpoll file descriptor.
 */
static int do_epoll_create(int flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency. */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to set up an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				  O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	ep->file = file;
	fd_install(fd, file);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_clear_and_put(ep);
	return error;
}

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	return do_epoll_create(flags);
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	return do_epoll_create(0);
}
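/*
 * Illustrative, hypothetical userspace sketch: as the two syscalls
 * above show, the legacy size argument only needs to be positive and
 * is otherwise ignored, while epoll_create1() takes real flags.
 *
 *	int epfd = epoll_create1(EPOLL_CLOEXEC);	// preferred
 *	int old  = epoll_create(64);			// size is ignored
 *	int bad  = epoll_create(0);			// fails with EINVAL
 */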
#ifdef CONFIG_PM_SLEEP
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
	if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
		epev->events &= ~EPOLLWAKEUP;
}
#else
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
	epev->events &= ~EPOLLWAKEUP;
}
#endif

static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
				   bool nonblock)
{
	if (!nonblock) {
		mutex_lock_nested(mutex, depth);
		return 0;
	}
	if (mutex_trylock(mutex))
		return 0;
	return -EAGAIN;
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
		 bool nonblock)
{
	int error;
	int full_check = 0;
	struct eventpoll *ep;
	struct epitem *epi;
	struct eventpoll *tep = NULL;

	CLASS(fd, f)(epfd);
	if (fd_empty(f))
		return -EBADF;

	/* Get the "struct file *" for the target file */
	CLASS(fd, tf)(fd);
	if (fd_empty(tf))
		return -EBADF;

	/* The target file descriptor must support poll */
	if (!file_can_poll(fd_file(tf)))
		return -EPERM;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
	 * Also, we do not currently support nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = fd_file(f)->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epnested_mutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epnested_mutex' on add is to prevent complex topologies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
	if (error)
		goto error_tgt_fput;
	if (op == EPOLL_CTL_ADD) {
		if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
		    is_file_epoll(fd_file(tf))) {
			mutex_unlock(&ep->mtx);
			error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			loop_check_gen++;
			full_check = 1;
			if (is_file_epoll(fd_file(tf))) {
				tep = fd_file(tf)->private_data;
				error = -ELOOP;
				if (ep_loop_check(ep, tep) != 0)
					goto error_tgt_fput;
			}
			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
			if (error)
				goto error_tgt_fput;
		}
	}

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, fd_file(tf), fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds->events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi) {
			/*
			 * The eventpoll itself is still alive: the refcount
			 * can't go to zero here.
			 */
			ep_remove_safe(ep, epi);
			error = 0;
		} else {
			error = -ENOENT;
		}
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds->events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check) {
		clear_tfile_check_list();
		loop_check_gen++;
		mutex_unlock(&epnested_mutex);
	}
	return error;
}

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	struct epoll_event epds;

	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		return -EFAULT;

	return do_epoll_ctl(epfd, op, fd, &epds, false);
}

static int ep_check_params(struct file *file, struct epoll_event __user *evs,
			   int maxevents)
{
	/* The maximum number of events must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	if (!is_file_epoll(file))
		return -EINVAL;

	return 0;
}

int epoll_sendevents(struct file *file, struct epoll_event __user *events,
		     int maxevents)
{
	struct eventpoll *ep;
	int ret;

	ret = ep_check_params(file, events, maxevents);
	if (unlikely(ret))
		return ret;

	ep = file->private_data;
	/*
	 * Racy call, but that's ok - it should get retried based on
	 * poll readiness anyway.
	 */
	if (ep_events_available(ep))
		return ep_try_send_events(ep, events, maxevents);
	return 0;
}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, struct timespec64 *to)
{
	struct eventpoll *ep;
	int ret;

	/* Get the "struct file *" for the eventpoll file */
	CLASS(fd, f)(epfd);
	if (fd_empty(f))
		return -EBADF;

	ret = ep_check_params(fd_file(f), events, maxevents);
	if (unlikely(ret))
		return ret;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = fd_file(f)->private_data;

	/* Time to fish for events ... */
	return ep_poll(ep, events, maxevents, to);
}

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	struct timespec64 to;

	return do_epoll_wait(epfd, events, maxevents,
			     ep_timeout_to_timespec(&to, timeout));
}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 */
static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
			  int maxevents, struct timespec64 *to,
			  const sigset_t __user *sigmask, size_t sigsetsize)
{
	int error;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	error = set_user_sigmask(sigmask, sigsetsize);
	if (error)
		return error;

	error = do_epoll_wait(epfd, events, maxevents, to);

	restore_saved_sigmask_unless(error == -EINTR);

	return error;
}

SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	struct timespec64 to;

	return do_epoll_pwait(epfd, events, maxevents,
			      ep_timeout_to_timespec(&to, timeout),
			      sigmask, sigsetsize);
}

SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
		int, maxevents, const struct __kernel_timespec __user *, timeout,
		const sigset_t __user *, sigmask, size_t, sigsetsize)
{
	struct timespec64 ts, *to = NULL;

	if (timeout) {
		if (get_timespec64(&ts, timeout))
			return -EFAULT;
		to = &ts;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	return do_epoll_pwait(epfd, events, maxevents, to,
			      sigmask, sigsetsize);
}

#ifdef CONFIG_COMPAT
static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
				 int maxevents, struct timespec64 *timeout,
				 const compat_sigset_t __user *sigmask,
				 compat_size_t sigsetsize)
{
	long err;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	err = set_compat_user_sigmask(sigmask, sigsetsize);
	if (err)
		return err;

	err = do_epoll_wait(epfd, events, maxevents, timeout);

	restore_saved_sigmask_unless(err == -EINTR);

	return err;
}

COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
		       struct epoll_event __user *, events,
		       int, maxevents, int, timeout,
		       const compat_sigset_t __user *, sigmask,
		       compat_size_t, sigsetsize)
{
	struct timespec64 to;

	return do_compat_epoll_pwait(epfd, events, maxevents,
				     ep_timeout_to_timespec(&to, timeout),
				     sigmask, sigsetsize);
}

COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
		       struct epoll_event __user *, events,
		       int, maxevents,
		       const struct __kernel_timespec __user *, timeout,
		       const compat_sigset_t __user *, sigmask,
		       compat_size_t, sigsetsize)
{
	struct timespec64 ts, *to = NULL;

	if (timeout) {
		if (get_timespec64(&ts, timeout))
			return -EFAULT;
		to = &ts;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	return do_compat_epoll_pwait(epfd, events, maxevents, to,
				     sigmask, sigsetsize);
}

#endif

static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allows the top 4% of low memory to be allocated for epoll watches
	 * (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
	epoll_sysctls_init();

	ephead_cache = kmem_cache_create("ep_head",
		sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
fs_initcall(eventpoll_init);
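/*
 * A worked example of the max_user_watches sizing above (numbers are
 * illustrative): with 4 GiB of low memory and 4 KiB pages,
 * si.totalram - si.totalhigh is 1048576 pages, and (1048576 / 25)
 * << PAGE_SHIFT is roughly 170 MB, i.e. about 4% of low memory.
 * Dividing by EP_ITEM_COST (sizeof(struct epitem) +
 * sizeof(struct eppoll_entry), roughly a couple hundred bytes on
 * 64-bit) gives a per-user limit on the order of several hundred
 * thousand watches, adjustable later via the
 * fs.epoll.max_user_watches sysctl.
 */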