/* Copyright libuv contributors. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "uv.h"
#include "internal.h"
#include <errno.h>
#include <sys/epoll.h>

int uv__epoll_init(uv_loop_t* loop) {
  int fd;
  fd = epoll_create1(O_CLOEXEC);

  /* epoll_create1() can fail either because it's not implemented (old kernel)
   * or because it doesn't understand the O_CLOEXEC flag.
   */
  if (fd == -1 && (errno == ENOSYS || errno == EINVAL)) {
    fd = epoll_create(256);

    if (fd != -1)
      uv__cloexec(fd, 1);
  }

  loop->backend_fd = fd;
  if (fd == -1)
    return UV__ERR(errno);

  return 0;
}


void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  struct epoll_event* events;
  struct epoll_event dummy;
  uintptr_t i;
  uintptr_t nfds;

  assert(loop->watchers != NULL);
  assert(fd >= 0);

  events = (struct epoll_event*) loop->watchers[loop->nwatchers];
  nfds = (uintptr_t) loop->watchers[loop->nwatchers + 1];
  if (events != NULL)
    /* Invalidate events with same file descriptor */
    for (i = 0; i < nfds; i++)
      if (events[i].data.fd == fd)
        events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   */
  if (loop->backend_fd >= 0) {
    /* Work around a bug in kernels 3.10 to 3.19 where passing a struct that
     * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
     */
    memset(&dummy, 0, sizeof(dummy));
    epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
  }
}


int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}


void uv__io_poll(uv_loop_t* loop, int timeout) {
  /* A bug in kernels < 2.6.37 makes timeouts larger than ~30 minutes
   * effectively infinite on 32-bit architectures. To avoid blocking
   * indefinitely, we cap the timeout and poll again if necessary.
   *
   * Note that "30 minutes" is a simplification because it depends on
   * the value of CONFIG_HZ. The magic constant assumes CONFIG_HZ=1200,
   * that being the largest value I have seen in the wild (and only once.)
   */
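  /* A plausible derivation of the magic constant below: with CONFIG_HZ=1200,
   * INT32_MAX / 1200 == 1789569, i.e. a timeout of 1789569 ms (about 29.8
   * minutes) is the largest value whose conversion to jiffies still fits in
   * a signed 32-bit integer, hence the "~30 minutes" above.
   */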
  static const int max_safe_timeout = 1789569;
  static int no_epoll_pwait_cached;
  static int no_epoll_wait_cached;
  int no_epoll_pwait;
  int no_epoll_wait;
  struct epoll_event events[1024];
  struct epoll_event* pe;
  struct epoll_event e;
  int real_timeout;
  QUEUE* q;
  uv__io_t* w;
  sigset_t sigset;
  uint64_t sigmask;
  uint64_t base;
  int have_signals;
  int nevents;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  if (loop->nfds == 0) {
    assert(QUEUE_EMPTY(&loop->watcher_queue));
    return;
  }

  memset(&e, 0, sizeof(e));

  while (!QUEUE_EMPTY(&loop->watcher_queue)) {
    q = QUEUE_HEAD(&loop->watcher_queue);
    QUEUE_REMOVE(q);
    QUEUE_INIT(q);

    w = QUEUE_DATA(q, uv__io_t, watcher_queue);
    assert(w->pevents != 0);
    assert(w->fd >= 0);
    assert(w->fd < (int) loop->nwatchers);

    e.events = w->pevents;
    e.data.fd = w->fd;

    if (w->events == 0)
      op = EPOLL_CTL_ADD;
    else
      op = EPOLL_CTL_MOD;

    /* XXX Future optimization: do EPOLL_CTL_MOD lazily if we stop watching
     * events, skip the syscall and squelch the events after epoll_wait().
     */
    if (epoll_ctl(loop->backend_fd, op, w->fd, &e)) {
      if (errno != EEXIST)
        abort();

      assert(op == EPOLL_CTL_ADD);

      /* We've reactivated a file descriptor that's been watched before. */
      if (epoll_ctl(loop->backend_fd, EPOLL_CTL_MOD, w->fd, &e))
        abort();
    }

    w->events = w->pevents;
  }

  sigmask = 0;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask |= 1 << (SIGPROF - 1);
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;
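  /* When the loop was configured with UV_METRICS_IDLE_TIME, stash the
   * caller's timeout in user_timeout and make the first pass non-blocking;
   * reset_timeout later restores the real timeout before we block for real.
   * (Presumably this keeps work that is already pending from being counted
   * as idle time.)
   */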
  if (uv__get_internal_fields(loop)->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  /* You could argue there is a dependency between these two but
   * ultimately we don't care about their ordering with respect
   * to one another. Worst case, we make a few system calls that
   * could have been avoided because another thread already knows
   * they fail with ENOSYS. Hardly the end of the world.
   */
  no_epoll_pwait = uv__load_relaxed(&no_epoll_pwait_cached);
  no_epoll_wait = uv__load_relaxed(&no_epoll_wait_cached);

  for (;;) {
    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* See the comment for max_safe_timeout for an explanation of why
     * this is necessary. Executive summary: kernel bug workaround.
     */
    if (sizeof(int32_t) == sizeof(long) && timeout >= max_safe_timeout)
      timeout = max_safe_timeout;

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_BLOCK, &sigset, NULL))
        abort();

    if (no_epoll_wait != 0 || (sigmask != 0 && no_epoll_pwait == 0)) {
      nfds = epoll_pwait(loop->backend_fd,
                         events,
                         ARRAY_SIZE(events),
                         timeout,
                         &sigset);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_pwait_cached, 1);
        no_epoll_pwait = 1;
      }
    } else {
      nfds = epoll_wait(loop->backend_fd,
                        events,
                        ARRAY_SIZE(events),
                        timeout);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_wait_cached, 1);
        no_epoll_wait = 1;
      }
    }

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_UNBLOCK, &sigset, NULL))
        abort();

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == 0) {
      assert(timeout != -1);

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* We may have been inside the system call for longer than |timeout|
       * milliseconds so we need to update the timestamp to avoid drift.
       */
      goto update_timeout;
    }

    if (nfds == -1) {
      if (errno == ENOSYS) {
        /* epoll_wait() or epoll_pwait() failed, try the other system call. */
        assert(no_epoll_wait == 0 || no_epoll_pwait == 0);
        continue;
      }

      if (errno != EINTR)
        abort();

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_signals = 0;
    nevents = 0;
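    /* Stash the events array and its length in the two spare watcher slots;
     * uv__platform_invalidate_fd() reads them back so that, if a callback
     * below closes a file descriptor, any not-yet-processed event for that
     * fd has its data.fd set to -1 and is skipped by the loop further down.
     */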
    {
      /* Squelch a -Waddress-of-packed-member warning with gcc >= 9. */
      union {
        struct epoll_event* events;
        uv__io_t* watchers;
      } x;

      x.events = events;
      assert(loop->watchers != NULL);
      loop->watchers[loop->nwatchers] = x.watchers;
      loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
    }

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         */
        epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when a previous callback invocation in this loop has
       * stopped the current watcher. Also filters out events that the user
       * has not requested us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event. In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again. By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read. If anything, libuv is to blame here. The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event. We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last. This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    loop->watchers[loop->nwatchers] = NULL;
    loop->watchers[loop->nwatchers + 1] = NULL;

    if (have_signals != 0)
      return;  /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      return;
    }

    if (timeout == 0)
      return;

    if (timeout == -1)
      continue;

update_timeout:
    assert(timeout > 0);

    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      return;

    timeout = real_timeout;
  }
}