/*
 * Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// no precompiled headers
#include "jvm.h"
#include "classfile/vmSymbols.hpp"
#include "code/icBuffer.hpp"
#include "code/vtableStubs.hpp"
#include "compiler/compileBroker.hpp"
#include "compiler/disassembler.hpp"
#include "interpreter/interpreter.hpp"
#include "jvmtifiles/jvmti.h"
#include "logging/log.hpp"
#include "logging/logStream.hpp"
#include "memory/allocation.inline.hpp"
#include "oops/oop.inline.hpp"
#include "os_linux.inline.hpp"
#include "os_posix.inline.hpp"
#include "os_share_linux.hpp"
#include "osContainer_linux.hpp"
#include "prims/jniFastGetField.hpp"
#include "prims/jvm_misc.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/globals.hpp"
#include "runtime/globals_extension.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/init.hpp"
#include "runtime/java.hpp"
#include "runtime/javaCalls.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/osThread.hpp"
#include "runtime/perfMemory.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/statSampler.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "runtime/threadCritical.hpp"
#include "runtime/threadSMR.hpp"
#include "runtime/timer.hpp"
#include "runtime/vm_version.hpp"
#include "signals_posix.hpp"
#include "semaphore_posix.hpp"
#include "services/memTracker.hpp"
#include "services/runtimeService.hpp"
#include "utilities/align.hpp"
#include "utilities/decoder.hpp"
#include "utilities/defaultStream.hpp"
#include "utilities/events.hpp"
#include "utilities/elfFile.hpp"
#include "utilities/growableArray.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/vmError.hpp"

// put OS-includes here
# include <sys/types.h>
# include <sys/mman.h>
# include <sys/stat.h>
# include <sys/select.h>
# include <pthread.h>
# include <signal.h>
# include <endian.h>
# include <errno.h>
# include <dlfcn.h>
# include <stdio.h>
# include <unistd.h>
# include <sys/resource.h>
# include <sys/time.h>
# include <sys/times.h>
# include <sys/utsname.h>
# include <sys/socket.h>
# include <pwd.h>
# include <poll.h>
# include <fcntl.h>
# include <string.h>
# include <syscall.h>
# include <sys/sysinfo.h>
# include <sys/ipc.h>
# include <sys/shm.h>
# include <link.h>
# include <stdint.h>
# include <inttypes.h>
# include <sys/ioctl.h>
# include <linux/elf-em.h>
#ifdef __GLIBC__
# include <malloc.h>
#endif

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#include <sched.h>
#undef _GNU_SOURCE
#else
#include <sched.h>
#endif

// If RUSAGE_THREAD has not been defined for getrusage(), define it here.
// The code calling getrusage() is prepared to handle the associated failure.
#ifndef RUSAGE_THREAD
#define RUSAGE_THREAD (1) /* only the calling thread */
#endif

#define MAX_PATH (2 * K)

#define MAX_SECS 100000000

// for timer info max values which include all bits
#define ALL_64_BITS CONST64(0xFFFFFFFFFFFFFFFF)

#if defined(MUSL_LIBC) || defined(__ANDROID__)
// dlvsym is not a part of POSIX,
// and musl libc doesn't implement it.
static void *dlvsym(void *handle,
                    const char *symbol,
                    const char *version) {
  // Fall back to the default version of the symbol.
  return dlsym(handle, symbol);
}
#endif
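
// Illustrative sketch of what dlvsym() provides over the dlsym() fallback
// above (assumes glibc; the version tag below is hypothetical and
// library-specific):
//
//   void *handle = dlopen("libc.so.6", RTLD_LAZY);
//   if (handle != NULL) {
//     void *versioned = dlvsym(handle, "realpath", "GLIBC_2.3"); // exact version
//     void *dflt      = dlsym(handle, "realpath");               // default version
//     dlclose(handle);
//   }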

enum CoredumpFilterBit {
  FILE_BACKED_PVT_BIT = 1 << 2,
  FILE_BACKED_SHARED_BIT = 1 << 3,
  LARGEPAGES_BIT = 1 << 6,
  DAX_SHARED_BIT = 1 << 8
};
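
// A minimal sketch of how one of these bits can be enabled for the current
// process by rewriting /proc/self/coredump_filter (see core(5)); the helper
// name is illustrative only:
//
//   static void enable_coredump_filter_bit(CoredumpFilterBit bit) {
//     FILE *f = fopen("/proc/self/coredump_filter", "r+");
//     if (f == NULL) {
//       return;
//     }
//     long cdm;
//     if (fscanf(f, "%lx", &cdm) == 1) {
//       rewind(f);
//       fprintf(f, "%#lx", cdm | bit);  // preserve existing bits, add ours
//     }
//     fclose(f);
//   }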

////////////////////////////////////////////////////////////////////////////////
// global variables
julong os::Linux::_physical_memory = 0;

address os::Linux::_initial_thread_stack_bottom = NULL;
uintptr_t os::Linux::_initial_thread_stack_size = 0;

int (*os::Linux::_pthread_getcpuclockid)(pthread_t, clockid_t *) = NULL;
int (*os::Linux::_pthread_setname_np)(pthread_t, const char*) = NULL;
pthread_t os::Linux::_main_thread;
int os::Linux::_page_size = -1;
bool os::Linux::_supports_fast_thread_cpu_time = false;
const char * os::Linux::_libc_version = NULL;
const char * os::Linux::_libpthread_version = NULL;
size_t os::Linux::_default_large_page_size = 0;

#ifdef __GLIBC__
os::Linux::mallinfo_func_t os::Linux::_mallinfo = NULL;
os::Linux::mallinfo2_func_t os::Linux::_mallinfo2 = NULL;
#endif // __GLIBC__

static jlong initial_time_count = 0;

static int clock_tics_per_sec = 100;

// If the VM might have been created on the primordial thread, we need to resolve the
// primordial thread stack bounds and check if the current thread might be the
// primordial thread in places. If we know that the primordial thread is never used,
// such as when the VM was created by one of the standard java launchers, we can
// avoid this work.
static bool suppress_primordial_thread_resolution = false;

static bool read_so_path_from_maps(const char* so_name, char* buf, int buflen);

// utility functions

julong os::available_memory() {
  return Linux::available_memory();
}

julong os::Linux::available_memory() {
  // values in struct sysinfo are "unsigned long"
  struct sysinfo si;
  julong avail_mem;

  if (OSContainer::is_containerized()) {
    jlong mem_limit, mem_usage;
    if ((mem_limit = OSContainer::memory_limit_in_bytes()) < 1) {
      log_debug(os, container)("container memory limit %s: " JLONG_FORMAT ", using host value",
                               mem_limit == OSCONTAINER_ERROR ? "failed" : "unlimited", mem_limit);
    }
    if (mem_limit > 0 && (mem_usage = OSContainer::memory_usage_in_bytes()) < 1) {
      log_debug(os, container)("container memory usage failed: " JLONG_FORMAT ", using host value", mem_usage);
    }
    if (mem_limit > 0 && mem_usage > 0) {
      avail_mem = mem_limit > mem_usage ? (julong)mem_limit - (julong)mem_usage : 0;
      log_trace(os)("available container memory: " JULONG_FORMAT, avail_mem);
      return avail_mem;
    }
  }

  sysinfo(&si);
  avail_mem = (julong)si.freeram * si.mem_unit;
  log_trace(os)("available memory: " JULONG_FORMAT, avail_mem);
  return avail_mem;
}

julong os::physical_memory() {
  jlong phys_mem = 0;
  if (OSContainer::is_containerized()) {
    jlong mem_limit;
    if ((mem_limit = OSContainer::memory_limit_in_bytes()) > 0) {
      log_trace(os)("total container memory: " JLONG_FORMAT, mem_limit);
      return mem_limit;
    }
    log_debug(os, container)("container memory limit %s: " JLONG_FORMAT ", using host value",
                             mem_limit == OSCONTAINER_ERROR ? "failed" : "unlimited", mem_limit);
  }

  phys_mem = Linux::physical_memory();
  log_trace(os)("total system memory: " JLONG_FORMAT, phys_mem);
  return phys_mem;
}

static uint64_t initial_total_ticks = 0;
static uint64_t initial_steal_ticks = 0;
static bool has_initial_tick_info = false;

static void next_line(FILE *f) {
  int c;
  do {
    c = fgetc(f);
  } while (c != '\n' && c != EOF);
}

bool os::Linux::get_tick_information(CPUPerfTicks* pticks, int which_logical_cpu) {
  FILE* fh;
  uint64_t userTicks, niceTicks, systemTicks, idleTicks;
  // since at least kernel 2.6: iowait: time waiting for I/O to complete
  // irq: time servicing interrupts; softirq: time servicing softirqs
  uint64_t iowTicks = 0, irqTicks = 0, sirqTicks = 0;
  // steal (since kernel 2.6.11): time spent in other OS when running in a virtualized environment
  uint64_t stealTicks = 0;
  // guest (since kernel 2.6.24): time spent running a virtual CPU for guest OS under the
  // control of the Linux kernel
  uint64_t guestNiceTicks = 0;
  int logical_cpu = -1;
  const int required_tickinfo_count = (which_logical_cpu == -1) ? 4 : 5;
  int n;

  memset(pticks, 0, sizeof(CPUPerfTicks));

  if ((fh = fopen("/proc/stat", "r")) == NULL) {
    return false;
  }

  if (which_logical_cpu == -1) {
    n = fscanf(fh, "cpu " UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " "
               UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " "
               UINT64_FORMAT " " UINT64_FORMAT " ",
               &userTicks, &niceTicks, &systemTicks, &idleTicks,
               &iowTicks, &irqTicks, &sirqTicks,
               &stealTicks, &guestNiceTicks);
  } else {
    // Move to next line
    next_line(fh);

    // Find the line for the requested cpu. (Would it be faster to just iterate over linefeeds?)
    for (int i = 0; i < which_logical_cpu; i++) {
      next_line(fh);
    }

    n = fscanf(fh, "cpu%u " UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " "
               UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " "
               UINT64_FORMAT " " UINT64_FORMAT " ",
               &logical_cpu, &userTicks, &niceTicks,
               &systemTicks, &idleTicks, &iowTicks, &irqTicks, &sirqTicks,
               &stealTicks, &guestNiceTicks);
  }

  fclose(fh);
  if (n < required_tickinfo_count || logical_cpu != which_logical_cpu) {
    return false;
  }
  pticks->used       = userTicks + niceTicks;
  pticks->usedKernel = systemTicks + irqTicks + sirqTicks;
  pticks->total      = userTicks + niceTicks + systemTicks + idleTicks +
                       iowTicks + irqTicks + sirqTicks + stealTicks + guestNiceTicks;

  if (n > required_tickinfo_count + 3) {
    pticks->steal = stealTicks;
    pticks->has_steal_ticks = true;
  } else {
    pticks->steal = 0;
    pticks->has_steal_ticks = false;
  }

  return true;
}
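
// A hedged usage sketch: combining two CPUPerfTicks samples, taken some
// interval apart, into a utilization percentage. The helper below is
// illustrative only and does not appear elsewhere in this file.
//
//   static double cpu_utilization(const CPUPerfTicks& prev,
//                                 const CPUPerfTicks& cur) {
//     uint64_t total_delta = cur.total - prev.total;
//     uint64_t used_delta  = (cur.used + cur.usedKernel) -
//                            (prev.used + prev.usedKernel);
//     return total_delta > 0 ? (100.0 * used_delta) / total_delta : 0.0;
//   }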

// Return true if the process runs with elevated privileges, i.e. the
// effective uid/gid differs from the real uid/gid (for example in a
// setuid binary). Note that this is not the same as running as root.

bool os::have_special_privileges() {
  static bool init = false;
  static bool privileges = false;
  if (!init) {
    privileges = (getuid() != geteuid()) || (getgid() != getegid());
    init = true;
  }
  return privileges;
}


#ifndef SYS_gettid
// i386 & arm: 224, ia64: 1105, amd64: 186, sparc: 143, aarch64: 178
  #ifdef __ia64__
    #define SYS_gettid 1105
  #else
    #if defined(__i386__) || defined(__arm__)
      #define SYS_gettid 224
    #else
      #ifdef __amd64__
        #define SYS_gettid 186
      #else
        #ifdef __sparc__
          #define SYS_gettid 143
        #elif defined(__arm64__) || defined(__aarch64__)
          #define SYS_gettid 178
        #else
          #error define gettid for the arch
        #endif
      #endif
    #endif
  #endif
#endif


// pid_t gettid()
//
// Returns the kernel thread id of the currently running thread. The kernel
// thread id is used to access /proc.
pid_t os::Linux::gettid() {
  int rslt = syscall(SYS_gettid);
  assert(rslt != -1, "must be."); // old linuxthreads implementation?
  return (pid_t)rslt;
}

// Most versions of linux have a bug where the number of processors is
// determined by looking at the /proc file system. In a chroot environment,
// the system call returns 1.
static bool unsafe_chroot_detected = false;
static const char *unstable_chroot_error = "/proc file system not found.\n"
                                           "Java may be unstable running multithreaded in a chroot "
                                           "environment on Linux when /proc filesystem is not mounted.";

void os::Linux::initialize_system_info() {
  set_processor_count(sysconf(_SC_NPROCESSORS_CONF));
  if (processor_count() == 1) {
    pid_t pid = os::Linux::gettid();
    char fname[32];
    jio_snprintf(fname, sizeof(fname), "/proc/%d", pid);
    FILE *fp = fopen(fname, "r");
    if (fp == NULL) {
      unsafe_chroot_detected = true;
    } else {
      fclose(fp);
    }
  }
  _physical_memory = (julong)sysconf(_SC_PHYS_PAGES) * (julong)sysconf(_SC_PAGESIZE);
  assert(processor_count() > 0, "linux error");
}

void os::init_system_properties_values() {
  // The next steps are taken in the product version:
  //
  // Obtain the JAVA_HOME value from the location of libjvm.so.
  // This library should be located at:
  // <JAVA_HOME>/lib/{client|server}/libjvm.so.
  //
  // If "/jre/lib/" appears at the right place in the path, then we
  // assume libjvm.so is installed in a JDK and we use this path.
  //
  // Otherwise exit with message: "Could not create the Java virtual machine."
  //
  // The following extra steps are taken in the debugging version:
  //
  // If "/jre/lib/" does NOT appear at the right place in the path
  // instead of exit check for $JAVA_HOME environment variable.
  //
  // If it is defined and we are able to locate $JAVA_HOME/jre/lib/<arch>,
  // then we append a fake suffix "hotspot/libjvm.so" to this path so
  // it looks like libjvm.so is installed there
  // <JAVA_HOME>/jre/lib/<arch>/hotspot/libjvm.so.
  //
  // Otherwise exit.
  //
  // Important note: if the location of libjvm.so changes this
  // code needs to be changed accordingly.

  // See ld(1):
  //   The linker uses the following search paths to locate required
  //   shared libraries:
  //     1: ...
  //     ...
  //     7: The default directories, normally /lib and /usr/lib.
#ifndef OVERRIDE_LIBPATH
  #if defined(AMD64) || (defined(_LP64) && defined(SPARC)) || defined(PPC64) || defined(S390)
    #define DEFAULT_LIBPATH "/usr/lib64:/lib64:/lib:/usr/lib"
  #else
    #define DEFAULT_LIBPATH "/lib:/usr/lib"
  #endif
#else
  #define DEFAULT_LIBPATH OVERRIDE_LIBPATH
#endif

// Base path of extensions installed on the system.
#define SYS_EXT_DIR "/usr/java/packages"
#define EXTENSIONS_DIR "/lib/ext"

  // Buffer that fits several sprintfs.
  // Note that the space for the colon and the trailing null are provided
  // by the nulls included by the sizeof operator.
  const size_t bufsize =
    MAX2((size_t)MAXPATHLEN,  // For dll_dir & friends.
         (size_t)MAXPATHLEN + sizeof(EXTENSIONS_DIR) + sizeof(SYS_EXT_DIR) + sizeof(EXTENSIONS_DIR)); // extensions dir
  char *buf = NEW_C_HEAP_ARRAY(char, bufsize, mtInternal);

  // sysclasspath, java_home, dll_dir
  {
    char *pslash;
    os::jvm_path(buf, bufsize);

    // Found the full path to libjvm.so.
    // Now cut the path to <java_home>/jre if we can.
    pslash = strrchr(buf, '/');
    if (pslash != NULL) {
      *pslash = '\0';            // Get rid of /libjvm.so.
    }
    pslash = strrchr(buf, '/');
    if (pslash != NULL) {
      *pslash = '\0';            // Get rid of /{client|server|hotspot}.
    }
    Arguments::set_dll_dir(buf);

    if (pslash != NULL) {
      pslash = strrchr(buf, '/');
      if (pslash != NULL) {
        *pslash = '\0';          // Get rid of /lib.
      }
    }
    Arguments::set_java_home(buf);
    if (!set_boot_path('/', ':')) {
      vm_exit_during_initialization("Failed setting boot class path.", NULL);
    }
  }
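
  // An illustrative walk-through (assuming a typical JDK layout): if
  // os::jvm_path() filled buf with "/opt/jdk/lib/server/libjvm.so", the
  // first strrchr() cut above leaves "/opt/jdk/lib/server", the second
  // leaves dll_dir = "/opt/jdk/lib", and the third yields
  // java_home = "/opt/jdk".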

  // Where to look for native libraries.
  //
  // Note: Due to a legacy implementation, most of the library path
  // is set in the launcher. This was to accommodate linking restrictions
  // on legacy Linux implementations (which are no longer supported).
  // Eventually, all the library path setting will be done here.
  //
  // However, to prevent the proliferation of improperly built native
  // libraries, the new path component /usr/java/packages is added here.
  {
    // Get the user setting of LD_LIBRARY_PATH, and prepend it. It
    // should always exist (until the legacy problem cited above is
    // addressed).
    const char *v = ::getenv("LD_LIBRARY_PATH");
    const char *v_colon = ":";
    if (v == NULL) { v = ""; v_colon = ""; }
    // That's +1 for the colon and +1 for the trailing '\0'.
    char *ld_library_path = NEW_C_HEAP_ARRAY(char,
                                             strlen(v) + 1 +
                                             sizeof(SYS_EXT_DIR) + sizeof("/lib/") + sizeof(DEFAULT_LIBPATH) + 1,
                                             mtInternal);
    sprintf(ld_library_path, "%s%s" SYS_EXT_DIR "/lib:" DEFAULT_LIBPATH, v, v_colon);
    Arguments::set_library_path(ld_library_path);
    FREE_C_HEAP_ARRAY(char, ld_library_path);
  }

  // Extensions directories.
  sprintf(buf, "%s" EXTENSIONS_DIR ":" SYS_EXT_DIR EXTENSIONS_DIR, Arguments::get_java_home());
  Arguments::set_ext_dirs(buf);

  FREE_C_HEAP_ARRAY(char, buf);

#undef DEFAULT_LIBPATH
#undef SYS_EXT_DIR
#undef EXTENSIONS_DIR
}

////////////////////////////////////////////////////////////////////////////////
// breakpoint support

void os::breakpoint() {
  BREAKPOINT;
}

extern "C" void breakpoint() {
  // use debugger to set breakpoint here
}

//////////////////////////////////////////////////////////////////////////////
// detecting pthread library

void os::Linux::libpthread_init() {
#ifndef __ANDROID__
  // Save glibc and pthread version strings.
#if !defined(_CS_GNU_LIBC_VERSION) || \
    !defined(_CS_GNU_LIBPTHREAD_VERSION)
#error "glibc too old (< 2.3.2)"
#endif

#ifdef MUSL_LIBC
  // confstr() from musl libc returns EINVAL for
  // _CS_GNU_LIBC_VERSION and _CS_GNU_LIBPTHREAD_VERSION
  os::Linux::set_libc_version("musl - unknown");
  os::Linux::set_libpthread_version("musl - unknown");
#else
  size_t n = confstr(_CS_GNU_LIBC_VERSION, NULL, 0);
  assert(n > 0, "cannot retrieve glibc version");
  char *str = (char *)malloc(n, mtInternal);
  confstr(_CS_GNU_LIBC_VERSION, str, n);
  os::Linux::set_libc_version(str);

  n = confstr(_CS_GNU_LIBPTHREAD_VERSION, NULL, 0);
  assert(n > 0, "cannot retrieve pthread version");
  str = (char *)malloc(n, mtInternal);
  confstr(_CS_GNU_LIBPTHREAD_VERSION, str, n);
  os::Linux::set_libpthread_version(str);
#endif
#else
  os::Linux::set_libpthread_version("NPTL");
#endif
}

/////////////////////////////////////////////////////////////////////////////
// thread stack expansion

// os::Linux::manually_expand_stack() takes care of expanding the thread
// stack. Note that this is normally not needed: pthread stacks allocate
// thread stack using mmap() without MAP_NORESERVE, so the stack is already
// committed. Therefore it is not necessary to expand the stack manually.
//
// Manually expanding the stack was historically needed on LinuxThreads
// thread stacks, which were allocated with mmap(MAP_GROWSDOWN). Nowadays
// it is kept to deal with very rare corner cases:
//
// For one, the user may run the VM on their own implementation of threads
// whose stacks are - like the old LinuxThreads - implemented using
// mmap(MAP_GROWSDOWN).
//
// Also, this coding may be needed if the VM is running on the primordial
// thread. Normally we avoid running on the primordial thread; however,
// the user may still invoke the VM on the primordial thread.
//
// The following historical comment describes the details about running
// on a thread stack allocated with mmap(MAP_GROWSDOWN):


// Force Linux kernel to expand current thread stack. If "bottom" is close
// to the stack guard, caller should block all signals.
//
// MAP_GROWSDOWN:
//   A special mmap() flag that is used to implement thread stacks. It tells
//   the kernel that the memory region should extend downwards when needed. This
//   allows early versions of LinuxThreads to only mmap the first few pages
//   when creating a new thread. The Linux kernel will automatically expand thread
//   stack as needed (on page faults).
//
//   However, because the memory region of a MAP_GROWSDOWN stack can grow on
//   demand, if a page fault happens outside an already mapped MAP_GROWSDOWN
//   region, it's hard to tell if the fault is due to a legitimate stack
//   access or because of reading/writing non-existent memory (e.g. buffer
//   overrun). As a rule, if the fault happens below the current stack pointer,
//   the Linux kernel does not expand the stack; instead a SIGSEGV is sent to the
//   application (see Linux kernel fault.c).
//
//   This Linux feature can cause SIGSEGV when the VM bangs the thread stack for
//   stack overflow detection.
//
//   Newer versions of LinuxThreads (since glibc-2.2, or, RH-7.x) and NPTL do
//   not use MAP_GROWSDOWN.
//
// To get around the problem and allow stack banging on Linux, we need to
// manually expand the thread stack after receiving the SIGSEGV.
//
// There are two ways to expand the thread stack to address "bottom"; we used
// both of them in the JVM before 1.5:
//   1. adjust the stack pointer first so that it is below "bottom", and then
//      touch "bottom"
//   2. mmap() the page in question
//
// Now that the alternate signal stack is gone, it's harder to use 2. For instance,
// if current sp is already near the lower end of page 101, and we need to
// call mmap() to map page 100, it is possible that part of the mmap() frame
// will be placed in page 100. When page 100 is mapped, it is zero-filled.
// That will destroy the mmap() frame and cause the VM to crash.
//
// The following code works by adjusting sp first, then accessing the "bottom"
// page to force a page fault. The Linux kernel will then automatically expand the
// stack mapping.
//
// _expand_stack_to() assumes its frame size is less than page size, which
// should always be true if the function is not inlined.

static void NOINLINE _expand_stack_to(address bottom) {
  address sp;
  size_t size;
  volatile char *p;

  // Adjust bottom to point to the largest address within the same page; this
  // gives us a one-page buffer if alloca() allocates slightly more memory.
  bottom = (address)align_down((uintptr_t)bottom, os::Linux::page_size());
  bottom += os::Linux::page_size() - 1;

  // sp might be slightly above current stack pointer; if that's the case, we
  // will alloca() a little more space than necessary, which is OK. Don't use
  // os::current_stack_pointer(), as its result can be slightly below current
  // stack pointer, causing us to not alloca enough to reach "bottom".
  sp = (address)&sp;

  if (sp > bottom) {
    size = sp - bottom;
    p = (volatile char *)alloca(size);
    assert(p != NULL && p <= (volatile char *)bottom, "alloca problem?");
    p[0] = '\0';
  }
}

void os::Linux::expand_stack_to(address bottom) {
  _expand_stack_to(bottom);
}

bool os::Linux::manually_expand_stack(JavaThread * t, address addr) {
  assert(t != NULL, "just checking");
  assert(t->osthread()->expanding_stack(), "expand should be set");

  if (t->is_in_usable_stack(addr)) {
    sigset_t mask_all, old_sigset;
    sigfillset(&mask_all);
    pthread_sigmask(SIG_SETMASK, &mask_all, &old_sigset);
    _expand_stack_to(addr);
    pthread_sigmask(SIG_SETMASK, &old_sigset, NULL);
    return true;
  }
  return false;
}

//////////////////////////////////////////////////////////////////////////////
// create new thread

// Thread start routine for all newly created threads
static void *thread_native_entry(Thread *thread) {

  thread->record_stack_base_and_size();

#ifndef __GLIBC__
  // Try to randomize the cache line index of hot stack frames.
  // This helps when threads of the same stack traces evict each other's
  // cache lines. The threads can be either from the same JVM instance, or
  // from different JVM instances. The benefit is especially true for
  // processors with hyperthreading technology.
  // This code is not needed anymore in glibc because it has MULTI_PAGE_ALIASING
  // and we did not see any degradation in performance without `alloca()`.
  static int counter = 0;
  int pid = os::current_process_id();
  void *stackmem = alloca(((pid ^ counter++) & 7) * 128);
  // Ensure the alloca result is used in a way that prevents the compiler from eliding it.
  *(char *)stackmem = 1;
#endif

  thread->initialize_thread_current();

  OSThread* osthread = thread->osthread();
  Monitor* sync = osthread->startThread_lock();

  osthread->set_thread_id(os::current_thread_id());

  log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
                       os::current_thread_id(), (uintx) pthread_self());

  if (UseNUMA) {
    int lgrp_id = os::numa_get_group_id();
    if (lgrp_id != -1) {
      thread->set_lgrp_id(lgrp_id);
    }
  }
  // initialize signal mask for this thread
  PosixSignals::hotspot_sigmask(thread);

  // initialize floating point control register
  os::Linux::init_thread_fpu_state();

  // handshaking with parent thread
  {
    MutexLocker ml(sync, Mutex::_no_safepoint_check_flag);

    // notify parent thread
    osthread->set_state(INITIALIZED);
    sync->notify_all();

    // wait until os::start_thread()
    while (osthread->get_state() == INITIALIZED) {
      sync->wait_without_safepoint_check();
    }
  }

  assert(osthread->pthread_id() != 0, "pthread_id was not set as expected");

  // call one more level start routine
  thread->call_run();

  // Note: at this point the thread object may already have deleted itself.
  // Prevent dereferencing it from here on out.
  thread = NULL;

  log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
                       os::current_thread_id(), (uintx) pthread_self());

  return 0;
}

// On Linux, glibc places static TLS blocks (for __thread variables) on
// the thread stack. This decreases the stack size actually available
// to threads.
//
// For large static TLS sizes, this may cause threads to malfunction due
// to insufficient stack space. This is a well-known issue in glibc:
// http://sourceware.org/bugzilla/show_bug.cgi?id=11787.
//
// As a workaround, we call a private but assumed-stable glibc function,
// __pthread_get_minstack(), to obtain the minstack size and derive the
// static TLS size from it. We then increase the user requested stack
// size by this TLS size.
//
// Due to compatibility concerns, this size adjustment is opt-in and
// controlled via AdjustStackSizeForTLS.
typedef size_t (*GetMinStack)(const pthread_attr_t *attr);

GetMinStack _get_minstack_func = NULL;

static void get_minstack_init() {
  _get_minstack_func =
        (GetMinStack)dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
  log_info(os, thread)("Lookup of __pthread_get_minstack %s",
                       _get_minstack_func == NULL ? "failed" : "succeeded");
}

// Returns the size of the static TLS area glibc puts on thread stacks.
// The value is cached on first use, which occurs when the first thread
// is created during VM initialization.
static size_t get_static_tls_area_size(const pthread_attr_t *attr) {
  size_t tls_size = 0;
  if (_get_minstack_func != NULL) {
    // Obtain the pthread minstack size by calling __pthread_get_minstack.
    size_t minstack_size = _get_minstack_func(attr);

    // Remove non-TLS area size included in minstack size returned
    // by __pthread_get_minstack() to get the static TLS size.
    // In glibc before 2.27, minstack size includes guard_size.
    // In glibc 2.27 and later, guard_size is automatically added
    // to the stack size by pthread_create and is no longer included
    // in minstack size. In both cases, the guard_size is taken into
    // account, so there is no need to adjust the result for that.
    //
    // Although __pthread_get_minstack() is a private glibc function,
    // it is expected to have a stable behavior across future glibc
    // versions while glibc still allocates the static TLS blocks off
    // the stack. Following is glibc 2.28 __pthread_get_minstack():
    //
    // size_t
    // __pthread_get_minstack (const pthread_attr_t *attr)
    // {
    //   return GLRO(dl_pagesize) + __static_tls_size + PTHREAD_STACK_MIN;
    // }
    //
    // The following 'minstack_size > os::vm_page_size() + PTHREAD_STACK_MIN'
    // check is done as a precaution.
    if (minstack_size > (size_t)os::vm_page_size() + PTHREAD_STACK_MIN) {
      tls_size = minstack_size - os::vm_page_size() - PTHREAD_STACK_MIN;
    }
  }

  log_info(os, thread)("Stack size adjustment for TLS is " SIZE_FORMAT,
                       tls_size);
  return tls_size;
}

bool os::create_thread(Thread* thread, ThreadType thr_type,
                       size_t req_stack_size) {
  assert(thread->osthread() == NULL, "caller responsible");

  // Allocate the OSThread object
  OSThread* osthread = new OSThread(NULL, NULL);
  if (osthread == NULL) {
    return false;
  }

  // set the correct thread state
  osthread->set_thread_type(thr_type);

  // Initial state is ALLOCATED but not INITIALIZED
  osthread->set_state(ALLOCATED);

  thread->set_osthread(osthread);

  // init thread attributes
  pthread_attr_t attr;
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

  // Calculate stack size if it's not specified by caller.
  size_t stack_size = os::Posix::get_initial_stack_size(thr_type, req_stack_size);
  // In glibc versions prior to 2.7 the guard size mechanism
  // is not implemented properly. The POSIX standard requires adding
  // the size of the guard pages to the stack size; instead Linux
  // takes the space out of 'stacksize'. Thus we adapt the requested
  // stack_size by the size of the guard pages to mimic proper
  // behaviour. However, be careful not to end up with a size
  // of zero due to overflow. Don't add the guard page in that case.
  size_t guard_size = os::Linux::default_guard_size(thr_type);
  // Configure glibc guard page. Must happen before calling
  // get_static_tls_area_size(), which uses the guard_size.
  pthread_attr_setguardsize(&attr, guard_size);

  size_t stack_adjust_size = 0;
  if (AdjustStackSizeForTLS) {
    // Adjust the stack_size for on-stack TLS - see get_static_tls_area_size().
    stack_adjust_size += get_static_tls_area_size(&attr);
  } else {
    stack_adjust_size += guard_size;
  }

  stack_adjust_size = align_up(stack_adjust_size, os::vm_page_size());
  if (stack_size <= SIZE_MAX - stack_adjust_size) {
    stack_size += stack_adjust_size;
  }
  assert(is_aligned(stack_size, os::vm_page_size()), "stack_size not aligned");

  int status = pthread_attr_setstacksize(&attr, stack_size);
  if (status != 0) {
    // pthread_attr_setstacksize() function can fail
    // if the stack size exceeds a system-imposed limit.
    assert_status(status == EINVAL, status, "pthread_attr_setstacksize");
    log_warning(os, thread)("The %sthread stack size specified is invalid: " SIZE_FORMAT "k",
                            (thr_type == compiler_thread) ? "compiler " : ((thr_type == java_thread) ? "" : "VM "),
                            stack_size / K);
    thread->set_osthread(NULL);
    delete osthread;
    return false;
  }

  ThreadState state;

  {
    pthread_t tid;
    int ret = pthread_create(&tid, &attr, (void* (*)(void*)) thread_native_entry, thread);

    char buf[64];
    if (ret == 0) {
      log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
                           (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
    } else {
      log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
                              os::errno_name(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
      // Log some OS information which might explain why creating the thread failed.
      log_info(os, thread)("Number of threads approx. running in the VM: %d", Threads::number_of_threads());
      LogStream st(Log(os, thread)::info());
      os::Posix::print_rlimit_info(&st);
      os::print_memory_info(&st);
      os::Linux::print_proc_sys_info(&st);
      os::Linux::print_container_info(&st);
    }

    pthread_attr_destroy(&attr);

    if (ret != 0) {
      // Need to clean up stuff we've allocated so far
      thread->set_osthread(NULL);
      delete osthread;
      return false;
    }

    // Store pthread info into the OSThread
    osthread->set_pthread_id(tid);

    // Wait until child thread is either initialized or aborted
    {
      Monitor* sync_with_child = osthread->startThread_lock();
      MutexLocker ml(sync_with_child, Mutex::_no_safepoint_check_flag);
      while ((state = osthread->get_state()) == ALLOCATED) {
        sync_with_child->wait_without_safepoint_check();
      }
    }
  }

  // The thread is returned suspended (in state INITIALIZED),
  // and is started higher up in the call chain
  assert(state == INITIALIZED, "race condition");
  return true;
}

/////////////////////////////////////////////////////////////////////////////
// attach existing thread

// bootstrap the main thread
bool os::create_main_thread(JavaThread* thread) {
  assert(os::Linux::_main_thread == pthread_self(), "should be called inside main thread");
  return create_attached_thread(thread);
}

bool os::create_attached_thread(JavaThread* thread) {
#ifdef ASSERT
  thread->verify_not_published();
#endif

  // Allocate the OSThread object
  OSThread* osthread = new OSThread(NULL, NULL);

  if (osthread == NULL) {
    return false;
  }

  // Store pthread info into the OSThread
  osthread->set_thread_id(os::Linux::gettid());
  osthread->set_pthread_id(::pthread_self());

  // initialize floating point control register
  os::Linux::init_thread_fpu_state();

  // Initial thread state is RUNNABLE
  osthread->set_state(RUNNABLE);

  thread->set_osthread(osthread);

  if (UseNUMA) {
    int lgrp_id = os::numa_get_group_id();
    if (lgrp_id != -1) {
      thread->set_lgrp_id(lgrp_id);
    }
  }

  if (os::is_primordial_thread()) {
    // If current thread is primordial thread, its stack is mapped on demand,
    // see notes about MAP_GROWSDOWN. Here we try to force kernel to map
    // the entire stack region to avoid SEGV in stack banging.
    // It is also useful to get around the heap-stack-gap problem on SuSE
    // kernel (see 4821821 for details). We first expand stack to the top
    // of yellow zone, then enable stack yellow zone (order is significant,
    // enabling yellow zone first will crash JVM on SuSE Linux), so there
    // is no gap between the last two virtual memory regions.

    StackOverflow* overflow_state = thread->stack_overflow_state();
    address addr = overflow_state->stack_reserved_zone_base();
    assert(addr != NULL, "initialization problem?");
    assert(overflow_state->stack_available(addr) > 0, "stack guard should not be enabled");

    osthread->set_expanding_stack();
    os::Linux::manually_expand_stack(thread, addr);
    osthread->clear_expanding_stack();
  }

  // initialize signal mask for this thread
  // and save the caller's signal mask
  PosixSignals::hotspot_sigmask(thread);

  log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
                       os::current_thread_id(), (uintx) pthread_self());

  return true;
}

void os::pd_start_thread(Thread* thread) {
  OSThread * osthread = thread->osthread();
  assert(osthread->get_state() != INITIALIZED, "just checking");
  Monitor* sync_with_child = osthread->startThread_lock();
  MutexLocker ml(sync_with_child, Mutex::_no_safepoint_check_flag);
  sync_with_child->notify();
}

// Free Linux resources related to the OSThread
void os::free_thread(OSThread* osthread) {
  assert(osthread != NULL, "osthread not set");

  // We are told to free resources of the argument thread,
  // but we can only really operate on the current thread.
  assert(Thread::current()->osthread() == osthread,
         "os::free_thread but not current thread");

#ifdef ASSERT
  sigset_t current;
  sigemptyset(&current);
  pthread_sigmask(SIG_SETMASK, NULL, &current);
  assert(!sigismember(&current, PosixSignals::SR_signum), "SR signal should not be blocked!");
#endif

  // Restore caller's signal mask
  sigset_t sigmask = osthread->caller_sigmask();
  pthread_sigmask(SIG_SETMASK, &sigmask, NULL);

  delete osthread;
}

//////////////////////////////////////////////////////////////////////////////
// primordial thread

// Check if current thread is the primordial thread, similar to Solaris thr_main.
bool os::is_primordial_thread(void) {
  if (suppress_primordial_thread_resolution) {
    return false;
  }
  char dummy;
  // If called before init complete, thread stack bottom will be null.
  // Can be called if fatal error occurs before initialization.
  if (os::Linux::initial_thread_stack_bottom() == NULL) return false;
  assert(os::Linux::initial_thread_stack_bottom() != NULL &&
         os::Linux::initial_thread_stack_size() != 0,
         "os::init did not locate primordial thread's stack region");
  if ((address)&dummy >= os::Linux::initial_thread_stack_bottom() &&
      (address)&dummy < os::Linux::initial_thread_stack_bottom() +
                        os::Linux::initial_thread_stack_size()) {
    return true;
  } else {
    return false;
  }
}

// Find the virtual memory area that contains addr
static bool find_vma(address addr, address* vma_low, address* vma_high) {
  FILE *fp = fopen("/proc/self/maps", "r");
  if (fp) {
    address low, high;
    while (!feof(fp)) {
      if (fscanf(fp, "%p-%p", &low, &high) == 2) {
        if (low <= addr && addr < high) {
          if (vma_low)  *vma_low  = low;
          if (vma_high) *vma_high = high;
          fclose(fp);
          return true;
        }
      }
      for (;;) {
        int ch = fgetc(fp);
        if (ch == EOF || ch == (int)'\n') break;
      }
    }
    fclose(fp);
  }
  return false;
}

// Locate primordial thread stack. This special handling of primordial thread stack
// is needed because pthread_getattr_np() on most (all?) Linux distros returns
// a bogus value for the primordial process thread. While the launcher has created
// the VM in a new thread since JDK 6, we still have to allow for the use of the
// JNI invocation API from a primordial thread.
void os::Linux::capture_initial_stack(size_t max_size) {

  // max_size is either 0 (which means accept OS default for thread stacks) or
  // a user-specified value known to be at least the minimum needed. If we
  // are actually on the primordial thread we can make it appear that we have a
  // smaller max_size stack by inserting the guard pages at that location. But we
  // cannot do anything to emulate a larger stack than what has been provided by
  // the OS or threading library. In fact if we try to use a stack greater than
  // what is set by rlimit then we will crash the hosting process.

  // Maximum stack size is the easy part, get it from RLIMIT_STACK.
  // If this is "unlimited" then it will be a huge value.
  struct rlimit rlim;
  getrlimit(RLIMIT_STACK, &rlim);
  size_t stack_size = rlim.rlim_cur;

  // 6308388: a bug in ld.so will relocate its own .data section to the
  //   lower end of primordial stack; reduce ulimit -s value a little bit
  //   so we won't install guard page on ld.so's data section.
  //   But ensure we don't underflow the stack size - allow 1 page spare
  if (stack_size >= (size_t)(3 * page_size())) {
    stack_size -= 2 * page_size();
  }

  // Try to figure out where the stack base (top) is. This is harder.
  //
  // When an application is started, glibc saves the initial stack pointer in
  // a global variable "__libc_stack_end", which is then used by system
  // libraries. __libc_stack_end should be pretty close to stack top. The
  // variable is available since the very early days. However, because it is
  // a private interface, it could disappear in the future.
  //
  // The Linux kernel saves start_stack information in /proc/<pid>/stat. Similar
  // to __libc_stack_end, it is very close to stack top, but isn't the real
  // stack top. Note that /proc may not exist if the VM is running as a chroot
  // program, so reading /proc/<pid>/stat could fail. Also the contents of
  // /proc/<pid>/stat could change in the future (though unlikely).
  //
  // We try __libc_stack_end first. If that doesn't work, look for
  // /proc/<pid>/stat. If neither of them works, we use the current stack pointer
  // as a hint, which should work well in most cases.

  uintptr_t stack_start;

  // try __libc_stack_end first
  uintptr_t *p = (uintptr_t *)dlsym(RTLD_DEFAULT, "__libc_stack_end");
  if (p && *p) {
    stack_start = *p;
  } else {
    // see if we can get the start_stack field from /proc/self/stat
    FILE *fp;
    int pid;
    char state;
    int ppid;
    int pgrp;
    int session;
    int nr;
    int tpgrp;
    unsigned long flags;
    unsigned long minflt;
    unsigned long cminflt;
    unsigned long majflt;
    unsigned long cmajflt;
    unsigned long utime;
    unsigned long stime;
    long cutime;
    long cstime;
    long prio;
    long nice;
    long junk;
    long it_real;
    uintptr_t start;
    uintptr_t vsize;
    intptr_t rss;
    uintptr_t rsslim;
    uintptr_t scodes;
    uintptr_t ecode;
    int i;

    // Figure out what the primordial thread stack base is. Code is inspired
    // by email from Hans Boehm. /proc/self/stat begins with current pid,
    // followed by command name surrounded by parentheses, state, etc.
    char stat[2048];
    int statlen;

    fp = fopen("/proc/self/stat", "r");
    if (fp) {
      statlen = fread(stat, 1, 2047, fp);
      stat[statlen] = '\0';
      fclose(fp);

      // Skip pid and the command string. Note that we could be dealing with
      // weird command names, e.g. user could decide to rename java launcher
      // to "java 1.4.2 :)", then the stat file would look like
      //                1234 (java 1.4.2 :)) R ... ...
      // We don't really need to know the command string, just find the last
      // occurrence of ")" and then start parsing from there. See bug 4726580.
      char * s = strrchr(stat, ')');

      i = 0;
      if (s) {
        // Skip blank chars
        do { s++; } while (s && isspace(*s));

#define _UFM UINTX_FORMAT
#define _DFM INTX_FORMAT

        //                                     1   1   1   1   1   1   1   1   1   1   2   2    2    2    2    2    2    2    2
        //              3  4  5  6  7  8   9   0   1   2   3   4   5   6   7   8   9   0   1    2    3    4    5    6    7    8
        i = sscanf(s, "%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld " _UFM _UFM _DFM _UFM _UFM _UFM _UFM,
                   &state,        // 3  %c
                   &ppid,         // 4  %d
                   &pgrp,         // 5  %d
                   &session,      // 6  %d
                   &nr,           // 7  %d
                   &tpgrp,        // 8  %d
                   &flags,        // 9  %lu
                   &minflt,       // 10 %lu
                   &cminflt,      // 11 %lu
                   &majflt,       // 12 %lu
                   &cmajflt,      // 13 %lu
                   &utime,        // 14 %lu
                   &stime,        // 15 %lu
                   &cutime,       // 16 %ld
                   &cstime,       // 17 %ld
                   &prio,         // 18 %ld
                   &nice,         // 19 %ld
                   &junk,         // 20 %ld
                   &it_real,      // 21 %ld
                   &start,        // 22 UINTX_FORMAT
                   &vsize,        // 23 UINTX_FORMAT
                   &rss,          // 24 INTX_FORMAT
                   &rsslim,       // 25 UINTX_FORMAT
                   &scodes,       // 26 UINTX_FORMAT
                   &ecode,        // 27 UINTX_FORMAT
                   &stack_start); // 28 UINTX_FORMAT
      }

#undef _UFM
#undef _DFM

      if (i != 28 - 2) {
        assert(false, "Bad conversion from /proc/self/stat");
        // product mode - assume we are the primordial thread, good luck in the
        // embedded case.
        warning("Can't detect primordial thread stack location - bad conversion");
        stack_start = (uintptr_t) &rlim;
      }
    } else {
      // For some reason we can't open /proc/self/stat (for example, running on
      // FreeBSD with a Linux emulator, or inside chroot); this should work for
      // most cases, so don't abort:
      warning("Can't detect primordial thread stack location - no /proc/self/stat");
      stack_start = (uintptr_t) &rlim;
    }
  }

  // Now we have a pointer (stack_start) very close to the stack top; the
  // next thing to do is to figure out the exact location of stack top. We
  // can find out the virtual memory area that contains stack_start by
  // reading /proc/self/maps; it should be the last vma in /proc/self/maps,
  // and its upper limit is the real stack top. (again, this would fail if
  // running inside chroot, because /proc may not exist.)

  uintptr_t stack_top;
  address low, high;
  if (find_vma((address)stack_start, &low, &high)) {
    // success, "high" is the true stack top. (ignore "low", because initial
    // thread stack grows on demand, its real bottom is high - RLIMIT_STACK.)
    stack_top = (uintptr_t)high;
  } else {
    // failed, likely because /proc/self/maps does not exist
    warning("Can't detect primordial thread stack location - find_vma failed");
    // best effort: stack_start is normally within a few pages below the real
    // stack top, use it as stack top, and reduce stack size so we won't put
    // guard page outside stack.
    stack_top = stack_start;
    stack_size -= 16 * page_size();
  }

  // stack_top could be partially down the page so align it
  stack_top = align_up(stack_top, page_size());

  // Allowed stack value is minimum of max_size and what we derived from rlimit
  if (max_size > 0) {
    _initial_thread_stack_size = MIN2(max_size, stack_size);
  } else {
    // Accept the rlimit max, but if stack is unlimited then it will be huge, so
    // clamp it at 8MB as we do on Solaris
    _initial_thread_stack_size = MIN2(stack_size, 8*M);
  }
  _initial_thread_stack_size = align_down(_initial_thread_stack_size, page_size());
  _initial_thread_stack_bottom = (address)stack_top - _initial_thread_stack_size;

  assert(_initial_thread_stack_bottom < (address)stack_top, "overflow!");

  if (log_is_enabled(Info, os, thread)) {
    // See if we seem to be on primordial process thread
    bool primordial = uintptr_t(&rlim) > uintptr_t(_initial_thread_stack_bottom) &&
                      uintptr_t(&rlim) < stack_top;

    log_info(os, thread)("Capturing initial stack in %s thread: req. size: " SIZE_FORMAT "K, actual size: "
                         SIZE_FORMAT "K, top=" INTPTR_FORMAT ", bottom=" INTPTR_FORMAT,
                         primordial ? "primordial" : "user", max_size / K, _initial_thread_stack_size / K,
                         stack_top, intptr_t(_initial_thread_stack_bottom));
  }
}

////////////////////////////////////////////////////////////////////////////////
// time support

// Time since start-up in seconds to a fine granularity.
double os::elapsedTime() {
  return ((double)os::elapsed_counter()) / os::elapsed_frequency(); // nanosecond resolution
}

jlong os::elapsed_counter() {
  return javaTimeNanos() - initial_time_count;
}

jlong os::elapsed_frequency() {
  return NANOSECS_PER_SEC; // nanosecond resolution
}

bool os::supports_vtime() { return true; }

double os::elapsedVTime() {
  struct rusage usage;
  int retval = getrusage(RUSAGE_THREAD, &usage);
  if (retval == 0) {
    return (double) (usage.ru_utime.tv_sec + usage.ru_stime.tv_sec) + (double) (usage.ru_utime.tv_usec + usage.ru_stime.tv_usec) / (1000 * 1000);
  } else {
    // better than nothing, but not much
    return elapsedTime();
  }
}

void os::Linux::fast_thread_clock_init() {
  if (!UseLinuxPosixThreadCPUClocks) {
    return;
  }
  clockid_t clockid;
  struct timespec tp;
  int (*pthread_getcpuclockid_func)(pthread_t, clockid_t *) =
      (int(*)(pthread_t, clockid_t *)) dlsym(RTLD_DEFAULT, "pthread_getcpuclockid");

  // Switch to using fast clocks for thread cpu time if
  // the clock_getres() returns 0 error code.
  // Note that some kernels may support the current thread
  // clock (CLOCK_THREAD_CPUTIME_ID) but not the clocks
  // returned by the pthread_getcpuclockid().
  // If the fast POSIX clocks are supported then the clock_getres()
  // must return at least tp.tv_sec == 0 which means a resolution
  // better than 1 sec. This is an extra check for reliability.

  if (pthread_getcpuclockid_func &&
      pthread_getcpuclockid_func(_main_thread, &clockid) == 0 &&
      clock_getres(clockid, &tp) == 0 && tp.tv_sec == 0) {
    _supports_fast_thread_cpu_time = true;
    _pthread_getcpuclockid = pthread_getcpuclockid_func;
  }
}

// Return the real, user, and system times in seconds from an
// arbitrary fixed point in the past.
bool os::getTimesSecs(double* process_real_time,
                      double* process_user_time,
                      double* process_system_time) {
  struct tms ticks;
  clock_t real_ticks = times(&ticks);

  if (real_ticks == (clock_t) (-1)) {
    return false;
  } else {
    double ticks_per_second = (double) clock_tics_per_sec;
    *process_user_time = ((double) ticks.tms_utime) / ticks_per_second;
    *process_system_time = ((double) ticks.tms_stime) / ticks_per_second;
    *process_real_time = ((double) real_ticks) / ticks_per_second;

    return true;
  }
}


char * os::local_time_string(char *buf, size_t buflen) {
  struct tm t;
  time_t long_time;
  time(&long_time);
  localtime_r(&long_time, &t);
  jio_snprintf(buf, buflen, "%d-%02d-%02d %02d:%02d:%02d",
               t.tm_year + 1900, t.tm_mon + 1, t.tm_mday,
               t.tm_hour, t.tm_min, t.tm_sec);
  return buf;
}

struct tm* os::localtime_pd(const time_t* clock, struct tm* res) {
  return localtime_r(clock, res);
}

// thread_id is kernel thread id (similar to Solaris LWP id)
intx os::current_thread_id() { return os::Linux::gettid(); }
int os::current_process_id() {
  return ::getpid();
}

// DLL functions

const char* os::dll_file_extension() { return ".so"; }

// This must be hard coded because it's the system's temporary
// directory, not the java application's temp directory, a la java.io.tmpdir.
const char* os::get_temp_directory() {
#ifndef __ANDROID__
  return "/tmp";
#else
  return "/data/tmp";
#endif
}

static bool file_exists(const char* filename) {
  struct stat statbuf;
  if (filename == NULL || strlen(filename) == 0) {
    return false;
  }
  return os::stat(filename, &statbuf) == 0;
}

// check if addr is inside libjvm.so
bool os::address_is_in_vm(address addr) {
  static address libjvm_base_addr;
  Dl_info dlinfo;

  if (libjvm_base_addr == NULL) {
    if (dladdr(CAST_FROM_FN_PTR(void *, os::address_is_in_vm), &dlinfo) != 0) {
      libjvm_base_addr = (address)dlinfo.dli_fbase;
    }
    assert(libjvm_base_addr != NULL, "Cannot obtain base address for libjvm");
  }

  if (dladdr((void *)addr, &dlinfo) != 0) {
    if (libjvm_base_addr == (address)dlinfo.dli_fbase) return true;
  }

  return false;
}

bool os::dll_address_to_function_name(address addr, char *buf,
                                      int buflen, int *offset,
                                      bool demangle) {
  // buf is not optional, but offset is optional
  assert(buf != NULL, "sanity check");

  Dl_info dlinfo;

  if (dladdr((void*)addr, &dlinfo) != 0) {
    // see if we have a matching symbol
    if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) {
      if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
        jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
      }
      if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
      return true;
    }
    // no matching symbol so try for just file info
    if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
      if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
                          buf, buflen, offset, dlinfo.dli_fname, demangle)) {
        return true;
      }
    }
  }

  buf[0] = '\0';
  if (offset != NULL) *offset = -1;
  return false;
}

struct _address_to_library_name {
  address addr;          // input : memory address
  size_t  buflen;        // size of fname
  char*   fname;         // output: library name
  address base;          // library base addr
};

static int address_to_library_name_callback(struct dl_phdr_info *info,
                                            size_t size, void *data) {
  int i;
  bool found = false;
  address libbase = NULL;
  struct _address_to_library_name * d = (struct _address_to_library_name *)data;

  // iterate through all loadable segments
  for (i = 0; i < info->dlpi_phnum; i++) {
    address segbase = (address)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
    if (info->dlpi_phdr[i].p_type == PT_LOAD) {
      // base address of a library is the lowest address of its loaded
      // segments.
      if (libbase == NULL || libbase > segbase) {
        libbase = segbase;
      }
      // see if 'addr' is within current segment
      if (segbase <= d->addr &&
          d->addr < segbase + info->dlpi_phdr[i].p_memsz) {
        found = true;
      }
    }
  }

  // dlpi_name is NULL or empty if the ELF file is the executable; return 0
  // so dll_address_to_library_name() can fall through to use dladdr(), which
  // can figure out the executable name from argv[0].
  if (found && info->dlpi_name && info->dlpi_name[0]) {
    d->base = libbase;
    if (d->fname) {
      jio_snprintf(d->fname, d->buflen, "%s", info->dlpi_name);
    }
    return 1;
  }
  return 0;
}

bool os::dll_address_to_library_name(address addr, char* buf,
                                     int buflen, int* offset) {
  // buf is not optional, but offset is optional
  assert(buf != NULL, "sanity check");

  Dl_info dlinfo;
  struct _address_to_library_name data;

  // There is a bug in the old glibc dladdr() implementation: it could resolve
  // to the wrong library name if the .so file has a base address != NULL. Here
  // we iterate through the program headers of all loaded libraries to find
  // out which library 'addr' really belongs to. This workaround can be
  // removed once the minimum requirement for glibc is moved to 2.3.x.
  data.addr = addr;
  data.fname = buf;
  data.buflen = buflen;
  data.base = NULL;
  int rslt = dl_iterate_phdr(address_to_library_name_callback, (void *)&data);

  if (rslt) {
    // buf already contains library name
    if (offset) *offset = addr - data.base;
    return true;
  }
  if (dladdr((void*)addr, &dlinfo) != 0) {
    if (dlinfo.dli_fname != NULL) {
      jio_snprintf(buf, buflen, "%s", dlinfo.dli_fname);
    }
    if (dlinfo.dli_fbase != NULL && offset != NULL) {
      *offset = addr - (address)dlinfo.dli_fbase;
    }
    return true;
  }

  buf[0] = '\0';
  if (offset) *offset = -1;
  return false;
}

static bool read_so_path_from_maps(const char* so_name, char* buf, int buflen) {
  FILE *fp = fopen("/proc/self/maps", "r");
  assert(fp, "Failed to open /proc/self/maps");
  if (!fp) {
    return false;
  }

  char maps_buffer[2048];
  while (fgets(maps_buffer, 2048, fp) != NULL) {
    if (strstr(maps_buffer, so_name) == NULL) {
      continue;
    }

    char *so_path = strchr(maps_buffer, '/');
    if (so_path == NULL) {
      // Matched a line without an absolute path; keep scanning.
      continue;
    }
    so_path[strlen(so_path) - 1] = '\0'; // Cut trailing \n
    jio_snprintf(buf, buflen, "%s", so_path);
    fclose(fp);
    return true;
  }

  fclose(fp);
  return false;
}
1557
1558
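// For reference, a typical /proc/self/maps line looks roughly like this
// (illustrative values, not taken from a real run):
//
//   7f2b4c000000-7f2b4c200000 r-xp 00000000 fd:01 1234567  /system/lib64/libjvm.so
//
// so the first '/' marks the start of the absolute path and the line ends
// with a '\n' that the code above cuts off.
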
// Loads the .dll/.so and, in case of error, checks whether the .dll/.so was
// built for the same architecture as the one HotSpot is running on.


// Remember the stack's state. The Linux dynamic linker will change
// the stack to 'executable' at most once, so we must safepoint only once.
bool os::Linux::_stack_is_executable = false;

// VM operation that loads a library. This is necessary if stack protection
// of the Java stacks can be lost during loading the library. If we
// do not stop the Java threads, they can stack overflow before the stacks
// are protected again.
class VM_LinuxDllLoad: public VM_Operation {
 private:
  const char *_filename;
  char *_ebuf;
  int _ebuflen;
  void *_lib;
 public:
  VM_LinuxDllLoad(const char *fn, char *ebuf, int ebuflen) :
    _filename(fn), _ebuf(ebuf), _ebuflen(ebuflen), _lib(NULL) {}
  VMOp_Type type() const { return VMOp_LinuxDllLoad; }
  void doit() {
    _lib = os::Linux::dll_load_in_vmthread(_filename, _ebuf, _ebuflen);
    os::Linux::_stack_is_executable = true;
  }
  void* loaded_library() { return _lib; }
};

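// VM_LinuxDllLoad is driven from os::dll_load() below: the requesting thread
// calls VMThread::execute(&op), which brings the Java threads to a safepoint
// before doit() performs the actual dlopen(), so no Java thread can overflow
// its stack while the guard pages are temporarily unprotected.
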
void * os::dll_load(const char *filename, char *ebuf, int ebuflen) {
  void * result = NULL;
  bool load_attempted = false;

  log_info(os)("attempting shared library load of %s", filename);

  // Check whether the library to load might change execution rights
  // of the stack. If they are changed, the protection of the stack
  // guard pages will be lost. We need a safepoint to fix this.
  //
  // See Linux man page execstack(8) for more info.
  if (os::uses_stack_guard_pages() && !os::Linux::_stack_is_executable) {
    if (!ElfFile::specifies_noexecstack(filename)) {
      if (!is_init_completed()) {
        os::Linux::_stack_is_executable = true;
        // This is OK - No Java threads have been created yet, and hence no
        // stack guard pages to fix.
        //
        // Dynamic loader will make all stacks executable after
        // this function returns, and will not do that again.
        assert(Threads::number_of_threads() == 0, "no Java threads should exist yet.");
      } else {
        warning("You have loaded library %s which might have disabled stack guard. "
                "The VM will try to fix the stack guard now.\n"
                "It's highly recommended that you fix the library with "
                "'execstack -c <libfile>', or link it with '-z noexecstack'.",
                filename);

        JavaThread *jt = JavaThread::current();
        if (jt->thread_state() != _thread_in_native) {
          // This happens when a compiler thread tries to load a hsdis-<arch>.so file
          // that requires ExecStack. Cannot enter safe point. Let's give up.
          warning("Unable to fix stack guard. Giving up.");
        } else {
          if (!LoadExecStackDllInVMThread) {
            // This is for the case where the DLL has a static
            // constructor function that executes JNI code. We cannot
            // load such DLLs in the VMThread.
            result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
          }

          ThreadInVMfromNative tiv(jt);
          debug_only(VMNativeEntryWrapper vew;)

          VM_LinuxDllLoad op(filename, ebuf, ebuflen);
          VMThread::execute(&op);
          if (LoadExecStackDllInVMThread) {
            result = op.loaded_library();
          }
          load_attempted = true;
        }
      }
    }
  }

  if (!load_attempted) {
    result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
  }

  if (result != NULL) {
    // Successful loading
    return result;
  }

  Elf32_Ehdr elf_head;
  int diag_msg_max_length=ebuflen-strlen(ebuf);
  char* diag_msg_buf=ebuf+strlen(ebuf);

  if (diag_msg_max_length==0) {
    // No more space in ebuf for additional diagnostics message
    return NULL;
  }


  int file_descriptor= ::open(filename, O_RDONLY | O_NONBLOCK);

  if (file_descriptor < 0) {
    // Can't open library, report dlerror() message
    return NULL;
  }

  bool failed_to_read_elf_head=
    (sizeof(elf_head)!=
     (::read(file_descriptor, &elf_head,sizeof(elf_head))));

  ::close(file_descriptor);
  if (failed_to_read_elf_head) {
    // file i/o error - report dlerror() msg
    return NULL;
  }

  if (elf_head.e_ident[EI_DATA] != LITTLE_ENDIAN_ONLY(ELFDATA2LSB) BIG_ENDIAN_ONLY(ELFDATA2MSB)) {
    // handle invalid/out of range endianness values
    if (elf_head.e_ident[EI_DATA] == 0 || elf_head.e_ident[EI_DATA] > 2) {
      return NULL;
    }

#if defined(VM_LITTLE_ENDIAN)
    // VM is LE, shared object BE
    elf_head.e_machine = be16toh(elf_head.e_machine);
#else
    // VM is BE, shared object LE
    elf_head.e_machine = le16toh(elf_head.e_machine);
#endif
  }

  typedef struct {
    Elf32_Half    code;         // Actual value as defined in elf.h
    Elf32_Half    compat_class; // Compatibility of archs at VM's sense
    unsigned char elf_class;    // 32 or 64 bit
    unsigned char endianness;   // MSB or LSB
    char*         name;         // String representation
  } arch_t;

#ifndef EM_AARCH64
  #define EM_AARCH64    183               /* ARM AARCH64 */
#endif
#ifndef EM_RISCV
  #define EM_RISCV      243               /* RISC-V */
#endif

  static const arch_t arch_array[]={
    {EM_386,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
    {EM_486,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
    {EM_IA_64,       EM_IA_64,   ELFCLASS64, ELFDATA2LSB, (char*)"IA 64"},
    {EM_X86_64,      EM_X86_64,  ELFCLASS64, ELFDATA2LSB, (char*)"AMD 64"},
    {EM_SPARC,       EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
    {EM_SPARC32PLUS, EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
    {EM_SPARCV9,     EM_SPARCV9, ELFCLASS64, ELFDATA2MSB, (char*)"Sparc v9 64"},
    {EM_PPC,         EM_PPC,     ELFCLASS32, ELFDATA2MSB, (char*)"Power PC 32"},
#if defined(VM_LITTLE_ENDIAN)
    {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2LSB, (char*)"Power PC 64 LE"},
    {EM_SH,          EM_SH,      ELFCLASS32, ELFDATA2LSB, (char*)"SuperH"},
#else
    {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2MSB, (char*)"Power PC 64"},
    {EM_SH,          EM_SH,      ELFCLASS32, ELFDATA2MSB, (char*)"SuperH BE"},
#endif
    {EM_ARM,         EM_ARM,     ELFCLASS32, ELFDATA2LSB, (char*)"ARM"},
    // we only support 64 bit z architecture
    {EM_S390,        EM_S390,    ELFCLASS64, ELFDATA2MSB, (char*)"IBM System/390"},
    {EM_ALPHA,       EM_ALPHA,   ELFCLASS64, ELFDATA2LSB, (char*)"Alpha"},
    {EM_MIPS_RS3_LE, EM_MIPS_RS3_LE, ELFCLASS32, ELFDATA2LSB, (char*)"MIPSel"},
    {EM_MIPS,        EM_MIPS,    ELFCLASS32, ELFDATA2MSB, (char*)"MIPS"},
    {EM_PARISC,      EM_PARISC,  ELFCLASS32, ELFDATA2MSB, (char*)"PARISC"},
    {EM_68K,         EM_68K,     ELFCLASS32, ELFDATA2MSB, (char*)"M68k"},
    {EM_AARCH64,     EM_AARCH64, ELFCLASS64, ELFDATA2LSB, (char*)"AARCH64"},
    {EM_RISCV,       EM_RISCV,   ELFCLASS64, ELFDATA2LSB, (char*)"RISC-V"},
  };

#if (defined IA32)
  static Elf32_Half running_arch_code=EM_386;
#elif (defined AMD64) || (defined X32)
  static Elf32_Half running_arch_code=EM_X86_64;
#elif (defined IA64)
  static Elf32_Half running_arch_code=EM_IA_64;
#elif (defined __sparc) && (defined _LP64)
  static Elf32_Half running_arch_code=EM_SPARCV9;
#elif (defined __sparc) && (!defined _LP64)
  static Elf32_Half running_arch_code=EM_SPARC;
#elif (defined __powerpc64__)
  static Elf32_Half running_arch_code=EM_PPC64;
#elif (defined __powerpc__)
  static Elf32_Half running_arch_code=EM_PPC;
#elif (defined AARCH64)
  static Elf32_Half running_arch_code=EM_AARCH64;
#elif (defined ARM)
  static Elf32_Half running_arch_code=EM_ARM;
#elif (defined S390)
  static Elf32_Half running_arch_code=EM_S390;
#elif (defined ALPHA)
  static Elf32_Half running_arch_code=EM_ALPHA;
#elif (defined MIPSEL)
  static Elf32_Half running_arch_code=EM_MIPS_RS3_LE;
#elif (defined PARISC)
  static Elf32_Half running_arch_code=EM_PARISC;
#elif (defined MIPS)
  static Elf32_Half running_arch_code=EM_MIPS;
#elif (defined M68K)
  static Elf32_Half running_arch_code=EM_68K;
#elif (defined SH)
  static Elf32_Half running_arch_code=EM_SH;
#elif (defined RISCV)
  static Elf32_Half running_arch_code=EM_RISCV;
#else
  #error Method os::dll_load requires that one of following is defined:\
      AARCH64, ALPHA, ARM, AMD64, IA32, IA64, M68K, MIPS, MIPSEL, PARISC, __powerpc__, __powerpc64__, RISCV, S390, SH, __sparc
#endif

  // Identify compatibility class for VM's architecture and library's architecture
  // Obtain string descriptions for architectures

  arch_t lib_arch={elf_head.e_machine,0,elf_head.e_ident[EI_CLASS], elf_head.e_ident[EI_DATA], NULL};
  int running_arch_index=-1;

  for (unsigned int i=0; i < ARRAY_SIZE(arch_array); i++) {
    if (running_arch_code == arch_array[i].code) {
      running_arch_index    = i;
    }
    if (lib_arch.code == arch_array[i].code) {
      lib_arch.compat_class = arch_array[i].compat_class;
      lib_arch.name         = arch_array[i].name;
    }
  }

  assert(running_arch_index != -1,
         "Didn't find running architecture code (running_arch_code) in arch_array");
  if (running_arch_index == -1) {
    // Even though running architecture detection failed
    // we may still continue with reporting dlerror() message
    return NULL;
  }

  if (lib_arch.compat_class != arch_array[running_arch_index].compat_class) {
    if (lib_arch.name != NULL) {
      ::snprintf(diag_msg_buf, diag_msg_max_length-1,
                 " (Possible cause: can't load %s .so on a %s platform)",
                 lib_arch.name, arch_array[running_arch_index].name);
    } else {
      ::snprintf(diag_msg_buf, diag_msg_max_length-1,
                 " (Possible cause: can't load this .so (machine code=0x%x) on a %s platform)",
                 lib_arch.code, arch_array[running_arch_index].name);
    }
    return NULL;
  }

  if (lib_arch.endianness != arch_array[running_arch_index].endianness) {
    ::snprintf(diag_msg_buf, diag_msg_max_length-1, " (Possible cause: endianness mismatch)");
    return NULL;
  }

  // ELF file class/capacity : 0 - invalid, 1 - 32bit, 2 - 64bit
  if (lib_arch.elf_class > 2 || lib_arch.elf_class < 1) {
    ::snprintf(diag_msg_buf, diag_msg_max_length-1, " (Possible cause: invalid ELF file class)");
    return NULL;
  }

  if (lib_arch.elf_class != arch_array[running_arch_index].elf_class) {
    ::snprintf(diag_msg_buf, diag_msg_max_length-1,
               " (Possible cause: architecture word width mismatch, can't load %d-bit .so on a %d-bit platform)",
               (int) lib_arch.elf_class * 32, arch_array[running_arch_index].elf_class * 32);
    return NULL;
  }

  return NULL;
}

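// For illustration (made-up text, not output from a real run): on an
// architecture mismatch, ebuf would end up holding the dlerror() message
// followed by one of the hints formatted above, e.g.
//
//   "libfoo.so: cannot open shared object file: ...
//    (Possible cause: can't load IA 32 .so on a AMD 64 platform)"
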
void * os::Linux::dlopen_helper(const char *filename, char *ebuf,
                                int ebuflen) {
  void * result = ::dlopen(filename, RTLD_LAZY);
  if (result == NULL) {
    const char* error_report = ::dlerror();
    if (error_report == NULL) {
      error_report = "dlerror returned no error description";
    }
    if (ebuf != NULL && ebuflen > 0) {
      ::strncpy(ebuf, error_report, ebuflen-1);
      ebuf[ebuflen-1]='\0';
    }
    Events::log(NULL, "Loading shared library %s failed, %s", filename, error_report);
    log_info(os)("shared library load of %s failed, %s", filename, error_report);
  } else {
    Events::log(NULL, "Loaded shared library %s", filename);
    log_info(os)("shared library load of %s was successful", filename);
  }
  return result;
}

void * os::Linux::dll_load_in_vmthread(const char *filename, char *ebuf,
                                       int ebuflen) {
  void * result = NULL;
  if (LoadExecStackDllInVMThread) {
    result = dlopen_helper(filename, ebuf, ebuflen);
  }

  // Since 7019808, libjvm.so is linked with -noexecstack. If the VM loads a
  // library that requires an executable stack, or which does not have this
  // stack attribute set, dlopen changes the stack attribute to executable. The
  // read protection of the guard pages gets lost.
  //
  // Need to check _stack_is_executable again as multiple VM_LinuxDllLoad
  // may have been queued at the same time.

  if (!_stack_is_executable) {
    for (JavaThreadIteratorWithHandle jtiwh; JavaThread *jt = jtiwh.next(); ) {
      StackOverflow* overflow_state = jt->stack_overflow_state();
      if (!overflow_state->stack_guard_zone_unused() &&  // Stack not yet fully initialized
          overflow_state->stack_guards_enabled()) {      // No pending stack overflow exceptions
        if (!os::guard_memory((char *)jt->stack_end(), StackOverflow::stack_guard_zone_size())) {
          warning("Attempt to reguard stack yellow zone failed.");
        }
      }
    }
  }

  return result;
}

void* os::dll_lookup(void* handle, const char* name) {
  void* res = dlsym(handle, name);
  return res;
}

void* os::get_default_process_handle() {
  return (void*)::dlopen(NULL, RTLD_LAZY);
}

static bool _print_ascii_file(const char* filename, outputStream* st, const char* hdr = NULL) {
  int fd = ::open(filename, O_RDONLY);
  if (fd == -1) {
    return false;
  }

  if (hdr != NULL) {
    st->print_cr("%s", hdr);
  }

  char buf[33];
  int bytes;
  buf[32] = '\0';
  while ((bytes = ::read(fd, buf, sizeof(buf)-1)) > 0) {
    st->print_raw(buf, bytes);
  }

  ::close(fd);

  return true;
}

static void _print_ascii_file_h(const char* header, const char* filename, outputStream* st, bool same_line = true) {
  st->print("%s:%c", header, same_line ? ' ' : '\n');
  if (!_print_ascii_file(filename, st)) {
    st->print_cr("<Not Available>");
  }
}

void os::print_dll_info(outputStream *st) {
  st->print_cr("Dynamic libraries:");

  char fname[32];
  pid_t pid = os::Linux::gettid();

  jio_snprintf(fname, sizeof(fname), "/proc/%d/maps", pid);

  if (!_print_ascii_file(fname, st)) {
    st->print_cr("Can not get library information for pid = %d", pid);
  }
}

struct loaded_modules_info_param {
  os::LoadedModulesCallbackFunc callback;
  void *param;
};

static int dl_iterate_callback(struct dl_phdr_info *info, size_t size, void *data) {
  if ((info->dlpi_name == NULL) || (*info->dlpi_name == '\0')) {
    return 0;
  }

  struct loaded_modules_info_param *callback_param = reinterpret_cast<struct loaded_modules_info_param *>(data);
  address base = NULL;
  address top = NULL;
  for (int idx = 0; idx < info->dlpi_phnum; idx++) {
    const ElfW(Phdr) *phdr = info->dlpi_phdr + idx;
    if (phdr->p_type == PT_LOAD) {
      address raw_phdr_base = reinterpret_cast<address>(info->dlpi_addr + phdr->p_vaddr);

      address phdr_base = align_down(raw_phdr_base, phdr->p_align);
      if ((base == NULL) || (base > phdr_base)) {
        base = phdr_base;
      }

      address phdr_top = align_up(raw_phdr_base + phdr->p_memsz, phdr->p_align);
      if ((top == NULL) || (top < phdr_top)) {
        top = phdr_top;
      }
    }
  }

  return callback_param->callback(info->dlpi_name, base, top, callback_param->param);
}

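// The alignment arithmetic above, illustrated with made-up numbers: for a
// PT_LOAD segment with p_align = 0x1000, a raw base of 0x7f0012345678 is
// rounded down to 0x7f0012345000 and a raw top of 0x7f0012346789 is rounded
// up to 0x7f0012347000, so [base, top) covers whole pages of the mapping.
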
int os::get_loaded_modules_info(os::LoadedModulesCallbackFunc callback, void *param) {
  struct loaded_modules_info_param callback_param = {callback, param};
  return dl_iterate_phdr(&dl_iterate_callback, &callback_param);
}

void os::print_os_info_brief(outputStream* st) {
  os::Linux::print_distro_info(st);

  os::Posix::print_uname_info(st);

  os::Linux::print_libversion_info(st);

}

void os::print_os_info(outputStream* st) {
  st->print_cr("OS:");

  os::Linux::print_distro_info(st);

  os::Posix::print_uname_info(st);

  os::Linux::print_uptime_info(st);

  // Print warning if unsafe chroot environment detected
  if (unsafe_chroot_detected) {
    st->print_cr("WARNING!! %s", unstable_chroot_error);
  }

  os::Linux::print_libversion_info(st);

  os::Posix::print_rlimit_info(st);

  os::Posix::print_load_average(st);
  st->cr();

  os::Linux::print_system_memory_info(st);
  st->cr();

  os::Linux::print_process_memory_info(st);
  st->cr();

  os::Linux::print_proc_sys_info(st);
  st->cr();

  if (os::Linux::print_ld_preload_file(st)) {
    st->cr();
  }

  if (os::Linux::print_container_info(st)) {
    st->cr();
  }

  VM_Version::print_platform_virtualization_info(st);

  os::Linux::print_steal_info(st);
}

// Try to identify popular distros.
// Most Linux distributions have a /etc/XXX-release file, which contains
// the OS version string. Newer Linux distributions have a /etc/lsb-release
// file that also contains the OS version string. Some have more than one
// /etc/XXX-release file (e.g. Mandrake has both /etc/mandrake-release and
// /etc/redhat-release), so the order is important.
// Any Linux that is based on Redhat (i.e. Oracle, Mandrake, Sun JDS...) has
// its own specific XXX-release file as well as a redhat-release file.
// Because of this the XXX-release file needs to be searched for before the
// redhat-release file.
// Since Red Hat and SuSE have an lsb-release file that is not very descriptive the
// search for redhat-release / SuSE-release needs to be before lsb-release.
// Since the lsb-release file is the new standard it needs to be searched
// before the older style release files.
// Searching system-release (Red Hat) and os-release (other Linuxes) is a
// next-to-last resort. The os-release file is a new standard that contains
// distribution information and the system-release file seems to be an old
// standard that has been replaced by the lsb-release and os-release files.
// Searching for the debian_version file is the last resort. It contains
// an informative string like "6.0.6" or "wheezy/sid". Because of this
// "Debian " is printed before the contents of the debian_version file.

const char* distro_files[] = {
  "/etc/oracle-release",
  "/etc/mandriva-release",
  "/etc/mandrake-release",
  "/etc/sun-release",
  "/etc/redhat-release",
  "/etc/SuSE-release",
  "/etc/lsb-release",
  "/etc/turbolinux-release",
  "/etc/gentoo-release",
  "/etc/ltib-release",
  "/etc/angstrom-version",
  "/etc/system-release",
  "/etc/os-release",
  NULL };

void os::Linux::print_distro_info(outputStream* st) {
  for (int i = 0;; i++) {
    const char* file = distro_files[i];
    if (file == NULL) {
      break;  // done
    }
    // If file prints, we found it.
    if (_print_ascii_file(file, st)) {
      return;
    }
  }

  if (file_exists("/etc/debian_version")) {
    st->print("Debian ");
    _print_ascii_file("/etc/debian_version", st);
  } else {
    st->print_cr("Linux");
  }
}

static void parse_os_info_helper(FILE* fp, char* distro, size_t length, bool get_first_line) {
  char buf[256];
  while (fgets(buf, sizeof(buf), fp)) {
    // Edit out extra stuff in expected format
    if (strstr(buf, "DISTRIB_DESCRIPTION=") != NULL || strstr(buf, "PRETTY_NAME=") != NULL) {
      char* ptr = strstr(buf, "\""); // the name is in quotes
      if (ptr != NULL) {
        ptr++; // go beyond first quote
        char* nl = strchr(ptr, '\"');
        if (nl != NULL) *nl = '\0';
        strncpy(distro, ptr, length);
      } else {
        ptr = strstr(buf, "=");
        ptr++; // go beyond the equals sign
        char* nl = strchr(ptr, '\n');
        if (nl != NULL) *nl = '\0';
        strncpy(distro, ptr, length);
      }
      return;
    } else if (get_first_line) {
      char* nl = strchr(buf, '\n');
      if (nl != NULL) *nl = '\0';
      strncpy(distro, buf, length);
      return;
    }
  }
  // print last line and close
  char* nl = strchr(buf, '\n');
  if (nl != NULL) *nl = '\0';
  strncpy(distro, buf, length);
}

static void parse_os_info(char* distro, size_t length, const char* file) {
  FILE* fp = fopen(file, "r");
  if (fp != NULL) {
    // if suse format, print out first line
    bool get_first_line = (strcmp(file, "/etc/SuSE-release") == 0);
    parse_os_info_helper(fp, distro, length, get_first_line);
    fclose(fp);
  }
}

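// The lines being parsed look like these illustrative examples:
//
//   DISTRIB_DESCRIPTION="Ubuntu 20.04 LTS"         (from /etc/lsb-release)
//   PRETTY_NAME="Debian GNU/Linux 11 (bullseye)"   (from /etc/os-release)
//
// parse_os_info_helper() strips the key, the quotes and the trailing
// newline, leaving just the human-readable distro name.
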
void os::get_summary_os_info(char* buf, size_t buflen) {
  for (int i = 0;; i++) {
    const char* file = distro_files[i];
    if (file == NULL) {
      break; // ran out of distro_files
    }
    if (file_exists(file)) {
      parse_os_info(buf, buflen, file);
      return;
    }
  }
  // special case for debian
  if (file_exists("/etc/debian_version")) {
    strncpy(buf, "Debian ", buflen);
    if (buflen > 7) {
      parse_os_info(&buf[7], buflen-7, "/etc/debian_version");
    }
  } else {
    strncpy(buf, "Linux", buflen);
  }
}

void os::Linux::print_libversion_info(outputStream* st) {
  // libc, pthread
  st->print("libc: ");
  st->print("%s ", os::Linux::libc_version());
  st->print("%s ", os::Linux::libpthread_version());
  st->cr();
}

void os::Linux::print_proc_sys_info(outputStream* st) {
  _print_ascii_file_h("/proc/sys/kernel/threads-max (system-wide limit on the number of threads)",
                      "/proc/sys/kernel/threads-max", st);
  _print_ascii_file_h("/proc/sys/vm/max_map_count (maximum number of memory map areas a process may have)",
                      "/proc/sys/vm/max_map_count", st);
  _print_ascii_file_h("/proc/sys/kernel/pid_max (system-wide limit on number of process identifiers)",
                      "/proc/sys/kernel/pid_max", st);
}

void os::Linux::print_system_memory_info(outputStream* st) {
  _print_ascii_file_h("/proc/meminfo", "/proc/meminfo", st, false);
  st->cr();

  // some information regarding THPs; for details see
  // https://www.kernel.org/doc/Documentation/vm/transhuge.txt
  _print_ascii_file_h("/sys/kernel/mm/transparent_hugepage/enabled",
                      "/sys/kernel/mm/transparent_hugepage/enabled", st);
  _print_ascii_file_h("/sys/kernel/mm/transparent_hugepage/defrag (defrag/compaction efforts parameter)",
                      "/sys/kernel/mm/transparent_hugepage/defrag", st);
}

void os::Linux::print_process_memory_info(outputStream* st) {

  st->print_cr("Process Memory:");

  // Print virtual and resident set size; peak values; swap; and for
  // rss its components if the kernel is recent enough.
  ssize_t vmsize = -1, vmpeak = -1, vmswap = -1,
          vmrss = -1, vmhwm = -1, rssanon = -1, rssfile = -1, rssshmem = -1;
  const int num_values = 8;
  int num_found = 0;
  FILE* f = ::fopen("/proc/self/status", "r");
  char buf[256];
  if (f != NULL) {
    while (::fgets(buf, sizeof(buf), f) != NULL && num_found < num_values) {
      if ( (vmsize == -1   && sscanf(buf, "VmSize: " SSIZE_FORMAT " kB", &vmsize) == 1) ||
           (vmpeak == -1   && sscanf(buf, "VmPeak: " SSIZE_FORMAT " kB", &vmpeak) == 1) ||
           (vmswap == -1   && sscanf(buf, "VmSwap: " SSIZE_FORMAT " kB", &vmswap) == 1) ||
           (vmhwm == -1    && sscanf(buf, "VmHWM: " SSIZE_FORMAT " kB", &vmhwm) == 1) ||
           (vmrss == -1    && sscanf(buf, "VmRSS: " SSIZE_FORMAT " kB", &vmrss) == 1) ||
           (rssanon == -1  && sscanf(buf, "RssAnon: " SSIZE_FORMAT " kB", &rssanon) == 1) ||
           (rssfile == -1  && sscanf(buf, "RssFile: " SSIZE_FORMAT " kB", &rssfile) == 1) ||
           (rssshmem == -1 && sscanf(buf, "RssShmem: " SSIZE_FORMAT " kB", &rssshmem) == 1)
           )
      {
        num_found ++;
      }
    }
    fclose(f);

    st->print_cr("Virtual Size: " SSIZE_FORMAT "K (peak: " SSIZE_FORMAT "K)", vmsize, vmpeak);
    st->print("Resident Set Size: " SSIZE_FORMAT "K (peak: " SSIZE_FORMAT "K)", vmrss, vmhwm);
    if (rssanon != -1) { // requires kernel >= 4.5
      st->print(" (anon: " SSIZE_FORMAT "K, file: " SSIZE_FORMAT "K, shmem: " SSIZE_FORMAT "K)",
                rssanon, rssfile, rssshmem);
    }
    st->cr();
    if (vmswap != -1) { // requires kernel >= 2.6.34
      st->print_cr("Swapped out: " SSIZE_FORMAT "K", vmswap);
    }
  } else {
    st->print_cr("Could not open /proc/self/status to get process memory related information");
  }

  // Print glibc outstanding allocations.
  // (note: there is no implementation of mallinfo for musl libc)
#ifdef __GLIBC__
  size_t total_allocated = 0;
  bool might_have_wrapped = false;
  if (_mallinfo2 != NULL) {
    struct glibc_mallinfo2 mi = _mallinfo2();
    total_allocated = mi.uordblks;
  } else if (_mallinfo != NULL) {
    // mallinfo is an old API. Member names mean next to nothing and, beyond that, are int.
    // So values may have wrapped around. Still useful enough to see how much glibc thinks
    // we allocated.
    struct glibc_mallinfo mi = _mallinfo();
    total_allocated = (size_t)(unsigned)mi.uordblks;
    // Since mallinfo members are int, glibc values may have wrapped. Warn about this.
    might_have_wrapped = (vmrss * K) > UINT_MAX && (vmrss * K) > (total_allocated + UINT_MAX);
  }
  if (_mallinfo2 != NULL || _mallinfo != NULL) {
    st->print_cr("C-Heap outstanding allocations: " SIZE_FORMAT "K%s",
                 total_allocated / K,
                 might_have_wrapped ? " (may have wrapped)" : "");
  }
#endif // __GLIBC__

}

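// The scanned /proc/self/status fields look like these illustrative lines
// (values are in kB):
//
//   VmSize:   2514356 kB
//   VmRSS:     187424 kB
//   RssAnon:   130560 kB
//
// which is why each sscanf pattern ends in " kB" and the printed values
// carry a "K" suffix.
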
bool os::Linux::print_ld_preload_file(outputStream* st) {
  return _print_ascii_file("/etc/ld.so.preload", st, "/etc/ld.so.preload:");
}

void os::Linux::print_uptime_info(outputStream* st) {
  struct sysinfo sinfo;
  int ret = sysinfo(&sinfo);
  if (ret == 0) {
    os::print_dhm(st, "OS uptime:", (long) sinfo.uptime);
  }
}

bool os::Linux::print_container_info(outputStream* st) {
  if (!OSContainer::is_containerized()) {
    return false;
  }

  st->print_cr("container (cgroup) information:");

  const char *p_ct = OSContainer::container_type();
  st->print_cr("container_type: %s", p_ct != NULL ? p_ct : "not supported");

  char *p = OSContainer::cpu_cpuset_cpus();
  st->print_cr("cpu_cpuset_cpus: %s", p != NULL ? p : "not supported");
  free(p);

  p = OSContainer::cpu_cpuset_memory_nodes();
  st->print_cr("cpu_memory_nodes: %s", p != NULL ? p : "not supported");
  free(p);

  int i = OSContainer::active_processor_count();
  st->print("active_processor_count: ");
  if (i > 0) {
    st->print_cr("%d", i);
  } else {
    st->print_cr("not supported");
  }

  i = OSContainer::cpu_quota();
  st->print("cpu_quota: ");
  if (i > 0) {
    st->print_cr("%d", i);
  } else {
    st->print_cr("%s", i == OSCONTAINER_ERROR ? "not supported" : "no quota");
  }

  i = OSContainer::cpu_period();
  st->print("cpu_period: ");
  if (i > 0) {
    st->print_cr("%d", i);
  } else {
    st->print_cr("%s", i == OSCONTAINER_ERROR ? "not supported" : "no period");
  }

  i = OSContainer::cpu_shares();
  st->print("cpu_shares: ");
  if (i > 0) {
    st->print_cr("%d", i);
  } else {
    st->print_cr("%s", i == OSCONTAINER_ERROR ? "not supported" : "no shares");
  }

  jlong j = OSContainer::memory_limit_in_bytes();
  st->print("memory_limit_in_bytes: ");
  if (j > 0) {
    st->print_cr(JLONG_FORMAT, j);
  } else {
    st->print_cr("%s", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
  }

  j = OSContainer::memory_and_swap_limit_in_bytes();
  st->print("memory_and_swap_limit_in_bytes: ");
  if (j > 0) {
    st->print_cr(JLONG_FORMAT, j);
  } else {
    st->print_cr("%s", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
  }

  j = OSContainer::memory_soft_limit_in_bytes();
  st->print("memory_soft_limit_in_bytes: ");
  if (j > 0) {
    st->print_cr(JLONG_FORMAT, j);
  } else {
    st->print_cr("%s", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
  }

  j = OSContainer::memory_usage_in_bytes();
  st->print("memory_usage_in_bytes: ");
  if (j > 0) {
    st->print_cr(JLONG_FORMAT, j);
  } else {
    st->print_cr("%s", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
  }

  j = OSContainer::memory_max_usage_in_bytes();
  st->print("memory_max_usage_in_bytes: ");
  if (j > 0) {
    st->print_cr(JLONG_FORMAT, j);
  } else {
    st->print_cr("%s", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
  }

  return true;
}

void os::Linux::print_steal_info(outputStream* st) {
  if (has_initial_tick_info) {
    CPUPerfTicks pticks;
    bool res = os::Linux::get_tick_information(&pticks, -1);

    if (res && pticks.has_steal_ticks) {
      uint64_t steal_ticks_difference = pticks.steal - initial_steal_ticks;
      uint64_t total_ticks_difference = pticks.total - initial_total_ticks;
      double steal_ticks_perc = 0.0;
      if (total_ticks_difference != 0) {
        steal_ticks_perc = (double) steal_ticks_difference / total_ticks_difference;
      }
      st->print_cr("Steal ticks since vm start: " UINT64_FORMAT, steal_ticks_difference);
      st->print_cr("Steal ticks percentage since vm start:%7.3f", steal_ticks_perc);
    }
  }
}

void os::print_memory_info(outputStream* st) {

  st->print("Memory:");
  st->print(" %dk page", os::vm_page_size()>>10);

  // values in struct sysinfo are "unsigned long"
  struct sysinfo si;
  sysinfo(&si);

  st->print(", physical " UINT64_FORMAT "k",
            os::physical_memory() >> 10);
  st->print("(" UINT64_FORMAT "k free)",
            os::available_memory() >> 10);
  st->print(", swap " UINT64_FORMAT "k",
            ((jlong)si.totalswap * si.mem_unit) >> 10);
  st->print("(" UINT64_FORMAT "k free)",
            ((jlong)si.freeswap * si.mem_unit) >> 10);
  st->cr();
  st->print("Page Sizes: ");
  _page_sizes.print_on(st);
  st->cr();
}

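// Putting the format strings above together, the output is a single line of
// the following shape (illustrative values only):
//
//   Memory: 4k page, physical 16277028k(163456k free), swap 2097148k(2097148k free)
//
// followed by the list of configured page sizes.
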
// Print the first "model name" line and the first "flags" line
// that we find and nothing more. We assume "model name" comes
// before "flags" so if we find a second "model name", then the
// "flags" field is considered missing.
static bool print_model_name_and_flags(outputStream* st, char* buf, size_t buflen) {
#if defined(IA32) || defined(AMD64)
  // Other platforms have less repetitive cpuinfo files
  FILE *fp = fopen("/proc/cpuinfo", "r");
  if (fp) {
    bool model_name_printed = false;
    while (!feof(fp)) {
      if (fgets(buf, buflen, fp)) {
        // Assume model name comes before flags
        if (strstr(buf, "model name") != NULL) {
          if (!model_name_printed) {
            st->print_raw("CPU Model and flags from /proc/cpuinfo:\n");
            st->print_raw(buf);
            model_name_printed = true;
          } else {
            // model name printed but not flags?  Odd, just return
            fclose(fp);
            return true;
          }
        }
        // print the flags line too
        if (strstr(buf, "flags") != NULL) {
          st->print_raw(buf);
          fclose(fp);
          return true;
        }
      }
    }
    fclose(fp);
  }
#endif // x86 platforms
  return false;
}

// additional information about CPU e.g. available frequency ranges
static void print_sys_devices_cpu_info(outputStream* st, char* buf, size_t buflen) {
  _print_ascii_file_h("Online cpus", "/sys/devices/system/cpu/online", st);
  _print_ascii_file_h("Offline cpus", "/sys/devices/system/cpu/offline", st);

  if (ExtensiveErrorReports) {
    // cache related info (cpu 0, should be similar for other CPUs)
    for (unsigned int i=0; i < 10; i++) { // handle max. 10 cache entries
      char hbuf_level[60];
      char hbuf_type[60];
      char hbuf_size[60];
      char hbuf_coherency_line_size[80];
      snprintf(hbuf_level, 60, "/sys/devices/system/cpu/cpu0/cache/index%u/level", i);
      snprintf(hbuf_type, 60, "/sys/devices/system/cpu/cpu0/cache/index%u/type", i);
      snprintf(hbuf_size, 60, "/sys/devices/system/cpu/cpu0/cache/index%u/size", i);
      snprintf(hbuf_coherency_line_size, 80, "/sys/devices/system/cpu/cpu0/cache/index%u/coherency_line_size", i);
      if (file_exists(hbuf_level)) {
        _print_ascii_file_h("cache level", hbuf_level, st);
        _print_ascii_file_h("cache type", hbuf_type, st);
        _print_ascii_file_h("cache size", hbuf_size, st);
        _print_ascii_file_h("cache coherency line size", hbuf_coherency_line_size, st);
      }
    }
  }

  // we miss the cpufreq entries on Power and s390x
#if defined(IA32) || defined(AMD64)
  _print_ascii_file_h("BIOS frequency limitation", "/sys/devices/system/cpu/cpu0/cpufreq/bios_limit", st);
  _print_ascii_file_h("Frequency switch latency (ns)", "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_transition_latency", st);
  _print_ascii_file_h("Available cpu frequencies", "/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies", st);
  // min and max should be in the Available range but still print them (not all info might be available for all kernels)
  if (ExtensiveErrorReports) {
    _print_ascii_file_h("Maximum cpu frequency", "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", st);
    _print_ascii_file_h("Minimum cpu frequency", "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq", st);
    _print_ascii_file_h("Current cpu frequency", "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", st);
  }
  // governors are power schemes, see https://wiki.archlinux.org/index.php/CPU_frequency_scaling
  if (ExtensiveErrorReports) {
    _print_ascii_file_h("Available governors", "/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors", st);
  }
  _print_ascii_file_h("Current governor", "/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", st);
  // Core performance boost, see https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt
  // Raise operating frequency of some cores in a multi-core package if certain conditions apply, e.g.
  // whole chip is not fully utilized
  _print_ascii_file_h("Core performance/turbo boost", "/sys/devices/system/cpu/cpufreq/boost", st);
#endif
}

void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
  // Only print the model name if the platform provides this as a summary
  if (!print_model_name_and_flags(st, buf, buflen)) {
    _print_ascii_file_h("/proc/cpuinfo", "/proc/cpuinfo", st, false);
  }
  st->cr();
  print_sys_devices_cpu_info(st, buf, buflen);
}

#if defined(AMD64) || defined(IA32) || defined(X32)
const char* search_string = "model name";
#elif defined(M68K)
const char* search_string = "CPU";
#elif defined(PPC64)
const char* search_string = "cpu";
#elif defined(S390)
const char* search_string = "machine =";
#elif defined(SPARC)
const char* search_string = "cpu";
#else
const char* search_string = "Processor";
#endif

// Parses the cpuinfo file for string representing the model name.
void os::get_summary_cpu_info(char* cpuinfo, size_t length) {
  FILE* fp = fopen("/proc/cpuinfo", "r");
  if (fp != NULL) {
    while (!feof(fp)) {
      char buf[256];
      if (fgets(buf, sizeof(buf), fp)) {
        char* start = strstr(buf, search_string);
        if (start != NULL) {
          char *ptr = start + strlen(search_string);
          char *end = buf + strlen(buf);
          while (ptr != end) {
            // skip whitespace and colon for the rest of the name.
            if (*ptr != ' ' && *ptr != '\t' && *ptr != ':') {
              break;
            }
            ptr++;
          }
          if (ptr != end) {
            // reasonable string, get rid of newline and keep the rest
            char* nl = strchr(buf, '\n');
            if (nl != NULL) *nl = '\0';
            strncpy(cpuinfo, ptr, length);
            fclose(fp);
            return;
          }
        }
      }
    }
    fclose(fp);
  }
  // cpuinfo not found or parsing failed, just print generic string. The entire
  // /proc/cpuinfo file will be printed later in the file (or enough of it for x86)
#if   defined(AARCH64)
  strncpy(cpuinfo, "AArch64", length);
#elif defined(AMD64)
  strncpy(cpuinfo, "x86_64", length);
#elif defined(ARM)  // Order wrt. AARCH64 is relevant!
  strncpy(cpuinfo, "ARM", length);
#elif defined(IA32)
  strncpy(cpuinfo, "x86_32", length);
#elif defined(IA64)
  strncpy(cpuinfo, "IA64", length);
#elif defined(PPC)
  strncpy(cpuinfo, "PPC64", length);
#elif defined(S390)
  strncpy(cpuinfo, "S390", length);
#elif defined(SPARC)
  strncpy(cpuinfo, "sparcv9", length);
#elif defined(ZERO_LIBARCH)
  strncpy(cpuinfo, ZERO_LIBARCH, length);
#else
  strncpy(cpuinfo, "unknown", length);
#endif
}

static char saved_jvm_path[MAXPATHLEN] = {0};

// Find the full path to the current module, libjvm.so
void os::jvm_path(char *buf, jint buflen) {
  // Error checking.
  if (buflen < MAXPATHLEN) {
    assert(false, "must use a large-enough buffer");
    buf[0] = '\0';
    return;
  }
  // Lazy resolve the path to current module.
  if (saved_jvm_path[0] != 0) {
    strcpy(buf, saved_jvm_path);
    return;
  }

  char dli_fname[MAXPATHLEN];
  dli_fname[0] = '\0';
  bool ret = dll_address_to_library_name(
                                         CAST_FROM_FN_PTR(address, os::jvm_path),
                                         dli_fname, sizeof(dli_fname), NULL);
  assert(ret, "cannot locate libjvm");
#ifdef __ANDROID__
  if (dli_fname[0] == '\0') {
    return;
  }

  // The resolved name may be a bare file name without any directory
  // component; in that case recover the absolute path from /proc/self/maps.
  if (strchr(dli_fname, '/') == NULL) {
    bool ok = read_so_path_from_maps(dli_fname, buf, buflen);
    assert(ok, "unable to turn relative libjvm.so path into absolute");
    return;
  }

  snprintf(buf, buflen, /* "%s/lib/%s/server/%s", java_home_var, cpu_arch, */ "%s", dli_fname);
#else // !__ANDROID__
  char *rp = NULL;
  if (ret && dli_fname[0] != '\0') {
    rp = os::Posix::realpath(dli_fname, buf, buflen);
  }
  if (rp == NULL) {
    return;
  }

  if (Arguments::sun_java_launcher_is_altjvm()) {
    // Support for the java launcher's '-XXaltjvm=<path>' option. Typical
    // value for buf is "<JAVA_HOME>/jre/lib/<vmtype>/libjvm.so".
    // If "/jre/lib/" appears at the right place in the string, then
    // assume we are installed in a JDK and we're done. Otherwise, check
    // for a JAVA_HOME environment variable and fix up the path so it
    // looks like libjvm.so is installed there (append a fake suffix
    // hotspot/libjvm.so).
    const char *p = buf + strlen(buf) - 1;
    for (int count = 0; p > buf && count < 5; ++count) {
      for (--p; p > buf && *p != '/'; --p)
        /* empty */ ;
    }

    if (strncmp(p, "/jre/lib/", 9) != 0) {
      // Look for JAVA_HOME in the environment.
      char* java_home_var = ::getenv("JAVA_HOME");
      if (java_home_var != NULL && java_home_var[0] != 0) {
        char* jrelib_p;
        int len;

        // Check the current module name "libjvm.so".
        p = strrchr(buf, '/');
        if (p == NULL) {
          return;
        }
        assert(strstr(p, "/libjvm") == p, "invalid library name");

        rp = os::Posix::realpath(java_home_var, buf, buflen);
        if (rp == NULL) {
          return;
        }

        // determine if this is a legacy image or modules image
        // modules image doesn't have "jre" subdirectory
        len = strlen(buf);
        assert(len < buflen, "Ran out of buffer room");
        jrelib_p = buf + len;
        snprintf(jrelib_p, buflen-len, "/jre/lib");
        if (0 != access(buf, F_OK)) {
          snprintf(jrelib_p, buflen-len, "/lib");
        }

        if (0 == access(buf, F_OK)) {
          // Use current module name "libjvm.so"
          len = strlen(buf);
          snprintf(buf + len, buflen-len, "/hotspot/libjvm.so");
        } else {
          // Go back to path of .so
          rp = os::Posix::realpath(dli_fname, buf, buflen);
          if (rp == NULL) {
            return;
          }
        }
      }
    }
  }
#endif

  strncpy(saved_jvm_path, buf, MAXPATHLEN);
  saved_jvm_path[MAXPATHLEN - 1] = '\0';
}

void os::print_jni_name_prefix_on(outputStream* st, int args_size) {
  // no prefix required, not even "_"
}

void os::print_jni_name_suffix_on(outputStream* st, int args_size) {
  // no suffix required
}

////////////////////////////////////////////////////////////////////////////////
// Virtual Memory

int os::vm_page_size() {
  // Seems redundant as all get out
  assert(os::Linux::page_size() != -1, "must call os::init");
  return os::Linux::page_size();
}

// Linux allocates memory by pages, so the allocation granularity is the page size.
int os::vm_allocation_granularity() {
  assert(os::Linux::page_size() != -1, "must call os::init");
  return os::Linux::page_size();
}

// Rationale behind this function:
//  current (Mon Apr 25 20:12:18 MSD 2005) oprofile drops samples without executable
//  mapping for address (see lookup_dcookie() in the kernel module), thus we cannot get
//  samples for JITted code. Here we create private executable mapping over the code cache
//  and then we can use standard (well, almost, as mapping can change) way to provide
//  info for the reporting script by storing timestamp and location of symbol
void linux_wrap_code(char* base, size_t size) {
  static volatile jint cnt = 0;

  if (!UseOprofile) {
    return;
  }

  char buf[PATH_MAX+1];
  int num = Atomic::add(&cnt, 1);

  snprintf(buf, sizeof(buf), "%s/hs-vm-%d-%d",
           os::get_temp_directory(), os::current_process_id(), num);
  unlink(buf);

  int fd = ::open(buf, O_CREAT | O_RDWR, S_IRWXU);

  if (fd != -1) {
    off_t rv = ::lseek(fd, size-2, SEEK_SET);
    if (rv != (off_t)-1) {
      if (::write(fd, "", 1) == 1) {
        mmap(base, size,
             PROT_READ|PROT_WRITE|PROT_EXEC,
             MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE, fd, 0);
      }
    }
    ::close(fd);
    unlink(buf);
  }
}

static bool recoverable_mmap_error(int err) {
  // See if the error is one we can let the caller handle. This
  // list of errno values comes from JBS-6843484. I can't find a
  // Linux man page that documents this specific set of errno
  // values so while this list currently matches Solaris, it may
  // change as we gain experience with this failure mode.
  switch (err) {
  case EBADF:
  case EINVAL:
  case ENOTSUP:
    // let the caller deal with these errors
    return true;

  default:
    // Any remaining errors on this OS can cause our reserved mapping
    // to be lost. That can cause confusion where different data
    // structures think they have the same memory mapped. The worst
    // scenario is if both the VM and a library think they have the
    // same memory mapped.
    return false;
  }
}

static void warn_fail_commit_memory(char* addr, size_t size, bool exec,
                                    int err) {
  warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
          ", %d) failed; error='%s' (errno=%d)", p2i(addr), size, exec,
          os::strerror(err), err);
}

static void warn_fail_commit_memory(char* addr, size_t size,
                                    size_t alignment_hint, bool exec,
                                    int err) {
  warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
          ", " SIZE_FORMAT ", %d) failed; error='%s' (errno=%d)", p2i(addr), size,
          alignment_hint, exec, os::strerror(err), err);
}

// NOTE: Linux kernel does not really reserve the pages for us.
//       All it does is check whether there are enough free pages
//       left at the time of mmap(). This could be a potential
//       problem.
int os::Linux::commit_memory_impl(char* addr, size_t size, bool exec) {
  int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
  uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
                                     MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
  if (res != (uintptr_t) MAP_FAILED) {
    if (UseNUMAInterleaving) {
      numa_make_global(addr, size);
    }
    return 0;
  }

  int err = errno;  // save errno from mmap() call above

  if (!recoverable_mmap_error(err)) {
    warn_fail_commit_memory(addr, size, exec, err);
    vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "committing reserved memory.");
  }

  return err;
}

bool os::pd_commit_memory(char* addr, size_t size, bool exec) {
  return os::Linux::commit_memory_impl(addr, size, exec) == 0;
}

void os::pd_commit_memory_or_exit(char* addr, size_t size, bool exec,
                                  const char* mesg) {
  assert(mesg != NULL, "mesg must be specified");
  int err = os::Linux::commit_memory_impl(addr, size, exec);
  if (err != 0) {
    // the caller wants all commit errors to exit with the specified mesg:
    warn_fail_commit_memory(addr, size, exec, err);
    vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
  }
}

// Define MAP_HUGETLB here so we can build HotSpot on old systems.
#ifndef MAP_HUGETLB
  #define MAP_HUGETLB 0x40000
#endif

// If mmap flags are set with MAP_HUGETLB and the system supports multiple
// huge page sizes, flag bits [26:31] can be used to encode the log2 of the
// desired huge page size. Otherwise, the system's default huge page size will be used.
// See mmap(2) man page for more info (since Linux 3.8).
// https://lwn.net/Articles/533499/
#ifndef MAP_HUGE_SHIFT
  #define MAP_HUGE_SHIFT 26
#endif

// Define MADV_HUGEPAGE here so we can build HotSpot on old systems.
#ifndef MADV_HUGEPAGE
  #define MADV_HUGEPAGE 14
#endif

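// For example (illustrative, not used verbatim elsewhere in this file):
// requesting 2 MB pages means encoding log2(2M) = 21 in those flag bits, i.e.
//
//   mmap(..., MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | (21 << MAP_HUGE_SHIFT), ...)
//
// and 1 GB pages would use log2(1G) = 30 instead.
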
int os::Linux::commit_memory_impl(char* addr, size_t size,
                                  size_t alignment_hint, bool exec) {
  int err = os::Linux::commit_memory_impl(addr, size, exec);
  if (err == 0) {
    realign_memory(addr, size, alignment_hint);
  }
  return err;
}

bool os::pd_commit_memory(char* addr, size_t size, size_t alignment_hint,
                          bool exec) {
  return os::Linux::commit_memory_impl(addr, size, alignment_hint, exec) == 0;
}

void os::pd_commit_memory_or_exit(char* addr, size_t size,
                                  size_t alignment_hint, bool exec,
                                  const char* mesg) {
  assert(mesg != NULL, "mesg must be specified");
  int err = os::Linux::commit_memory_impl(addr, size, alignment_hint, exec);
  if (err != 0) {
    // the caller wants all commit errors to exit with the specified mesg:
    warn_fail_commit_memory(addr, size, alignment_hint, exec, err);
    vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
  }
}

void os::pd_realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
  if (UseTransparentHugePages && alignment_hint > (size_t)vm_page_size()) {
    // We don't check the return value: madvise(MADV_HUGEPAGE) may not
    // be supported or the memory may already be backed by huge pages.
    ::madvise(addr, bytes, MADV_HUGEPAGE);
  }
}

void os::pd_free_memory(char *addr, size_t bytes, size_t alignment_hint) {
  // This method works by doing an mmap over an existing mapping and effectively discarding
  // the existing pages. However it won't work for SHM-based large pages that cannot be
  // uncommitted at all. We don't do anything in this case to avoid creating a segment with
  // small pages on top of the SHM segment. This method always works for small pages, so we
  // allow that in any case.
  if (alignment_hint <= (size_t)os::vm_page_size() || can_commit_large_page_memory()) {
    commit_memory(addr, bytes, alignment_hint, !ExecMem);
  }
}

void os::numa_make_global(char *addr, size_t bytes) {
  Linux::numa_interleave_memory(addr, bytes);
}

// Define for numa_set_bind_policy(int). Setting the argument to 0 will set the
// bind policy to MPOL_PREFERRED for the current thread.
#define USE_MPOL_PREFERRED 0

void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
  // To make NUMA and large pages more robust when both enabled, we need to ease
  // the requirements on where the memory should be allocated. MPOL_BIND is the
  // default policy and it will force memory to be allocated on the specified
  // node. Changing this to MPOL_PREFERRED will prefer to allocate the memory on
  // the specified node, but will not force it. Using this policy will prevent
  // getting SIGBUS when trying to allocate large pages on NUMA nodes with no
  // free large pages.
  Linux::numa_set_bind_policy(USE_MPOL_PREFERRED);
  Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
}

bool os::numa_topology_changed() { return false; }

size_t os::numa_get_groups_num() {
  // Return just the number of nodes in which it's possible to allocate memory
  // (in numa terminology, configured nodes).
  return Linux::numa_num_configured_nodes();
}

int os::numa_get_group_id() {
  int cpu_id = Linux::sched_getcpu();
  if (cpu_id != -1) {
    int lgrp_id = Linux::get_node_by_cpu(cpu_id);
    if (lgrp_id != -1) {
      return lgrp_id;
    }
  }
  return 0;
}

int os::numa_get_group_id_for_address(const void* address) {
  void** pages = const_cast<void**>(&address);
  int id = -1;

  if (os::Linux::numa_move_pages(0, 1, pages, NULL, &id, 0) == -1) {
    return -1;
  }
  if (id < 0) {
    return -1;
  }
  return id;
}

int os::Linux::get_existing_num_nodes() {
  int node;
  int highest_node_number = Linux::numa_max_node();
  int num_nodes = 0;

  // Get the total number of nodes in the system including nodes without memory.
  for (node = 0; node <= highest_node_number; node++) {
    if (is_node_in_existing_nodes(node)) {
      num_nodes++;
    }
  }
  return num_nodes;
}

size_t os::numa_get_leaf_groups(int *ids, size_t size) {
  int highest_node_number = Linux::numa_max_node();
  size_t i = 0;

  // Map all node ids in which it is possible to allocate memory. Also nodes are
  // not always consecutively available, i.e. available from 0 to the highest
  // node number. If the nodes have been bound explicitly using numactl membind,
  // then allocate memory from those nodes only.
  for (int node = 0; node <= highest_node_number; node++) {
    if (Linux::is_node_in_bound_nodes((unsigned int)node)) {
      ids[i++] = node;
    }
  }
  return i;
}

bool os::get_page_info(char *start, page_info* info) {
  return false;
}

char *os::scan_pages(char *start, char* end, page_info* page_expected,
                     page_info* page_found) {
  return end;
}

int os::Linux::sched_getcpu_syscall(void) {
  unsigned int cpu = 0;
  int retval = -1;

#if defined(IA32)
  #ifndef SYS_getcpu
    #define SYS_getcpu 318
  #endif
  retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
#elif defined(AMD64)
  // Unfortunately we have to bring all these macros here from vsyscall.h
  // to be able to compile on old Linuxes.
  #define __NR_vgetcpu 2
  #define VSYSCALL_START (-10UL << 20)
  #define VSYSCALL_SIZE 1024
  #define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
  typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
  vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
  retval = vgetcpu(&cpu, NULL, NULL);
#endif

  return (retval == -1) ? retval : cpu;
}

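// Spelling out the vsyscall arithmetic above: VSYSCALL_START is
// -10UL << 20 = 0xffffffffff600000, so VSYSCALL_ADDR(__NR_vgetcpu) =
// 0xffffffffff600000 + 2 * 1024 = 0xffffffffff600800, the fixed address of
// vgetcpu in the legacy x86-64 vsyscall page.
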
void os::Linux::sched_getcpu_init() {
  // sched_getcpu() should be in libc.
  set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
                                  dlsym(RTLD_DEFAULT, "sched_getcpu")));

  // If it's not, try a direct syscall.
  if (sched_getcpu() == -1) {
    set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
                                    (void*)&sched_getcpu_syscall));
  }

  if (sched_getcpu() == -1) {
    // vm_exit_during_initialization
    warning("getcpu(2) system call not supported by kernel");
  }
}

// Something to do with the numa-aware allocator needs these symbols
extern "C" JNIEXPORT void numa_warn(int number, char *where, ...) { }
extern "C" JNIEXPORT void numa_error(char *where) { }

// Handle request to load libnuma symbol version 1.1 (API v1). If it fails
// load symbol from base version instead.
void* os::Linux::libnuma_dlsym(void* handle, const char *name) {
  void *f = dlvsym(handle, name, "libnuma_1.1");
  if (f == NULL) {
    f = dlsym(handle, name);
  }
  return f;
}

// Handle request to load libnuma symbol version 1.2 (API v2) only.
// Return NULL if the symbol is not defined in this particular version.
void* os::Linux::libnuma_v2_dlsym(void* handle, const char* name) {
  return dlvsym(handle, name, "libnuma_1.2");
}

// Check numa dependent syscalls
static bool numa_syscall_check() {
  // NUMA APIs depend on several syscalls. E.g., get_mempolicy is required for numa_get_membind and
  // numa_get_interleave_mask. But these dependent syscalls can be unsupported for various reasons.
  // Especially in Docker containers, get_mempolicy is not allowed with the default configuration.
  // So it's necessary to check whether the syscalls are available. Currently, only get_mempolicy
  // is checked, since checking others like mbind would cause unexpected side effects.
#ifdef SYS_get_mempolicy
  int dummy = 0;
  if (syscall(SYS_get_mempolicy, &dummy, NULL, 0, (void*)&dummy, 3) == -1) {
    return false;
  }
#endif

  return true;
}

bool os::Linux::libnuma_init() {
3035
// Requires sched_getcpu() and numa dependent syscalls support
3036
if ((sched_getcpu() != -1) && numa_syscall_check()) {
3037
void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
3038
if (handle != NULL) {
3039
set_numa_node_to_cpus(CAST_TO_FN_PTR(numa_node_to_cpus_func_t,
3040
libnuma_dlsym(handle, "numa_node_to_cpus")));
3041
set_numa_node_to_cpus_v2(CAST_TO_FN_PTR(numa_node_to_cpus_v2_func_t,
3042
libnuma_v2_dlsym(handle, "numa_node_to_cpus")));
3043
set_numa_max_node(CAST_TO_FN_PTR(numa_max_node_func_t,
3044
libnuma_dlsym(handle, "numa_max_node")));
3045
set_numa_num_configured_nodes(CAST_TO_FN_PTR(numa_num_configured_nodes_func_t,
3046
libnuma_dlsym(handle, "numa_num_configured_nodes")));
3047
set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
3048
libnuma_dlsym(handle, "numa_available")));
3049
set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
3050
libnuma_dlsym(handle, "numa_tonode_memory")));
3051
set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t,
3052
libnuma_dlsym(handle, "numa_interleave_memory")));
3053
set_numa_interleave_memory_v2(CAST_TO_FN_PTR(numa_interleave_memory_v2_func_t,
3054
libnuma_v2_dlsym(handle, "numa_interleave_memory")));
3055
set_numa_set_bind_policy(CAST_TO_FN_PTR(numa_set_bind_policy_func_t,
3056
libnuma_dlsym(handle, "numa_set_bind_policy")));
3057
set_numa_bitmask_isbitset(CAST_TO_FN_PTR(numa_bitmask_isbitset_func_t,
3058
libnuma_dlsym(handle, "numa_bitmask_isbitset")));
3059
set_numa_distance(CAST_TO_FN_PTR(numa_distance_func_t,
3060
libnuma_dlsym(handle, "numa_distance")));
3061
set_numa_get_membind(CAST_TO_FN_PTR(numa_get_membind_func_t,
3062
libnuma_v2_dlsym(handle, "numa_get_membind")));
3063
set_numa_get_interleave_mask(CAST_TO_FN_PTR(numa_get_interleave_mask_func_t,
3064
libnuma_v2_dlsym(handle, "numa_get_interleave_mask")));
3065
set_numa_move_pages(CAST_TO_FN_PTR(numa_move_pages_func_t,
3066
libnuma_dlsym(handle, "numa_move_pages")));
3067
set_numa_set_preferred(CAST_TO_FN_PTR(numa_set_preferred_func_t,
3068
libnuma_dlsym(handle, "numa_set_preferred")));
3069
3070
if (numa_available() != -1) {
3071
set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
3072
set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_nodes_ptr"));
3073
set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_nodes_ptr"));
3074
set_numa_interleave_bitmask(_numa_get_interleave_mask());
3075
set_numa_membind_bitmask(_numa_get_membind());
3076
// Create an index -> node mapping, since nodes are not always consecutive
3077
_nindex_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, mtInternal);
3078
rebuild_nindex_to_node_map();
3079
// Create a cpu -> node mapping
3080
_cpu_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, mtInternal);
3081
rebuild_cpu_to_node_map();
3082
return true;
3083
}
3084
}
3085
}
3086
return false;
3087
}

size_t os::Linux::default_guard_size(os::ThreadType thr_type) {
  // Creating a guard page is very expensive. A Java thread has HotSpot
  // guard pages; only enable the glibc guard page for non-Java threads.
  // (Remember: a compiler thread is a Java thread, too!)
  return ((thr_type == java_thread || thr_type == compiler_thread) ? 0 : page_size());
}

void os::Linux::rebuild_nindex_to_node_map() {
  int highest_node_number = Linux::numa_max_node();

  nindex_to_node()->clear();
  for (int node = 0; node <= highest_node_number; node++) {
    if (Linux::is_node_in_existing_nodes(node)) {
      nindex_to_node()->append(node);
    }
  }
}

// rebuild_cpu_to_node_map() constructs a table mapping cpu id to node id.
// The table is later used in get_node_by_cpu().
void os::Linux::rebuild_cpu_to_node_map() {
  const size_t NCPUS = 32768; // Since the buffer size computation is very obscure
                              // in libnuma (possible values are starting from 16,
                              // and continuing up with every other power of 2, but less
                              // than the maximum number of CPUs supported by kernel), and
                              // is a subject to change (in libnuma version 2 the requirements
                              // are more reasonable) we'll just hardcode the number they use
                              // in the library.
  const size_t BitsPerCLong = sizeof(long) * CHAR_BIT;

  size_t cpu_num = processor_count();
  size_t cpu_map_size = NCPUS / BitsPerCLong;
  size_t cpu_map_valid_size =
    MIN2((cpu_num + BitsPerCLong - 1) / BitsPerCLong, cpu_map_size);

  cpu_to_node()->clear();
  cpu_to_node()->at_grow(cpu_num - 1);

  size_t node_num = get_existing_num_nodes();

  int distance = 0;
  int closest_distance = INT_MAX;
  int closest_node = 0;
  unsigned long *cpu_map = NEW_C_HEAP_ARRAY(unsigned long, cpu_map_size, mtInternal);
  for (size_t i = 0; i < node_num; i++) {
    // Check if node is configured (not a memory-less node). If it is not, find
    // the closest configured node. Check also if node is bound, i.e. it's allowed
    // to allocate memory from the node. If it's not allowed, map cpus in that node
    // to the closest node from which memory allocation is allowed.
    if (!is_node_in_configured_nodes(nindex_to_node()->at(i)) ||
        !is_node_in_bound_nodes(nindex_to_node()->at(i))) {
      closest_distance = INT_MAX;
      // Check distance from all remaining nodes in the system. Ignore distance
      // from itself, from another non-configured node, and from another non-bound
      // node.
      for (size_t m = 0; m < node_num; m++) {
        if (m != i &&
            is_node_in_configured_nodes(nindex_to_node()->at(m)) &&
            is_node_in_bound_nodes(nindex_to_node()->at(m))) {
          distance = numa_distance(nindex_to_node()->at(i), nindex_to_node()->at(m));
          // If a closer node is found, update. There is always at least one
          // configured and bound node in the system so there is always at least
          // one node close.
          if (distance != 0 && distance < closest_distance) {
            closest_distance = distance;
            closest_node = nindex_to_node()->at(m);
          }
        }
      }
    } else {
      // Current node is already a configured node.
      closest_node = nindex_to_node()->at(i);
    }

    // Get cpus from the original node and map them to the closest node. If node
    // is a configured node (not a memory-less node), then original node and
    // closest node are the same.
    if (numa_node_to_cpus(nindex_to_node()->at(i), cpu_map, cpu_map_size * sizeof(unsigned long)) != -1) {
      for (size_t j = 0; j < cpu_map_valid_size; j++) {
        if (cpu_map[j] != 0) {
          for (size_t k = 0; k < BitsPerCLong; k++) {
            if (cpu_map[j] & (1UL << k)) {
              int cpu_index = j * BitsPerCLong + k;

#ifndef PRODUCT
              if (UseDebuggerErgo1 && cpu_index >= (int)cpu_num) {
                // Some debuggers limit the processor count without
                // intercepting the NUMA APIs. Just fake the values.
                cpu_index = 0;
              }
#endif

              cpu_to_node()->at_put(cpu_index, closest_node);
            }
          }
        }
      }
    }
  }
  FREE_C_HEAP_ARRAY(unsigned long, cpu_map);
}

int os::Linux::numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen) {
  // use the latest version of numa_node_to_cpus if available
  if (_numa_node_to_cpus_v2 != NULL) {

    // libnuma bitmask struct
    struct bitmask {
      unsigned long size; /* number of bits in the map */
      unsigned long *maskp;
    };

    struct bitmask mask;
    mask.maskp = (unsigned long *)buffer;
    mask.size = bufferlen * 8;
    return _numa_node_to_cpus_v2(node, &mask);
  } else if (_numa_node_to_cpus != NULL) {
    return _numa_node_to_cpus(node, buffer, bufferlen);
  }
  return -1;
}
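
// Illustrative sketch (not part of the VM): the v2 libnuma call that the
// wrapper above adapts to. With <numa.h> available the same query looks like
// this; the VM instead builds the bitmask struct by hand so it does not need
// libnuma headers at compile time.
#if 0
#include <numa.h>   // compile/link with -lnuma
#include <stdio.h>

int main() {
  if (numa_available() == -1) return 1;
  struct bitmask* cpus = numa_allocate_cpumask();
  // v2 signature: int numa_node_to_cpus(int node, struct bitmask* mask)
  if (numa_node_to_cpus(0, cpus) == 0) {
    for (unsigned int c = 0; c < cpus->size; c++) {
      if (numa_bitmask_isbitset(cpus, c)) printf("node 0 has cpu %u\n", c);
    }
  }
  numa_free_cpumask(cpus);
  return 0;
}
#endif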

int os::Linux::get_node_by_cpu(int cpu_id) {
  if (cpu_to_node() != NULL && cpu_id >= 0 && cpu_id < cpu_to_node()->length()) {
    return cpu_to_node()->at(cpu_id);
  }
  return -1;
}

GrowableArray<int>* os::Linux::_cpu_to_node;
GrowableArray<int>* os::Linux::_nindex_to_node;
os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu;
os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
os::Linux::numa_node_to_cpus_v2_func_t os::Linux::_numa_node_to_cpus_v2;
os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
os::Linux::numa_num_configured_nodes_func_t os::Linux::_numa_num_configured_nodes;
os::Linux::numa_available_func_t os::Linux::_numa_available;
os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory;
os::Linux::numa_interleave_memory_v2_func_t os::Linux::_numa_interleave_memory_v2;
os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset;
os::Linux::numa_distance_func_t os::Linux::_numa_distance;
os::Linux::numa_get_membind_func_t os::Linux::_numa_get_membind;
os::Linux::numa_get_interleave_mask_func_t os::Linux::_numa_get_interleave_mask;
os::Linux::numa_move_pages_func_t os::Linux::_numa_move_pages;
os::Linux::numa_set_preferred_func_t os::Linux::_numa_set_preferred;
os::Linux::NumaAllocationPolicy os::Linux::_current_numa_policy;
unsigned long* os::Linux::_numa_all_nodes;
struct bitmask* os::Linux::_numa_all_nodes_ptr;
struct bitmask* os::Linux::_numa_nodes_ptr;
struct bitmask* os::Linux::_numa_interleave_bitmask;
struct bitmask* os::Linux::_numa_membind_bitmask;

bool os::pd_uncommit_memory(char* addr, size_t size, bool exec) {
  uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE,
                                     MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE|MAP_ANONYMOUS, -1, 0);
  return res != (uintptr_t) MAP_FAILED;
}

static address get_stack_commited_bottom(address bottom, size_t size) {
  address nbot = bottom;
  address ntop = bottom + size;

  size_t page_sz = os::vm_page_size();
  unsigned pages = size / page_sz;

  unsigned char vec[1];
  unsigned imin = 1, imax = pages + 1, imid;
  int mincore_return_value = 0;

  assert(imin <= imax, "Unexpected page size");

  while (imin < imax) {
    imid = (imax + imin) / 2;
    nbot = ntop - (imid * page_sz);

    // Use a trick with mincore to check whether the page is mapped or not.
    // mincore sets vec to 1 if the page resides in memory and to 0 if the page
    // is swapped out, but if the page we are asking for is unmapped it
    // returns -1 with errno set to ENOMEM.
    mincore_return_value = mincore(nbot, page_sz, vec);

    if (mincore_return_value == -1) {
      // Page is not mapped; go up
      // to find the first mapped page.
      if (errno != EAGAIN) {
        assert(errno == ENOMEM, "Unexpected mincore errno");
        imax = imid;
      }
    } else {
      // Page is mapped; go down
      // to find the first unmapped page.
      imin = imid + 1;
    }
  }

  nbot = nbot + page_sz;

  // Adjust stack bottom one page up if last checked page is not mapped
  if (mincore_return_value == -1) {
    nbot = nbot + page_sz;
  }

  return nbot;
}
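
// Illustrative sketch (not part of the VM): the mincore() mapped-page probe
// used above, reduced to a standalone check. The hypothetical is_page_mapped()
// distinguishes "mapped" (mincore succeeds, resident or swapped) from
// "unmapped" (mincore fails with ENOMEM).
#if 0
#include <errno.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <unistd.h>

static bool is_page_mapped(void* page_aligned_addr) {
  unsigned char vec[1];
  long page_sz = sysconf(_SC_PAGESIZE);
  while (mincore(page_aligned_addr, (size_t)page_sz, vec) == -1) {
    if (errno == ENOMEM) return false;  // the address range is not mapped
    if (errno != EAGAIN) return false;  // treat unexpected errors as unmapped
    // EAGAIN: kernel asked us to retry; loop and call mincore again.
  }
  return true;  // mapped (vec[0] tells residency, which we don't need here)
}
#endif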

bool os::committed_in_range(address start, size_t size, address& committed_start, size_t& committed_size) {
  int mincore_return_value;
  const size_t stripe = 1024;  // query this many pages each time
  unsigned char vec[stripe + 1];
  // set a guard
  vec[stripe] = 'X';

  const size_t page_sz = os::vm_page_size();
  size_t pages = size / page_sz;

  assert(is_aligned(start, page_sz), "Start address must be page aligned");
  assert(is_aligned(size, page_sz), "Size must be page aligned");

  committed_start = NULL;

  int loops = (pages + stripe - 1) / stripe;
  int committed_pages = 0;
  address loop_base = start;
  bool found_range = false;

  for (int index = 0; index < loops && !found_range; index ++) {
    assert(pages > 0, "Nothing to do");
    int pages_to_query = (pages >= stripe) ? stripe : pages;
    pages -= pages_to_query;

    // Get stable read
    while ((mincore_return_value = mincore(loop_base, pages_to_query * page_sz, vec)) == -1 && errno == EAGAIN);

    // During shutdown, some memory goes away without properly notifying NMT,
    // e.g. ConcurrentGCThread/WatcherThread can exit without deleting thread object.
    // Bail out and return as not committed for now.
    if (mincore_return_value == -1 && errno == ENOMEM) {
      return false;
    }

    assert(vec[stripe] == 'X', "overflow guard");
    assert(mincore_return_value == 0, "Range must be valid");
    // Process this stripe
    for (int vecIdx = 0; vecIdx < pages_to_query; vecIdx ++) {
      if ((vec[vecIdx] & 0x01) == 0) {  // not committed
        // End of current contiguous region
        if (committed_start != NULL) {
          found_range = true;
          break;
        }
      } else {  // committed
        // Start of region
        if (committed_start == NULL) {
          committed_start = loop_base + page_sz * vecIdx;
        }
        committed_pages ++;
      }
    }

    loop_base += pages_to_query * page_sz;
  }

  if (committed_start != NULL) {
    assert(committed_pages > 0, "Must have committed region");
    assert(committed_pages <= int(size / page_sz), "Can not commit more than it has");
    assert(committed_start >= start && committed_start < start + size, "Out of range");
    committed_size = page_sz * committed_pages;
    return true;
  } else {
    assert(committed_pages == 0, "Should not have committed region");
    return false;
  }
}

// Linux uses a growable mapping for the stack, and if the mapping for
// the stack guard pages is not removed when we detach a thread the
// stack cannot grow beyond the pages where the stack guard was
// mapped. If at some point later in the process the stack expands to
// that point, the Linux kernel cannot expand the stack any further
// because the guard pages are in the way, and a segfault occurs.
//
// However, it's essential not to split the stack region by unmapping
// a region (leaving a hole) that's already part of the stack mapping,
// so if the stack mapping has already grown beyond the guard pages at
// the time we create them, we have to truncate the stack mapping.
// So, we need to know the extent of the stack mapping when
// create_stack_guard_pages() is called.

// We only need this for stacks that are growable: at the time of
// writing thread stacks don't use growable mappings (i.e. those
// created with MAP_GROWSDOWN), and aren't marked "[stack]", so this
// only applies to the main thread.

// If the (growable) stack mapping already extends beyond the point
// where we're going to put our guard pages, truncate the mapping at
// that point by munmap()ping it. This ensures that when we later
// munmap() the guard pages we don't leave a hole in the stack
// mapping. This only affects the main/primordial thread.

bool os::pd_create_stack_guard_pages(char* addr, size_t size) {
  if (os::is_primordial_thread()) {
    // As we manually grow stack up to bottom inside create_attached_thread(),
    // it's likely that os::Linux::initial_thread_stack_bottom is mapped and
    // we don't need to do anything special.
    // Check it first, before calling the heavy function.
    uintptr_t stack_extent = (uintptr_t) os::Linux::initial_thread_stack_bottom();
    unsigned char vec[1];

    if (mincore((address)stack_extent, os::vm_page_size(), vec) == -1) {
      // Fallback to slow path on all errors, including EAGAIN
      stack_extent = (uintptr_t) get_stack_commited_bottom(
                                   os::Linux::initial_thread_stack_bottom(),
                                   (size_t)addr - stack_extent);
    }

    if (stack_extent < (uintptr_t)addr) {
      ::munmap((void*)stack_extent, (uintptr_t)(addr - stack_extent));
    }
  }

  return os::commit_memory(addr, size, !ExecMem);
}

// If this is a growable mapping, remove the guard pages entirely by
// munmap()ping them. If not, just call uncommit_memory(). This only
// affects the main/primordial thread, but guard against future OS changes.
// It's safe to always unmap guard pages for the primordial thread because we
// always place them right after the end of the mapped region.

bool os::remove_stack_guard_pages(char* addr, size_t size) {
  if (os::is_primordial_thread()) {
    return ::munmap(addr, size) == 0;
  }

  return os::uncommit_memory(addr, size);
}

// 'requested_addr' is only treated as a hint, the return value may or
// may not start from the requested address. Unlike Linux mmap(), this
// function returns NULL to indicate failure.
static char* anon_mmap(char* requested_addr, size_t bytes) {
  // MAP_FIXED is intentionally left out, to leave existing mappings intact.
  const int flags = MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS;

  // Map reserved/uncommitted pages PROT_NONE so we fail early if we
  // touch an uncommitted page. Otherwise, the read/write might
  // succeed if we have enough swap space to back the physical page.
  char* addr = (char*)::mmap(requested_addr, bytes, PROT_NONE, flags, -1, 0);

  return addr == MAP_FAILED ? NULL : addr;
}

// Allocate (using mmap, NO_RESERVE, with small pages) at either a given request address
// (req_addr != NULL) or with a given alignment.
//  - bytes shall be a multiple of alignment.
//  - req_addr can be NULL. If not NULL, it must be a multiple of alignment.
//  - alignment sets the alignment at which memory shall be allocated.
//    It must be a multiple of allocation granularity.
// Returns address of memory or NULL. If req_addr was not NULL, will only return
// req_addr or NULL.
static char* anon_mmap_aligned(char* req_addr, size_t bytes, size_t alignment) {
  size_t extra_size = bytes;
  if (req_addr == NULL && alignment > 0) {
    extra_size += alignment;
  }

  char* start = anon_mmap(req_addr, extra_size);
  if (start != NULL) {
    if (req_addr != NULL) {
      if (start != req_addr) {
        ::munmap(start, extra_size);
        start = NULL;
      }
    } else {
      char* const start_aligned = align_up(start, alignment);
      char* const end_aligned = start_aligned + bytes;
      char* const end = start + extra_size;
      if (start_aligned > start) {
        ::munmap(start, start_aligned - start);
      }
      if (end_aligned < end) {
        ::munmap(end_aligned, end - end_aligned);
      }
      start = start_aligned;
    }
  }
  return start;
}
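
// Illustrative sketch (not part of the VM): the over-reserve-and-trim idiom
// used by anon_mmap_aligned() above, shown standalone. To get an address
// aligned to 'alignment', reserve bytes + alignment, align up inside the
// reservation, then munmap the unused head and tail.
#if 0
#include <stdint.h>
#include <sys/mman.h>

static void* reserve_aligned(size_t bytes, size_t alignment) {
  size_t extra = bytes + alignment;
  char* raw = (char*)mmap(NULL, extra, PROT_NONE,
                          MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS, -1, 0);
  if (raw == MAP_FAILED) return NULL;
  // Round up to the next multiple of 'alignment' (assumed a power of two).
  char* aligned = (char*)(((uintptr_t)raw + alignment - 1) & ~(uintptr_t)(alignment - 1));
  if (aligned > raw) munmap(raw, aligned - raw);                            // trim head
  char* end = raw + extra;
  if (aligned + bytes < end) munmap(aligned + bytes, end - (aligned + bytes)); // trim tail
  return aligned;
}
#endif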

static int anon_munmap(char * addr, size_t size) {
  return ::munmap(addr, size) == 0;
}

char* os::pd_reserve_memory(size_t bytes, bool exec) {
  return anon_mmap(NULL, bytes);
}

bool os::pd_release_memory(char* addr, size_t size) {
  return anon_munmap(addr, size);
}

#ifdef CAN_SHOW_REGISTERS_ON_ASSERT
extern char* g_assert_poison; // assertion poison page address
#endif

static bool linux_mprotect(char* addr, size_t size, int prot) {
  // Linux wants the mprotect address argument to be page aligned.
  char* bottom = (char*)align_down((intptr_t)addr, os::Linux::page_size());

  // According to SUSv3, mprotect() should only be used with mappings
  // established by mmap(), and mmap() always maps whole pages. Unaligned
  // 'addr' likely indicates a problem in the VM (e.g. trying to change
  // protection of malloc'ed or statically allocated memory). Check the
  // caller if you hit this assert.
  assert(addr == bottom, "sanity check");

  size = align_up(pointer_delta(addr, bottom, 1) + size, os::Linux::page_size());
  // Don't log anything if we're executing in the poison page signal handling
  // context. It can lead to reentrant use of other parts of the VM code.
#ifdef CAN_SHOW_REGISTERS_ON_ASSERT
  if (addr != g_assert_poison)
#endif
  Events::log(NULL, "Protecting memory [" INTPTR_FORMAT "," INTPTR_FORMAT "] with protection modes %x", p2i(bottom), p2i(bottom+size), prot);
  return ::mprotect(bottom, size, prot) == 0;
}

// Set protections specified
bool os::protect_memory(char* addr, size_t bytes, ProtType prot,
                        bool is_committed) {
  unsigned int p = 0;
  switch (prot) {
  case MEM_PROT_NONE: p = PROT_NONE; break;
  case MEM_PROT_READ: p = PROT_READ; break;
  case MEM_PROT_RW:   p = PROT_READ|PROT_WRITE; break;
  case MEM_PROT_RWX:  p = PROT_READ|PROT_WRITE|PROT_EXEC; break;
  default:
    ShouldNotReachHere();
  }
  // is_committed is unused.
  return linux_mprotect(addr, bytes, p);
}

bool os::guard_memory(char* addr, size_t size) {
  return linux_mprotect(addr, size, PROT_NONE);
}

bool os::unguard_memory(char* addr, size_t size) {
  return linux_mprotect(addr, size, PROT_READ|PROT_WRITE);
}

bool os::Linux::transparent_huge_pages_sanity_check(bool warn,
                                                    size_t page_size) {
  bool result = false;
  void *p = mmap(NULL, page_size * 2, PROT_READ|PROT_WRITE,
                 MAP_ANONYMOUS|MAP_PRIVATE,
                 -1, 0);
  if (p != MAP_FAILED) {
    void *aligned_p = align_up(p, page_size);

    result = madvise(aligned_p, page_size, MADV_HUGEPAGE) == 0;

    munmap(p, page_size * 2);
  }

  if (warn && !result) {
    warning("TransparentHugePages is not supported by the operating system.");
  }

  return result;
}

int os::Linux::hugetlbfs_page_size_flag(size_t page_size) {
  if (page_size != default_large_page_size()) {
    return (exact_log2(page_size) << MAP_HUGE_SHIFT);
  }
  return 0;
}
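
// Illustrative note (not part of the VM): how the MAP_HUGE_SHIFT encoding
// above works. The kernel expects log2 of the huge page size in the flag
// bits starting at MAP_HUGE_SHIFT, so a 2 MiB page encodes as
// 21 << MAP_HUGE_SHIFT and a 1 GiB page as 30 << MAP_HUGE_SHIFT (the values
// glibc exposes as MAP_HUGE_2MB and MAP_HUGE_1GB).
#if 0
#include <linux/mman.h>  // MAP_HUGE_SHIFT
#include <sys/mman.h>

// 2 MiB == 1 << 21, so exact_log2(2M) == 21.
static const int flags_2m = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB |
                            (21 << MAP_HUGE_SHIFT);
// 1 GiB == 1 << 30, so exact_log2(1G) == 30.
static const int flags_1g = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB |
                            (30 << MAP_HUGE_SHIFT);
#endif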

bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
  // Include the page size flag to ensure we sanity check the correct page size.
  int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | hugetlbfs_page_size_flag(page_size);
  void *p = mmap(NULL, page_size, PROT_READ|PROT_WRITE, flags, -1, 0);

  if (p != MAP_FAILED) {
    // Mapping succeeded, sanity check passed.
    munmap(p, page_size);
    return true;
  } else {
    log_info(pagesize)("Large page size (" SIZE_FORMAT "%s) failed sanity check, "
                       "checking if smaller large page sizes are usable",
                       byte_size_in_exact_unit(page_size),
                       exact_unit_for_byte_size(page_size));
    for (size_t page_size_ = _page_sizes.next_smaller(page_size);
         page_size_ != (size_t)os::vm_page_size();
         page_size_ = _page_sizes.next_smaller(page_size_)) {
      flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | hugetlbfs_page_size_flag(page_size_);
      p = mmap(NULL, page_size_, PROT_READ|PROT_WRITE, flags, -1, 0);
      if (p != MAP_FAILED) {
        // Mapping succeeded, sanity check passed.
        munmap(p, page_size_);
        log_info(pagesize)("Large page size (" SIZE_FORMAT "%s) passed sanity check",
                           byte_size_in_exact_unit(page_size_),
                           exact_unit_for_byte_size(page_size_));
        return true;
      }
    }
  }

  if (warn) {
    warning("HugeTLBFS is not configured or not supported by the operating system.");
  }

  return false;
}

bool os::Linux::shm_hugetlbfs_sanity_check(bool warn, size_t page_size) {
#ifndef __ANDROID__
  // Try to create a large shared memory segment.
  int shmid = shmget(IPC_PRIVATE, page_size, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W);
  if (shmid == -1) {
    // Possible reasons for shmget failure:
    // 1. shmmax is too small for the request.
    //    > check shmmax value: cat /proc/sys/kernel/shmmax
    //    > increase shmmax value: echo "new_value" > /proc/sys/kernel/shmmax
    // 2. not enough large page memory.
    //    > check available large pages: cat /proc/meminfo
    //    > increase amount of large pages:
    //      sysctl -w vm.nr_hugepages=new_value
    //    > For more information regarding large pages please refer to:
    //      https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
    if (warn) {
      warning("Large pages using UseSHM are not configured on this system.");
    }
    return false;
  }
  // Managed to create a segment, now delete it.
  shmctl(shmid, IPC_RMID, NULL);
  return true;
#else
  warning("UseSHM not supported on this platform");
  return false;
#endif
}

// From the coredump_filter documentation:
//
// - (bit 0) anonymous private memory
// - (bit 1) anonymous shared memory
// - (bit 2) file-backed private memory
// - (bit 3) file-backed shared memory
// - (bit 4) ELF header pages in file-backed private memory areas (it is
//           effective only if the bit 2 is cleared)
// - (bit 5) hugetlb private memory
// - (bit 6) hugetlb shared memory
// - (bit 7) dax private memory
// - (bit 8) dax shared memory
//
static void set_coredump_filter(CoredumpFilterBit bit) {
  FILE *f;
  long cdm;

  if ((f = fopen("/proc/self/coredump_filter", "r+")) == NULL) {
    return;
  }

  if (fscanf(f, "%lx", &cdm) != 1) {
    fclose(f);
    return;
  }

  long saved_cdm = cdm;
  rewind(f);
  cdm |= bit;

  if (cdm != saved_cdm) {
    fprintf(f, "%#lx", cdm);
  }

  fclose(f);
}
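
// Illustrative note (not part of the VM): the same filter can be inspected
// and adjusted from a shell, which is handy when verifying what the VM set
// for a running JVM with pid <pid>:
//   cat /proc/<pid>/coredump_filter          # show the current hex mask
//   echo 0x73 > /proc/<pid>/coredump_filter  # set an explicit mask
// Each CoredumpFilterBit used below ORs one of the bits documented above
// into the mask (e.g. the hugetlb-shared bit for large pages); see
// core(5)/coredump_filter documentation for the authoritative bit layout.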

// Large page support

static size_t _large_page_size = 0;

static size_t scan_default_large_page_size() {
  size_t default_large_page_size = 0;

  // large_page_size on Linux is used to round up heap size. x86 uses either
  // 2M or 4M page, depending on whether PAE (Physical Address Extensions)
  // mode is enabled. AMD64/EM64T uses 2M page in 64bit mode. IA64 can use
  // page as large as 1G.
  //
  // Here we try to figure out page size by parsing /proc/meminfo and looking
  // for a line with the following format:
  //    Hugepagesize:     2048 kB
  //
  // If we can't determine the value (e.g. /proc is not mounted, or the text
  // format has been changed), we'll set the large page size to 0.

  FILE *fp = fopen("/proc/meminfo", "r");
  if (fp) {
    while (!feof(fp)) {
      int x = 0;
      char buf[16];
      if (fscanf(fp, "Hugepagesize: %d", &x) == 1) {
        if (x && fgets(buf, sizeof(buf), fp) && strcmp(buf, " kB\n") == 0) {
          default_large_page_size = x * K;
          break;
        }
      } else {
        // skip to next line
        for (;;) {
          int ch = fgetc(fp);
          if (ch == EOF || ch == (int)'\n') break;
        }
      }
    }
    fclose(fp);
  }

  return default_large_page_size;
}

static os::PageSizes scan_multiple_page_support() {
  // Scan /sys/kernel/mm/hugepages
  // to discover the available page sizes
  const char* sys_hugepages = "/sys/kernel/mm/hugepages";
  os::PageSizes page_sizes;

  DIR *dir = opendir(sys_hugepages);
  if (dir == NULL) {
    // No hugepage directory (e.g. sysfs not mounted); report no sizes
    // rather than passing NULL to readdir below.
    return page_sizes;
  }

  struct dirent *entry;
  size_t page_size;
  while ((entry = readdir(dir)) != NULL) {
    if (entry->d_type == DT_DIR &&
        sscanf(entry->d_name, "hugepages-%zukB", &page_size) == 1) {
      // The kernel is using kB, hotspot uses bytes
      // Add each found Large Page Size to page_sizes
      page_sizes.add(page_size * K);
    }
  }
  closedir(dir);

  LogTarget(Debug, pagesize) lt;
  if (lt.is_enabled()) {
    LogStream ls(lt);
    ls.print("Large Page sizes: ");
    page_sizes.print_on(&ls);
  }

  return page_sizes;
}
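
// Illustrative note (not part of the VM): on a typical x86_64 machine the
// directory scanned above looks like
//   /sys/kernel/mm/hugepages/hugepages-2048kB/
//   /sys/kernel/mm/hugepages/hugepages-1048576kB/
// so the sscanf pattern "hugepages-%zukB" yields 2048 and 1048576, which the
// loop converts to 2M and 1G byte values via 'page_size * K'.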

size_t os::Linux::default_large_page_size() {
  return _default_large_page_size;
}

void warn_no_large_pages_configured() {
  if (!FLAG_IS_DEFAULT(UseLargePages)) {
    log_warning(pagesize)("UseLargePages disabled, no large pages configured and available on the system.");
  }
}

bool os::Linux::setup_large_page_type(size_t page_size) {
  if (FLAG_IS_DEFAULT(UseHugeTLBFS) &&
      FLAG_IS_DEFAULT(UseSHM) &&
      FLAG_IS_DEFAULT(UseTransparentHugePages)) {

    // The type of large pages has not been specified by the user.

    // Try UseHugeTLBFS and then UseSHM.
    UseHugeTLBFS = UseSHM = true;

    // Don't try UseTransparentHugePages since there are known
    // performance issues with it turned on. This might change in the future.
    UseTransparentHugePages = false;
  }

  if (UseTransparentHugePages) {
    bool warn_on_failure = !FLAG_IS_DEFAULT(UseTransparentHugePages);
    if (transparent_huge_pages_sanity_check(warn_on_failure, page_size)) {
      UseHugeTLBFS = false;
      UseSHM = false;
      return true;
    }
    UseTransparentHugePages = false;
  }

  if (UseHugeTLBFS) {
    bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
    if (hugetlbfs_sanity_check(warn_on_failure, page_size)) {
      UseSHM = false;
      return true;
    }
    UseHugeTLBFS = false;
  }

  if (UseSHM) {
    bool warn_on_failure = !FLAG_IS_DEFAULT(UseSHM);
    if (shm_hugetlbfs_sanity_check(warn_on_failure, page_size)) {
      return true;
    }
    UseSHM = false;
  }

  warn_no_large_pages_configured();
  return false;
}

void os::large_page_init() {
  // 1) Handle the case where we do not want to use huge pages and hence
  //    there is no need to scan the OS for related info
  if (!UseLargePages &&
      !UseTransparentHugePages &&
      !UseHugeTLBFS &&
      !UseSHM) {
    // Not using large pages.
    return;
  }

  if (!FLAG_IS_DEFAULT(UseLargePages) && !UseLargePages) {
    // The user explicitly turned off large pages.
    // Ignore the rest of the large pages flags.
    UseTransparentHugePages = false;
    UseHugeTLBFS = false;
    UseSHM = false;
    return;
  }

  // 2) Scan OS info
  size_t default_large_page_size = scan_default_large_page_size();
  os::Linux::_default_large_page_size = default_large_page_size;
  if (default_large_page_size == 0) {
    // No large pages configured, return.
    warn_no_large_pages_configured();
    UseLargePages = false;
    UseTransparentHugePages = false;
    UseHugeTLBFS = false;
    UseSHM = false;
    return;
  }
  os::PageSizes all_large_pages = scan_multiple_page_support();

  // 3) Consistency check and post-processing

  // It is unclear if /sys/kernel/mm/hugepages/ and /proc/meminfo could disagree. Manually
  // re-add the default page size to the list of page sizes to be sure.
  all_large_pages.add(default_large_page_size);

  // Check that LargePageSizeInBytes matches an available page size; if so, set
  // _large_page_size using LargePageSizeInBytes as the maximum allowed large page
  // size. If LargePageSizeInBytes doesn't match an available page size, set
  // _large_page_size to default_large_page_size and use it as the maximum.
  if (FLAG_IS_DEFAULT(LargePageSizeInBytes) ||
      LargePageSizeInBytes == 0 ||
      LargePageSizeInBytes == default_large_page_size) {
    _large_page_size = default_large_page_size;
    log_info(pagesize)("Using the default large page size: " SIZE_FORMAT "%s",
                       byte_size_in_exact_unit(_large_page_size),
                       exact_unit_for_byte_size(_large_page_size));
  } else {
    if (all_large_pages.contains(LargePageSizeInBytes)) {
      _large_page_size = LargePageSizeInBytes;
      log_info(pagesize)("Overriding default large page size (" SIZE_FORMAT "%s) "
                         "using LargePageSizeInBytes: " SIZE_FORMAT "%s",
                         byte_size_in_exact_unit(default_large_page_size),
                         exact_unit_for_byte_size(default_large_page_size),
                         byte_size_in_exact_unit(_large_page_size),
                         exact_unit_for_byte_size(_large_page_size));
    } else {
      _large_page_size = default_large_page_size;
      log_info(pagesize)("LargePageSizeInBytes is not a valid large page size (" SIZE_FORMAT "%s) "
                         "using the default large page size: " SIZE_FORMAT "%s",
                         byte_size_in_exact_unit(LargePageSizeInBytes),
                         exact_unit_for_byte_size(LargePageSizeInBytes),
                         byte_size_in_exact_unit(_large_page_size),
                         exact_unit_for_byte_size(_large_page_size));
    }
  }

  // Populate _page_sizes with large page sizes less than or equal to
  // _large_page_size.
  for (size_t page_size = _large_page_size; page_size != 0;
       page_size = all_large_pages.next_smaller(page_size)) {
    _page_sizes.add(page_size);
  }

  LogTarget(Info, pagesize) lt;
  if (lt.is_enabled()) {
    LogStream ls(lt);
    ls.print("Usable page sizes: ");
    _page_sizes.print_on(&ls);
  }

  // Now determine the type of large pages to use:
  UseLargePages = os::Linux::setup_large_page_type(_large_page_size);

  set_coredump_filter(LARGEPAGES_BIT);
}

#ifndef SHM_HUGETLB
  #define SHM_HUGETLB 04000
#endif

#ifndef __ANDROID__

#define shm_warning_format(format, ...)              \
  do {                                               \
    if (UseLargePages &&                             \
        (!FLAG_IS_DEFAULT(UseLargePages) ||          \
         !FLAG_IS_DEFAULT(UseSHM) ||                 \
         !FLAG_IS_DEFAULT(LargePageSizeInBytes))) {  \
      warning(format, __VA_ARGS__);                  \
    }                                                \
  } while (0)

#define shm_warning(str) shm_warning_format("%s", str)

#define shm_warning_with_errno(str)                \
  do {                                             \
    int err = errno;                               \
    shm_warning_format(str " (error = %d)", err);  \
  } while (0)

static char* shmat_with_alignment(int shmid, size_t bytes, size_t alignment) {
  assert(is_aligned(bytes, alignment), "Must be divisible by the alignment");

  if (!is_aligned(alignment, SHMLBA)) {
    assert(false, "Code below assumes that alignment is at least SHMLBA aligned");
    return NULL;
  }

  // To ensure that we get 'alignment' aligned memory from shmat,
  // we pre-reserve aligned virtual memory and then attach to that.
  char* pre_reserved_addr = anon_mmap_aligned(NULL /* req_addr */, bytes, alignment);
  if (pre_reserved_addr == NULL) {
    // Couldn't pre-reserve aligned memory.
    shm_warning("Failed to pre-reserve aligned memory for shmat.");
    return NULL;
  }

  // SHM_REMAP is needed to allow shmat to map over an existing mapping.
  char* addr = (char*)shmat(shmid, pre_reserved_addr, SHM_REMAP);

  if ((intptr_t)addr == -1) {
    int err = errno;
    shm_warning_with_errno("Failed to attach shared memory.");

    assert(err != EACCES, "Unexpected error");
    assert(err != EIDRM,  "Unexpected error");
    assert(err != EINVAL, "Unexpected error");

    // Since we don't know if the kernel unmapped the pre-reserved memory area
    // we can't unmap it, since that would potentially unmap memory that was
    // mapped from other threads.
    return NULL;
  }

  return addr;
}

static char* shmat_at_address(int shmid, char* req_addr) {
  if (!is_aligned(req_addr, SHMLBA)) {
    assert(false, "Requested address needs to be SHMLBA aligned");
    return NULL;
  }

  char* addr = (char*)shmat(shmid, req_addr, 0);

  if ((intptr_t)addr == -1) {
    shm_warning_with_errno("Failed to attach shared memory.");
    return NULL;
  }

  return addr;
}

static char* shmat_large_pages(int shmid, size_t bytes, size_t alignment, char* req_addr) {
  // If a req_addr has been provided, we assume that the caller has already aligned the address.
  if (req_addr != NULL) {
    assert(is_aligned(req_addr, os::large_page_size()), "Must be divisible by the large page size");
    assert(is_aligned(req_addr, alignment), "Must be divisible by given alignment");
    return shmat_at_address(shmid, req_addr);
  }

  // Since shmid has been setup with SHM_HUGETLB, shmat will automatically
  // return large page size aligned memory addresses when req_addr == NULL.
  // However, if the alignment is larger than the large page size, we have
  // to manually ensure that the memory returned is 'alignment' aligned.
  if (alignment > os::large_page_size()) {
    assert(is_aligned(alignment, os::large_page_size()), "Must be divisible by the large page size");
    return shmat_with_alignment(shmid, bytes, alignment);
  } else {
    return shmat_at_address(shmid, NULL);
  }
}

#endif // !__ANDROID__

char* os::Linux::reserve_memory_special_shm(size_t bytes, size_t alignment,
                                            char* req_addr, bool exec) {
#ifndef __ANDROID__
  // "exec" is passed in but not used. Creating the shared image for
  // the code cache doesn't have an SHM_X executable permission to check.
  assert(UseLargePages && UseSHM, "only for SHM large pages");
  assert(is_aligned(req_addr, os::large_page_size()), "Unaligned address");
  assert(is_aligned(req_addr, alignment), "Unaligned address");

  if (!is_aligned(bytes, os::large_page_size())) {
    return NULL; // Fallback to small pages.
  }

  // Create a large shared memory region to attach to based on size.
  // Currently, size is the total size of the heap.
  int shmid = shmget(IPC_PRIVATE, bytes, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W);
  if (shmid == -1) {
    // Possible reasons for shmget failure:
    // 1. shmmax is too small for the request.
    //    > check shmmax value: cat /proc/sys/kernel/shmmax
    //    > increase shmmax value: echo "new_value" > /proc/sys/kernel/shmmax
    // 2. not enough large page memory.
    //    > check available large pages: cat /proc/meminfo
    //    > increase amount of large pages:
    //      sysctl -w vm.nr_hugepages=new_value
    //    > For more information regarding large pages please refer to:
    //      https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
    // Note 1: different Linux distributions may use a different name for
    //         this property, e.g. on Redhat AS-3 it is "hugetlb_pool".
    // Note 2: it's possible there's enough physical memory available but
    //         it is so fragmented after a long run that it can't
    //         coalesce into large pages. Try to reserve large pages when
    //         the system is still "fresh".
    shm_warning_with_errno("Failed to reserve shared memory.");
    return NULL;
  }

  // Attach to the region.
  char* addr = shmat_large_pages(shmid, bytes, alignment, req_addr);

  // Remove shmid. If shmat() is successful, the actual shared memory segment
  // will be deleted when it's detached by shmdt() or when the process
  // terminates. If shmat() is not successful this will remove the shared
  // segment immediately.
  shmctl(shmid, IPC_RMID, NULL);

  return addr;
#else
  assert(0, "SHM not supported on this platform");
  return NULL;
#endif // !__ANDROID__
}

static void warn_on_commit_special_failure(char* req_addr, size_t bytes,
                                           size_t page_size, int error) {
  assert(error == ENOMEM, "Only expect to fail if no memory is available");

  bool warn_on_failure = UseLargePages &&
      (!FLAG_IS_DEFAULT(UseLargePages) ||
       !FLAG_IS_DEFAULT(UseHugeTLBFS) ||
       !FLAG_IS_DEFAULT(LargePageSizeInBytes));

  if (warn_on_failure) {
    char msg[128];
    jio_snprintf(msg, sizeof(msg), "Failed to reserve and commit memory. req_addr: "
                 PTR_FORMAT " bytes: " SIZE_FORMAT " page size: "
                 SIZE_FORMAT " (errno = %d).",
                 p2i(req_addr), bytes, page_size, error);
    warning("%s", msg);
  }
}

bool os::Linux::commit_memory_special(size_t bytes,
                                      size_t page_size,
                                      char* req_addr,
                                      bool exec) {
  assert(UseLargePages && UseHugeTLBFS, "Should only get here when HugeTLBFS large pages are used");
  assert(is_aligned(bytes, page_size), "Unaligned size");
  assert(is_aligned(req_addr, page_size), "Unaligned address");
  assert(req_addr != NULL, "Must have a requested address for special mappings");

  int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
  int flags = MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED;

  // For large pages additional flags are required.
  if (page_size > (size_t) os::vm_page_size()) {
    flags |= MAP_HUGETLB | hugetlbfs_page_size_flag(page_size);
  }
  char* addr = (char*)::mmap(req_addr, bytes, prot, flags, -1, 0);

  if (addr == MAP_FAILED) {
    warn_on_commit_special_failure(req_addr, bytes, page_size, errno);
    return false;
  }

  log_debug(pagesize)("Commit special mapping: " PTR_FORMAT ", size=" SIZE_FORMAT "%s, page size="
                      SIZE_FORMAT "%s",
                      p2i(addr), byte_size_in_exact_unit(bytes),
                      exact_unit_for_byte_size(bytes),
                      byte_size_in_exact_unit(page_size),
                      exact_unit_for_byte_size(page_size));
  assert(is_aligned(addr, page_size), "Must be");
  return true;
}

char* os::Linux::reserve_memory_special_huge_tlbfs(size_t bytes,
                                                   size_t alignment,
                                                   size_t page_size,
                                                   char* req_addr,
                                                   bool exec) {
  assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
  assert(is_aligned(req_addr, alignment), "Must be");
  assert(is_aligned(req_addr, page_size), "Must be");
  assert(is_aligned(alignment, os::vm_allocation_granularity()), "Must be");
  assert(_page_sizes.contains(page_size), "Must be a valid page size");
  assert(page_size > (size_t)os::vm_page_size(), "Must be a large page size");
  assert(bytes >= page_size, "Shouldn't allocate large pages for small sizes");

  // We only end up here when at least 1 large page can be used.
  // If the size is not a multiple of the large page size, we
  // will mix the type of pages used, but in a descending order.
  // Start off by reserving a range of the given size that is
  // properly aligned. At this point no pages are committed. If
  // a requested address is given it will be used and it must be
  // aligned to both the large page size and the given alignment.
  // The larger of the two will be used.
  size_t required_alignment = MAX2(page_size, alignment);
  char* const aligned_start = anon_mmap_aligned(req_addr, bytes, required_alignment);
  if (aligned_start == NULL) {
    return NULL;
  }

  // First commit using large pages.
  size_t large_bytes = align_down(bytes, page_size);
  bool large_committed = commit_memory_special(large_bytes, page_size, aligned_start, exec);

  if (large_committed && bytes == large_bytes) {
    // The size was large page aligned and everything was committed, so
    // no additional work is needed.
    return aligned_start;
  }

  // The requested size requires some small pages as well.
  char* small_start = aligned_start + large_bytes;
  size_t small_size = bytes - large_bytes;
  if (!large_committed) {
    // Failed to commit large pages, so we need to unmap the remainder of
    // the original reservation. The large page part lost its reservation
    // when the commit failed, so only the small page part is unmapped here.
    ::munmap(small_start, small_size);
    return NULL;
  }

  // Commit the remaining bytes using small pages.
  bool small_committed = commit_memory_special(small_size, os::vm_page_size(), small_start, exec);
  if (!small_committed) {
    // Failed to commit the remaining size, need to unmap
    // the large pages part of the reservation.
    ::munmap(aligned_start, large_bytes);
    return NULL;
  }
  return aligned_start;
}

char* os::pd_reserve_memory_special(size_t bytes, size_t alignment, size_t page_size,
                                    char* req_addr, bool exec) {
  assert(UseLargePages, "only for large pages");

  char* addr;
  if (UseSHM) {
    // No support for using specific page sizes with SHM.
    addr = os::Linux::reserve_memory_special_shm(bytes, alignment, req_addr, exec);
  } else {
    assert(UseHugeTLBFS, "must be");
    addr = os::Linux::reserve_memory_special_huge_tlbfs(bytes, alignment, page_size, req_addr, exec);
  }

  if (addr != NULL) {
    if (UseNUMAInterleaving) {
      numa_make_global(addr, bytes);
    }
  }

  return addr;
}

bool os::Linux::release_memory_special_shm(char* base, size_t bytes) {
#ifndef __ANDROID__
  // detaching the SHM segment will also delete it, see reserve_memory_special_shm()
  return shmdt(base) == 0;
#else
  assert(0, "SHM not supported on this platform");
  return false;
#endif // !__ANDROID__
}

bool os::Linux::release_memory_special_huge_tlbfs(char* base, size_t bytes) {
  return pd_release_memory(base, bytes);
}

bool os::pd_release_memory_special(char* base, size_t bytes) {
  assert(UseLargePages, "only for large pages");
  bool res;

  if (UseSHM) {
    res = os::Linux::release_memory_special_shm(base, bytes);
  } else {
    assert(UseHugeTLBFS, "must be");
    res = os::Linux::release_memory_special_huge_tlbfs(base, bytes);
  }
  return res;
}

size_t os::large_page_size() {
  return _large_page_size;
}

// With SysV SHM the entire memory region must be allocated as shared
// memory.
// HugeTLBFS allows the application to commit large page memory on demand.
// However, when committing memory with HugeTLBFS fails, the region
// that was supposed to be committed will lose the old reservation
// and allow other threads to steal that memory region. Because of this
// behavior we can't commit HugeTLBFS memory.
bool os::can_commit_large_page_memory() {
  return UseTransparentHugePages;
}

bool os::can_execute_large_page_memory() {
  return UseTransparentHugePages || UseHugeTLBFS;
}

char* os::pd_attempt_map_memory_to_file_at(char* requested_addr, size_t bytes, int file_desc) {
  assert(file_desc >= 0, "file_desc is not valid");
  char* result = pd_attempt_reserve_memory_at(requested_addr, bytes, !ExecMem);
  if (result != NULL) {
    if (replace_existing_mapping_with_file_mapping(result, bytes, file_desc) == NULL) {
      vm_exit_during_initialization(err_msg("Error in mapping Java heap at the given filesystem directory"));
    }
  }
  return result;
}

// Reserve memory at an arbitrary address, only if that area is
// available (and not reserved for something else).

char* os::pd_attempt_reserve_memory_at(char* requested_addr, size_t bytes, bool exec) {
  // Assert only that the size is a multiple of the page size, since
  // that's all that mmap requires, and since that's all we really know
  // about at this low abstraction level. If we need higher alignment,
  // we can either pass an alignment to this method or verify alignment
  // in one of the methods further up the call chain. See bug 5044738.
  assert(bytes % os::vm_page_size() == 0, "reserving unexpected size block");

  // Linux mmap allows the caller to pass an address as a hint; give it a try
  // first. If the kernel honors the hint then we can return immediately.
  char * addr = anon_mmap(requested_addr, bytes);
  if (addr == requested_addr) {
    return requested_addr;
  }

  if (addr != NULL) {
    // mmap() is successful but it failed to reserve at the requested address
    anon_munmap(addr, bytes);
  }

  return NULL;
}

// Sleep forever; naked call to OS-specific sleep; use with CAUTION
void os::infinite_sleep() {
  while (true) {    // sleep forever ...
    ::sleep(100);   // ... 100 seconds at a time
  }
}

// Used to convert frequent JVM_Yield() to nops
bool os::dont_yield() {
  return DontYieldALot;
}

// Linux CFS scheduler (since 2.6.23) does not guarantee sched_yield(2) will
// actually give up the CPU. Since skip buddy (v2.6.28):
//
// * Sets the yielding task as skip buddy for current CPU's run queue.
// * Picks next from run queue, if empty, picks a skip buddy (can be the yielding task).
// * Clears skip buddies for this run queue (yielding task no longer a skip buddy).
//
// An alternative is calling os::naked_short_nanosleep with a small number to avoid
// getting re-scheduled immediately.
//
void os::naked_yield() {
  sched_yield();
}

////////////////////////////////////////////////////////////////////////////////
// thread priority support

// Note: Normal Linux applications are run with SCHED_OTHER policy. SCHED_OTHER
// only supports dynamic priority, static priority must be zero. For real-time
// applications, Linux supports SCHED_RR which allows static priority (1-99).
// However, for large multi-threaded applications, SCHED_RR is not only slower
// than SCHED_OTHER, but also very unstable (my volano tests hang hard 4 out
// of 5 runs - Sep 2005).
//
// The following code actually changes the niceness of kernel-thread/LWP. It
// has an assumption that setpriority() only modifies one kernel-thread/LWP,
// not the entire user process, and user level threads are 1:1 mapped to kernel
// threads. It has always been the case, but could change in the future. For
// this reason, the code should not be used as default (ThreadPriorityPolicy=0).
// It is only used when ThreadPriorityPolicy=1 and may require system level permission
// (e.g., root privilege or CAP_SYS_NICE capability).

int os::java_to_os_priority[CriticalPriority + 1] = {
  19,              // 0 Entry should never be used

   4,              // 1 MinPriority
   3,              // 2
   2,              // 3

   1,              // 4
   0,              // 5 NormPriority
  -1,              // 6

  -2,              // 7
  -3,              // 8
  -4,              // 9 NearMaxPriority

  -5,              // 10 MaxPriority

  -5               // 11 CriticalPriority
};

static int prio_init() {
  if (ThreadPriorityPolicy == 1) {
    if (geteuid() != 0) {
      if (!FLAG_IS_DEFAULT(ThreadPriorityPolicy) && !FLAG_IS_JIMAGE_RESOURCE(ThreadPriorityPolicy)) {
        warning("-XX:ThreadPriorityPolicy=1 may require system level permission, " \
                "e.g., being the root user. If the necessary permission is not " \
                "possessed, changes to priority will be silently ignored.");
      }
    }
  }
  if (UseCriticalJavaThreadPriority) {
    os::java_to_os_priority[MaxPriority] = os::java_to_os_priority[CriticalPriority];
  }
  return 0;
}

OSReturn os::set_native_priority(Thread* thread, int newpri) {
  if (!UseThreadPriorities || ThreadPriorityPolicy == 0) return OS_OK;

  int ret = setpriority(PRIO_PROCESS, thread->osthread()->thread_id(), newpri);
  return (ret == 0) ? OS_OK : OS_ERR;
}

OSReturn os::get_native_priority(const Thread* const thread,
                                 int *priority_ptr) {
  if (!UseThreadPriorities || ThreadPriorityPolicy == 0) {
    *priority_ptr = java_to_os_priority[NormPriority];
    return OS_OK;
  }

  errno = 0;
  *priority_ptr = getpriority(PRIO_PROCESS, thread->osthread()->thread_id());
  return (*priority_ptr != -1 || errno == 0 ? OS_OK : OS_ERR);
}

// This is the fastest way to get thread cpu time on Linux.
// Returns cpu time (user+sys) for any thread, not only for current.
// POSIX compliant clocks are implemented in kernels 2.6.16+.
// It might work on 2.6.10+ with a special kernel/glibc patch.
// For reference, see IEEE Std 1003.1-2004:
// http://www.unix.org/single_unix_specification

jlong os::Linux::fast_thread_cpu_time(clockid_t clockid) {
  struct timespec tp;
  int status = clock_gettime(clockid, &tp);
  assert(status == 0, "clock_gettime error: %s", os::strerror(errno));
  return (tp.tv_sec * NANOSECS_PER_SEC) + tp.tv_nsec;
}
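
// Illustrative sketch (not part of the VM): obtaining a per-thread CPU-time
// clockid with POSIX pthread_getcpuclockid() and reading it with
// clock_gettime(), which is the same kind of clockid the function above
// consumes. Compile with -pthread.
#if 0
#include <pthread.h>
#include <stdio.h>
#include <time.h>

int main() {
  clockid_t cid;
  // Per-thread CPU clock for the calling thread.
  if (pthread_getcpuclockid(pthread_self(), &cid) != 0) return 1;
  struct timespec tp;
  if (clock_gettime(cid, &tp) != 0) return 1;
  printf("thread cpu time: %lld.%09ld s\n", (long long)tp.tv_sec, tp.tv_nsec);
  return 0;
}
#endif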

// Determine if the vmid is the parent pid for a child in a PID namespace.
// Return the namespace pid if so, otherwise -1.
int os::Linux::get_namespace_pid(int vmid) {
  char fname[24];
  int retpid = -1;

  snprintf(fname, sizeof(fname), "/proc/%d/status", vmid);
  FILE *fp = fopen(fname, "r");

  if (fp) {
    int pid, nspid;
    int ret;
    while (!feof(fp) && !ferror(fp)) {
      ret = fscanf(fp, "NSpid: %d %d", &pid, &nspid);
      if (ret == 1) {
        break;
      }
      if (ret == 2) {
        retpid = nspid;
        break;
      }
      for (;;) {
        int ch = fgetc(fp);
        if (ch == EOF || ch == (int)'\n') break;
      }
    }
    fclose(fp);
  }
  return retpid;
}
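
// Illustrative note (not part of the VM): for a process running inside a PID
// namespace (e.g. a container), /proc/<pid>/status as seen from the host
// typically contains a line such as
//   NSpid:  12345   1
// where 12345 is the host pid and 1 is the pid inside the namespace; the
// two-field fscanf match above then returns 1 as the namespace pid. Outside
// a namespace the line has a single field and the function returns -1.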

extern void report_error(char* file_name, int line_no, char* title,
                         char* format, ...);

// Some Linux distributions (notably: Alpine Linux) include grsecurity
// in the kernel. Of particular interest from a JVM perspective
// is PaX (https://pax.grsecurity.net/), which adds some security features
// related to page attributes. Specifically, the MPROTECT PaX functionality
// (https://pax.grsecurity.net/docs/mprotect.txt) prevents dynamic
// code generation by disallowing a (previously) writable page to be
// marked as executable. This is, of course, exactly what HotSpot does
// for JIT compiled methods, as well as for stubs, adapters, etc.
//
// Instead of crashing "lazily" when trying to make a page executable,
// this code probes for the presence of PaX and reports the failure
// eagerly.
static void check_pax(void) {
  // Zero doesn't generate code dynamically, so no need to perform the PaX check
#ifndef ZERO
  size_t size = os::Linux::page_size();

  void* p = ::mmap(NULL, size, PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
  if (p == MAP_FAILED) {
    log_debug(os)("os_linux.cpp: check_pax: mmap failed (%s)" , os::strerror(errno));
    vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "failed to allocate memory for PaX check.");
  }

  int res = ::mprotect(p, size, PROT_WRITE|PROT_EXEC);
  if (res == -1) {
    log_debug(os)("os_linux.cpp: check_pax: mprotect failed (%s)" , os::strerror(errno));
    vm_exit_during_initialization(
      "Failed to mark memory page as executable - check if grsecurity/PaX is enabled");
  }

  ::munmap(p, size);
#endif
}

// this is called _before_ most of the global arguments have been parsed
void os::init(void) {
  char dummy;   // used to get a guess on initial stack address

  clock_tics_per_sec = sysconf(_SC_CLK_TCK);

  Linux::set_page_size(sysconf(_SC_PAGESIZE));
  if (Linux::page_size() == -1) {
    fatal("os_linux.cpp: os::init: sysconf failed (%s)",
          os::strerror(errno));
  }
  _page_sizes.add(Linux::page_size());

  Linux::initialize_system_info();

#ifdef __GLIBC__
  Linux::_mallinfo = CAST_TO_FN_PTR(Linux::mallinfo_func_t, dlsym(RTLD_DEFAULT, "mallinfo"));
  Linux::_mallinfo2 = CAST_TO_FN_PTR(Linux::mallinfo2_func_t, dlsym(RTLD_DEFAULT, "mallinfo2"));
#endif // __GLIBC__

  os::Linux::CPUPerfTicks pticks;
  bool res = os::Linux::get_tick_information(&pticks, -1);

  if (res && pticks.has_steal_ticks) {
    has_initial_tick_info = true;
    initial_total_ticks = pticks.total;
    initial_steal_ticks = pticks.steal;
  }

  // _main_thread points to the thread that created/loaded the JVM.
  Linux::_main_thread = pthread_self();

  // retrieve entry point for pthread_setname_np
  Linux::_pthread_setname_np =
    (int(*)(pthread_t, const char*))dlsym(RTLD_DEFAULT, "pthread_setname_np");

  check_pax();

  os::Posix::init();

  initial_time_count = javaTimeNanos();
}

// To install functions for atexit system call
extern "C" {
  static void perfMemory_exit_helper() {
    perfMemory_exit();
  }
}

void os::pd_init_container_support() {
  OSContainer::init();
}

void os::Linux::numa_init() {

  // Java can be invoked as
  // 1. Without numactl: the heap will be allocated/configured on all nodes as
  //    per the system policy.
  // 2. With numactl --interleave:
  //      Use the numa_get_interleave_mask(v2) API to get the nodes bitmask. In
  //      the membind case the same API returns an empty bitmask.
  //      Interleave is only a hint and the kernel can fall back to other nodes
  //      if no memory is available on the target nodes.
  // 3. With numactl --membind:
  //      Use the numa_get_membind(v2) API to get the nodes bitmask. In the
  //      interleave case the same API returns a bitmask of all nodes.
  // numa_all_nodes_ptr holds a bitmask of all nodes.
  // The numa_get_interleave_mask(v2) and numa_get_membind(v2) APIs return the
  // correct bitmask when externally configured to run on all or fewer nodes.

  if (!Linux::libnuma_init()) {
    FLAG_SET_ERGO(UseNUMA, false);
    FLAG_SET_ERGO(UseNUMAInterleaving, false); // Also depends on libnuma.
  } else {
    if ((Linux::numa_max_node() < 1) || Linux::is_bound_to_single_node()) {
      // If there's only one node (they start from 0) or if the process
      // is bound explicitly to a single node using membind, disable NUMA
      UseNUMA = false;
    } else {
      LogTarget(Info,os) log;
      LogStream ls(log);

      Linux::set_configured_numa_policy(Linux::identify_numa_policy());

      struct bitmask* bmp = Linux::_numa_membind_bitmask;
      const char* numa_mode = "membind";

      if (Linux::is_running_in_interleave_mode()) {
        bmp = Linux::_numa_interleave_bitmask;
        numa_mode = "interleave";
      }

      ls.print("UseNUMA is enabled and invoked in '%s' mode."
               " Heap will be configured using NUMA memory nodes:", numa_mode);

      for (int node = 0; node <= Linux::numa_max_node(); node++) {
        if (Linux::_numa_bitmask_isbitset(bmp, node)) {
          ls.print(" %d", node);
        }
      }
    }
  }

  // When NUMA requested, not-NUMA-aware allocations default to interleaving.
  if (UseNUMA && !UseNUMAInterleaving) {
    FLAG_SET_ERGO_IF_DEFAULT(UseNUMAInterleaving, true);
  }

  if (UseParallelGC && UseNUMA && UseLargePages && !can_commit_large_page_memory()) {
    // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way
    // we can make the adaptive lgrp chunk resizing work. If the user specified both
    // UseNUMA and UseLargePages (or UseSHM/UseHugeTLBFS) on the command line - warn
    // and disable adaptive resizing.
    if (UseAdaptiveSizePolicy || UseAdaptiveNUMAChunkSizing) {
      warning("UseNUMA is not fully compatible with SHM/HugeTLBFS large pages, "
              "disabling adaptive resizing (-XX:-UseAdaptiveSizePolicy -XX:-UseAdaptiveNUMAChunkSizing)");
      UseAdaptiveSizePolicy = false;
      UseAdaptiveNUMAChunkSizing = false;
    }
  }
}

// this is called _after_ the global arguments have been parsed
jint os::init_2(void) {

  // This could be set after os::Posix::init() but all platforms
  // have to set it the same so we have to mirror Solaris.
  DEBUG_ONLY(os::set_mutex_init_done();)

  os::Posix::init_2();

  Linux::fast_thread_clock_init();

  if (PosixSignals::init() == JNI_ERR) {
    return JNI_ERR;
  }

  if (AdjustStackSizeForTLS) {
    get_minstack_init();
  }

  // Check and set minimum stack sizes against command line options
  if (Posix::set_minimum_stack_sizes() == JNI_ERR) {
    return JNI_ERR;
  }

#if defined(IA32) && !defined(ZERO)
  // Need to ensure we've determined the process's initial stack to
  // perform the workaround
  Linux::capture_initial_stack(JavaThread::stack_size_at_create());
  workaround_expand_exec_shield_cs_limit();
#else
  suppress_primordial_thread_resolution = Arguments::created_by_java_launcher();
  if (!suppress_primordial_thread_resolution) {
    Linux::capture_initial_stack(JavaThread::stack_size_at_create());
  }
#endif

  Linux::libpthread_init();
  Linux::sched_getcpu_init();
  log_info(os)("HotSpot is running with %s, %s",
               Linux::libc_version(), Linux::libpthread_version());

  if (UseNUMA || UseNUMAInterleaving) {
    Linux::numa_init();
  }

  if (MaxFDLimit) {
    // Set the number of file descriptors to the maximum; print an error
    // if getrlimit/setrlimit fails, but continue regardless.
    struct rlimit nbr_files;
    int status = getrlimit(RLIMIT_NOFILE, &nbr_files);
    if (status != 0) {
      log_info(os)("os::init_2 getrlimit failed: %s", os::strerror(errno));
    } else {
      nbr_files.rlim_cur = nbr_files.rlim_max;
      status = setrlimit(RLIMIT_NOFILE, &nbr_files);
      if (status != 0) {
        log_info(os)("os::init_2 setrlimit failed: %s", os::strerror(errno));
      }
    }
  }

  // at-exit methods are called in the reverse order of their registration.
  // atexit functions are called on return from main or as a result of a
  // call to exit(3C). There can be only 32 of these functions registered
  // and atexit() does not set errno.

  if (PerfAllowAtExitRegistration) {
    // only register atexit functions if PerfAllowAtExitRegistration is set.
    // atexit functions can be delayed until process exit time, which
    // can be problematic for embedded VM situations. Embedded VMs should
    // call DestroyJavaVM() to assure that VM resources are released.

    // note: perfMemory_exit_helper atexit function may be removed in
    // the future if the appropriate cleanup code can be added to the
    // VM_Exit VMOperation's doit method.
    if (atexit(perfMemory_exit_helper) != 0) {
      warning("os::init_2 atexit(perfMemory_exit_helper) failed");
    }
  }

  // initialize thread priority policy
  prio_init();

  if (!FLAG_IS_DEFAULT(AllocateHeapAt)) {
    set_coredump_filter(DAX_SHARED_BIT);
  }

  if (DumpPrivateMappingsInCore) {
    set_coredump_filter(FILE_BACKED_PVT_BIT);
  }

  if (DumpSharedMappingsInCore) {
    set_coredump_filter(FILE_BACKED_SHARED_BIT);
  }

  if (DumpPerfMapAtExit && FLAG_IS_DEFAULT(UseCodeCacheFlushing)) {
    // Disable code cache flushing to ensure the map file written at
    // exit contains all nmethods generated during execution.
    FLAG_SET_DEFAULT(UseCodeCacheFlushing, false);
  }

  return JNI_OK;
}
4673

// older glibc versions don't have this macro (which expands to
// an optimized bit-counting function) so we have to roll our own
#ifndef CPU_COUNT

static int _cpu_count(const cpu_set_t* cpus) {
  int count = 0;
  // only look up to the number of configured processors
  for (int i = 0; i < os::processor_count(); i++) {
    if (CPU_ISSET(i, cpus)) {
      count++;
    }
  }
  return count;
}

#define CPU_COUNT(cpus) _cpu_count(cpus)

#endif // CPU_COUNT
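
// For reference: glibc has provided CPU_COUNT since version 2.6, where it
// expands to an optimized population count over the whole cpu_set_t; the
// fallback above scans only the first os::processor_count() slots.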

// Get the current number of available processors for this process.
// This value can change at any time during a process's lifetime.
// sched_getaffinity gives an accurate answer as it accounts for cpusets.
// If it appears there may be more than 1024 processors then we do a
// dynamic check - see 6515172 for details.
// If anything goes wrong we fall back to returning the number of online
// processors - which can be greater than the number available to the process.
int os::Linux::active_processor_count() {
  cpu_set_t cpus;  // can represent at most 1024 (CPU_SETSIZE) processors
  cpu_set_t* cpus_p = &cpus;
  int cpus_size = sizeof(cpu_set_t);

  int configured_cpus = os::processor_count();  // upper bound on available cpus
  int cpu_count = 0;

// old build platforms may not support dynamic cpu sets
#ifdef CPU_ALLOC

  // To enable easy testing of the dynamic path on different platforms we
  // introduce a diagnostic flag: UseCpuAllocPath
  if (configured_cpus >= CPU_SETSIZE || UseCpuAllocPath) {
    // kernel may use a mask bigger than cpu_set_t
    log_trace(os)("active_processor_count: using dynamic path %s"
                  "- configured processors: %d",
                  UseCpuAllocPath ? "(forced) " : "",
                  configured_cpus);
    cpus_p = CPU_ALLOC(configured_cpus);
    if (cpus_p != NULL) {
      cpus_size = CPU_ALLOC_SIZE(configured_cpus);
      // zero it just to be safe
      CPU_ZERO_S(cpus_size, cpus_p);
    } else {
      // failed to allocate so fall back to online cpus
      int online_cpus = ::sysconf(_SC_NPROCESSORS_ONLN);
      log_trace(os)("active_processor_count: "
                    "CPU_ALLOC failed (%s) - using "
                    "online processor count: %d",
                    os::strerror(errno), online_cpus);
      return online_cpus;
    }
  } else {
    log_trace(os)("active_processor_count: using static path - configured processors: %d",
                  configured_cpus);
  }
#else // CPU_ALLOC
// these stubs won't be executed
#define CPU_COUNT_S(size, cpus) -1
#define CPU_FREE(cpus)

  log_trace(os)("active_processor_count: only static path available - configured processors: %d",
                configured_cpus);
#endif // CPU_ALLOC

  // pid 0 means the current thread - which we have to assume represents the process
  if (sched_getaffinity(0, cpus_size, cpus_p) == 0) {
    if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
      cpu_count = CPU_COUNT_S(cpus_size, cpus_p);
    } else {
      cpu_count = CPU_COUNT(cpus_p);
    }
    log_trace(os)("active_processor_count: sched_getaffinity processor count: %d", cpu_count);
  } else {
    cpu_count = ::sysconf(_SC_NPROCESSORS_ONLN);
    warning("sched_getaffinity failed (%s) - using online processor count (%d) "
            "which may exceed available processors", os::strerror(errno), cpu_count);
  }

  if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
    CPU_FREE(cpus_p);
  }

  assert(cpu_count > 0 && cpu_count <= os::processor_count(), "sanity check");
  return cpu_count;
}
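
// For example (illustrative): on a machine with 2048 configured processors,
// configured_cpus >= CPU_SETSIZE (1024) triggers the dynamic path above, and
// the affinity mask is sized with CPU_ALLOC_SIZE(2048) instead of using the
// fixed-size cpu_set_t.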

// Determine the active processor count from one of
// three different sources:
//
// 1. User option -XX:ActiveProcessorCount
// 2. kernel os calls (sched_getaffinity or sysconf(_SC_NPROCESSORS_ONLN))
// 3. extracted from cgroup cpu subsystem (shares and quotas)
//
// Option 1, if specified, will always override.
// If the cgroup subsystem is active and configured, we
// will return the min of the cgroup and option 2 results.
// This is required since tools, such as numactl, that
// alter cpu affinity do not update cgroup subsystem
// cpuset configuration files.
int os::active_processor_count() {
  // User has overridden the number of active processors
  if (ActiveProcessorCount > 0) {
    log_trace(os)("active_processor_count: "
                  "active processor count set by user : %d",
                  ActiveProcessorCount);
    return ActiveProcessorCount;
  }

  int active_cpus;
  if (OSContainer::is_containerized()) {
    active_cpus = OSContainer::active_processor_count();
    log_trace(os)("active_processor_count: determined by OSContainer: %d",
                  active_cpus);
  } else {
    active_cpus = os::Linux::active_processor_count();
  }

  return active_cpus;
}
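
// For example (illustrative): in a container limited to the equivalent of
// 2 CPUs via a cgroup cpu quota on a 16-CPU host,
// OSContainer::active_processor_count() reports 2, so thread pools and GC
// worker counts are sized for 2 processors rather than 16.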

static bool should_warn_invalid_processor_id() {
  if (os::processor_count() == 1) {
    // Don't warn if we only have one processor
    return false;
  }

  static volatile int warn_once = 1;

  if (Atomic::load(&warn_once) == 0 ||
      Atomic::xchg(&warn_once, 0) == 0) {
    // Don't warn more than once
    return false;
  }

  return true;
}
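
// Note on the pattern above: the plain Atomic::load short-circuits once the
// warning has been issued, so subsequent calls return early without paying
// for the atomic exchange; only the first caller to swap 1 -> 0 warns.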

uint os::processor_id() {
  const int id = Linux::sched_getcpu();

  if (id < processor_count()) {
    return (uint)id;
  }

  // Some environments (e.g. openvz containers and the rr debugger) incorrectly
  // report a processor id that is higher than the number of processors available.
  // This is problematic, for example, when implementing CPU-local data structures,
  // where the processor id is used to index into an array of length processor_count().
  // If this happens we return 0 here. This is safe since we always have at least
  // one processor, but it's not optimal for performance if we're actually executing
  // in an environment with more than one processor.
  if (should_warn_invalid_processor_id()) {
    log_warning(os)("Invalid processor id reported by the operating system "
                    "(got processor id %d, valid processor id range is 0-%d)",
                    id, processor_count() - 1);
    log_warning(os)("Falling back to assuming processor id is 0. "
                    "This could have a negative impact on performance.");
  }

  return 0;
}

void os::set_native_thread_name(const char *name) {
  if (Linux::_pthread_setname_np) {
    char buf[16];  // according to the glibc manpage, 16 chars incl. '\0'
    snprintf(buf, sizeof(buf), "%s", name);
    buf[sizeof(buf) - 1] = '\0';
    const int rc = Linux::_pthread_setname_np(pthread_self(), buf);
    // ERANGE should not happen; all other errors should just be ignored.
    assert(rc != ERANGE, "pthread_setname_np failed");
  }
}
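
// For example, a thread named "C2 CompilerThread0" is truncated by the
// snprintf above to the 15 characters "C2 CompilerThre" plus '\0' before
// being handed to pthread_setname_np, which rejects longer names with ERANGE.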

bool os::bind_to_processor(uint processor_id) {
  // Not yet implemented.
  return false;
}

////////////////////////////////////////////////////////////////////////////////
// debug support

bool os::find(address addr, outputStream* st) {
  Dl_info dlinfo;
  memset(&dlinfo, 0, sizeof(dlinfo));
  if (dladdr(addr, &dlinfo) != 0) {
    st->print(PTR_FORMAT ": ", p2i(addr));
    if (dlinfo.dli_sname != NULL && dlinfo.dli_saddr != NULL) {
      st->print("%s+" PTR_FORMAT, dlinfo.dli_sname,
                p2i(addr) - p2i(dlinfo.dli_saddr));
    } else if (dlinfo.dli_fbase != NULL) {
      st->print("<offset " PTR_FORMAT ">", p2i(addr) - p2i(dlinfo.dli_fbase));
    } else {
      st->print("<absolute address>");
    }
    if (dlinfo.dli_fname != NULL) {
      st->print(" in %s", dlinfo.dli_fname);
    }
    if (dlinfo.dli_fbase != NULL) {
      st->print(" at " PTR_FORMAT, p2i(dlinfo.dli_fbase));
    }
    st->cr();

    if (Verbose) {
      // decode some bytes around the PC
      address begin = clamp_address_in_page(addr-40, addr, os::vm_page_size());
      address end   = clamp_address_in_page(addr+40, addr, os::vm_page_size());
      address lowest = (address) dlinfo.dli_sname;
      if (!lowest)  lowest = (address) dlinfo.dli_fbase;
      if (begin < lowest)  begin = lowest;
      Dl_info dlinfo2;
      if (dladdr(end, &dlinfo2) != 0 && dlinfo2.dli_saddr != dlinfo.dli_saddr
          && end > dlinfo2.dli_saddr && dlinfo2.dli_saddr > begin) {
        end = (address) dlinfo2.dli_saddr;
      }
      Disassembler::decode(begin, end, st);
    }
    return true;
  }
  return false;
}

////////////////////////////////////////////////////////////////////////////////
// misc

// This does not do anything on Linux. This is basically a hook for being
// able to use structured exception handling (thread-local exception filters)
// on, e.g., Win32.
void os::os_exception_wrapper(java_call_t f, JavaValue* value,
                              const methodHandle& method,
                              JavaCallArguments* args, JavaThread* thread) {
  f(value, method, args, thread);
}

void os::print_statistics() {
}

bool os::message_box(const char* title, const char* message) {
  int i;
  fdStream err(defaultStream::error_fd());
  for (i = 0; i < 78; i++) err.print_raw("=");
  err.cr();
  err.print_raw_cr(title);
  for (i = 0; i < 78; i++) err.print_raw("-");
  err.cr();
  err.print_raw_cr(message);
  for (i = 0; i < 78; i++) err.print_raw("=");
  err.cr();

  char buf[16];
  // Prevent process from exiting upon "read error" without consuming all CPU
  while (::read(0, buf, sizeof(buf)) <= 0) { ::sleep(100); }

  return buf[0] == 'y' || buf[0] == 'Y';
}

// Is a (classpath) directory empty?
bool os::dir_is_empty(const char* path) {
  DIR *dir = NULL;
  struct dirent *ptr;

  dir = opendir(path);
  if (dir == NULL) return true;

  // Scan the directory
  bool result = true;
  while (result && (ptr = readdir(dir)) != NULL) {
    if (strcmp(ptr->d_name, ".") != 0 && strcmp(ptr->d_name, "..") != 0) {
      result = false;
    }
  }
  closedir(dir);
  return result;
}

// This code originates from JDK's sysOpen and open64_w
// from src/solaris/hpi/src/system_md.c

int os::open(const char *path, int oflag, int mode) {
  if (strlen(path) > MAX_PATH - 1) {
    errno = ENAMETOOLONG;
    return -1;
  }

  // All file descriptors that are opened in the Java process and not
  // specifically destined for a subprocess should have the close-on-exec
  // flag set. If we don't set it, then careless 3rd party native code
  // might fork and exec without closing all appropriate file descriptors
  // (e.g. as we do in closeDescriptors in UNIXProcess.c), and this in
  // turn might:
  //
  // - cause end-of-file to fail to be detected on some file
  //   descriptors, resulting in mysterious hangs, or
  //
  // - might cause an fopen in the subprocess to fail on a system
  //   suffering from bug 1085341.
  //
  // (Yes, the default setting of the close-on-exec flag is a Unix
  // design flaw)
  //
  // See:
  // 1085341: 32-bit stdio routines should support file descriptors >255
  // 4843136: (process) pipe file descriptor from Runtime.exec not being closed
  // 6339493: (process) Runtime.exec does not close all file descriptors on Solaris 9
  //
  // Modern Linux kernels (2.6.23 and later, released in 2007) support O_CLOEXEC
  // with open(). O_CLOEXEC is preferable to using FD_CLOEXEC on an open file
  // descriptor because it saves a system call and removes a small window where
  // the flag is unset. On ancient Linux kernels the O_CLOEXEC flag will be
  // ignored and we fall back to using FD_CLOEXEC (see below).
#ifdef O_CLOEXEC
  oflag |= O_CLOEXEC;
#endif

  int fd = ::open64(path, oflag, mode);
  if (fd == -1) return -1;

  // If the open succeeded, the file might still be a directory
  {
    struct stat64 buf64;
    int ret = ::fstat64(fd, &buf64);
    int st_mode = buf64.st_mode;

    if (ret != -1) {
      if ((st_mode & S_IFMT) == S_IFDIR) {
        errno = EISDIR;
        ::close(fd);
        return -1;
      }
    } else {
      ::close(fd);
      return -1;
    }
  }

#ifdef FD_CLOEXEC
  // Validate that the use of the O_CLOEXEC flag on open above worked.
  // With recent kernels, we will perform this check exactly once.
  static sig_atomic_t O_CLOEXEC_is_known_to_work = 0;
  if (!O_CLOEXEC_is_known_to_work) {
    int flags = ::fcntl(fd, F_GETFD);
    if (flags != -1) {
      if ((flags & FD_CLOEXEC) != 0)
        O_CLOEXEC_is_known_to_work = 1;
      else
        ::fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
    }
  }
#endif

  return fd;
}
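
// Note: the one-time F_GETFD validation above only matters on kernels that
// silently ignore O_CLOEXEC (pre-2.6.23); once a descriptor is observed with
// FD_CLOEXEC already set, every later open skips the extra fcntl round trip.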

// create binary file, rewriting existing file if required
int os::create_binary_file(const char* path, bool rewrite_existing) {
  int oflags = O_WRONLY | O_CREAT;
  if (!rewrite_existing) {
    oflags |= O_EXCL;
  }
  return ::open64(path, oflags, S_IREAD | S_IWRITE);
}

// return current position of file pointer
jlong os::current_file_offset(int fd) {
  return (jlong)::lseek64(fd, (off64_t)0, SEEK_CUR);
}

// move file pointer to the specified offset
jlong os::seek_to_file_offset(int fd, jlong offset) {
  return (jlong)::lseek64(fd, (off64_t)offset, SEEK_SET);
}

// This code originates from JDK's sysAvailable
// from src/solaris/hpi/src/native_threads/src/sys_api_td.c

int os::available(int fd, jlong *bytes) {
  jlong cur, end;
  int mode;
  struct stat64 buf64;

  if (::fstat64(fd, &buf64) >= 0) {
    mode = buf64.st_mode;
    if (S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
      int n;
      if (::ioctl(fd, FIONREAD, &n) >= 0) {
        *bytes = n;
        return 1;
      }
    }
  }
  if ((cur = ::lseek64(fd, 0L, SEEK_CUR)) == -1) {
    return 0;
  } else if ((end = ::lseek64(fd, 0L, SEEK_END)) == -1) {
    return 0;
  } else if (::lseek64(fd, cur, SEEK_SET) == -1) {
    return 0;
  }
  *bytes = end - cur;
  return 1;
}
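
// For character devices, FIFOs and sockets, FIONREAD above reports the bytes
// currently buffered in the kernel; for seekable files the three lseek64
// calls compute the remaining bytes as (end - cur) and restore the original
// file position.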

// Map a block of memory.
char* os::pd_map_memory(int fd, const char* file_name, size_t file_offset,
                        char *addr, size_t bytes, bool read_only,
                        bool allow_exec) {
  int prot;
  int flags = MAP_PRIVATE;

  if (read_only) {
    prot = PROT_READ;
  } else {
    prot = PROT_READ | PROT_WRITE;
  }

  if (allow_exec) {
    prot |= PROT_EXEC;
  }

  if (addr != NULL) {
    flags |= MAP_FIXED;
  }

  char* mapped_address = (char*)mmap(addr, (size_t)bytes, prot, flags,
                                     fd, file_offset);
  if (mapped_address == MAP_FAILED) {
    return NULL;
  }
  return mapped_address;
}

// Remap a block of memory.
char* os::pd_remap_memory(int fd, const char* file_name, size_t file_offset,
                          char *addr, size_t bytes, bool read_only,
                          bool allow_exec) {
  // same as map_memory() on this OS
  return os::map_memory(fd, file_name, file_offset, addr, bytes, read_only,
                        allow_exec);
}

// Unmap a block of memory.
bool os::pd_unmap_memory(char* addr, size_t bytes) {
  return munmap(addr, bytes) == 0;
}

static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time);

static jlong fast_cpu_time(Thread *thread) {
  clockid_t clockid;
  int rc = os::Linux::pthread_getcpuclockid(thread->osthread()->pthread_id(),
                                            &clockid);
  if (rc == 0) {
    return os::Linux::fast_thread_cpu_time(clockid);
  } else {
    // It's possible to encounter a terminated native thread that failed
    // to detach itself from the VM - which should result in ESRCH.
    assert_status(rc == ESRCH, rc, "pthread_getcpuclockid failed");
    return -1;
  }
}

// current_thread_cpu_time(bool) and thread_cpu_time(Thread*, bool)
// are used by JVM M&M and JVMTI to get user+sys or user CPU time
// of a thread.
//
// current_thread_cpu_time() and thread_cpu_time(Thread*) return
// the fast estimate available on the platform.

jlong os::current_thread_cpu_time() {
  if (os::Linux::supports_fast_thread_cpu_time()) {
    return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
  } else {
    // return user + sys since the cost is the same
    return slow_thread_cpu_time(Thread::current(), true /* user + sys */);
  }
}

jlong os::thread_cpu_time(Thread* thread) {
  // consistent with what current_thread_cpu_time() returns
  if (os::Linux::supports_fast_thread_cpu_time()) {
    return fast_cpu_time(thread);
  } else {
    return slow_thread_cpu_time(thread, true /* user + sys */);
  }
}

jlong os::current_thread_cpu_time(bool user_sys_cpu_time) {
  if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
    return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
  } else {
    return slow_thread_cpu_time(Thread::current(), user_sys_cpu_time);
  }
}

jlong os::thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
  if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
    return fast_cpu_time(thread);
  } else {
    return slow_thread_cpu_time(thread, user_sys_cpu_time);
  }
}
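
// Note: the fast path reads a per-thread CPU-time clock via clock_gettime()
// (CLOCK_THREAD_CPUTIME_ID for the current thread, or the clockid obtained
// from pthread_getcpuclockid() for another thread), avoiding the /proc
// parsing done by slow_thread_cpu_time() below.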

// -1 on error.
static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
  pid_t tid = thread->osthread()->thread_id();
  char *s;
  char stat[2048];
  int statlen;
  char proc_name[64];
  int count;
  long sys_time, user_time;
  char cdummy;
  int idummy;
  long ldummy;
  FILE *fp;

  snprintf(proc_name, 64, "/proc/self/task/%d/stat", tid);
  fp = fopen(proc_name, "r");
  if (fp == NULL) return -1;
  statlen = fread(stat, 1, 2047, fp);
  stat[statlen] = '\0';
  fclose(fp);

  // Skip pid and the command string. Note that we could be dealing with
  // weird command names, e.g. user could decide to rename java launcher
  // to "java 1.4.2 :)", then the stat file would look like
  //   1234 (java 1.4.2 :)) R ... ...
  // We don't really need to know the command string, just find the last
  // occurrence of ")" and then start parsing from there. See bug 4726580.
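  // For example, a typical stat line for a thread begins
  //   1234 (java) S 1 1234 1200 0 -1 4194560 ...
  // where utime and stime are the 14th and 15th fields (counting pid and
  // comm as fields 1 and 2), expressed in clock ticks.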
  s = strrchr(stat, ')');
  if (s == NULL) return -1;

  // Skip blank chars
  do { s++; } while (s && isspace(*s));

  count = sscanf(s,"%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu",
                 &cdummy, &idummy, &idummy, &idummy, &idummy, &idummy,
                 &ldummy, &ldummy, &ldummy, &ldummy, &ldummy,
                 &user_time, &sys_time);
  if (count != 13) return -1;
  if (user_sys_cpu_time) {
    return ((jlong)sys_time + (jlong)user_time) * (1000000000 / clock_tics_per_sec);
  } else {
    return (jlong)user_time * (1000000000 / clock_tics_per_sec);
  }
}

void os::current_thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
  info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
  info_ptr->may_skip_backward = false;     // elapsed time not wall time
  info_ptr->may_skip_forward = false;      // elapsed time not wall time
  info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
}

void os::thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
  info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
  info_ptr->may_skip_backward = false;     // elapsed time not wall time
  info_ptr->may_skip_forward = false;      // elapsed time not wall time
  info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
}

bool os::is_thread_cpu_time_supported() {
  return true;
}

// System loadavg support. Returns -1 if load average cannot be obtained.
// Linux doesn't yet have an (official) notion of processor sets,
// so just return the system wide load average.
int os::loadavg(double loadavg[], int nelem) {
#ifndef __ANDROID__
  return ::getloadavg(loadavg, nelem);
#else
  /*
   * Copyright (C) 2018 The Android Open Source Project
   * All rights reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *  * Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *  * Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
   * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   */

  if (nelem < 0) return -1;
  if (nelem > 3) nelem = 3;
  struct sysinfo si;
  if (sysinfo(&si) == -1) return -1;
  for (int i = 0; i < nelem; ++i) {
    loadavg[i] = static_cast<double>(si.loads[i]) / static_cast<double>(1 << SI_LOAD_SHIFT);
  }
  return nelem;
#endif
}
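
// Note on the Android path above: sysinfo(2) reports the 1-, 5- and 15-minute
// load averages as fixed-point values scaled by 1 << SI_LOAD_SHIFT (65536),
// so, for example, si.loads[0] == 98304 corresponds to a load average of 1.5.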

void os::pause() {
  char filename[MAX_PATH];
  if (PauseAtStartupFile && PauseAtStartupFile[0]) {
    jio_snprintf(filename, MAX_PATH, "%s", PauseAtStartupFile);
  } else {
    jio_snprintf(filename, MAX_PATH, "./vm.paused.%d", current_process_id());
  }

  int fd = ::open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
  if (fd != -1) {
    struct stat buf;
    ::close(fd);
    while (::stat(filename, &buf) == 0) {
      (void)::poll(NULL, 0, 100);
    }
  } else {
    jio_fprintf(stderr,
                "Could not open pause file '%s', continuing immediately.\n", filename);
  }
}
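
// Usage note (illustrative): the loop above polls every 100 ms until the
// pause file disappears, so the VM is resumed from another shell with
// something like "rm vm.paused.<pid>".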

// Get the default path to the core file
// Returns the length of the string
int os::get_core_path(char* buffer, size_t bufferSize) {
  /*
   * Max length of /proc/sys/kernel/core_pattern is 128 characters.
   * See https://www.kernel.org/doc/Documentation/sysctl/kernel.txt
   */
  const int core_pattern_len = 129;
  char core_pattern[core_pattern_len] = {0};

  int core_pattern_file = ::open("/proc/sys/kernel/core_pattern", O_RDONLY);
  if (core_pattern_file == -1) {
    return -1;
  }

  ssize_t ret = ::read(core_pattern_file, core_pattern, core_pattern_len);
  ::close(core_pattern_file);
  if (ret <= 0 || ret >= core_pattern_len || core_pattern[0] == '\n') {
    return -1;
  }
  if (core_pattern[ret-1] == '\n') {
    core_pattern[ret-1] = '\0';
  } else {
    core_pattern[ret] = '\0';
  }

  // Replace the %p in the core pattern with the process id. NOTE: we do this
  // only if the pattern doesn't start with "|", and we support only one %p in
  // the pattern.
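  // For example (illustrative): with core_pattern "/var/cores/core.%p" and
  // pid 1234 the reported path is "/var/cores/core.1234"; a pattern starting
  // with '|' (e.g. a systemd-coredump handler) is reported verbatim together
  // with the fallback core location.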
  char *pid_pos = strstr(core_pattern, "%p");
  const char* tail = (pid_pos != NULL) ? (pid_pos + 2) : "";  // skip over the "%p"
  int written;

  if (core_pattern[0] == '/') {
    if (pid_pos != NULL) {
      *pid_pos = '\0';
      written = jio_snprintf(buffer, bufferSize, "%s%d%s", core_pattern,
                             current_process_id(), tail);
    } else {
      written = jio_snprintf(buffer, bufferSize, "%s", core_pattern);
    }
  } else {
    char cwd[PATH_MAX];

    const char* p = get_current_directory(cwd, PATH_MAX);
    if (p == NULL) {
      return -1;
    }

    if (core_pattern[0] == '|') {
      written = jio_snprintf(buffer, bufferSize,
                             "\"%s\" (or dumping to %s/core.%d)",
                             &core_pattern[1], p, current_process_id());
    } else if (pid_pos != NULL) {
      *pid_pos = '\0';
      written = jio_snprintf(buffer, bufferSize, "%s/%s%d%s", p, core_pattern,
                             current_process_id(), tail);
    } else {
      written = jio_snprintf(buffer, bufferSize, "%s/%s", p, core_pattern);
    }
  }

  if (written < 0) {
    return -1;
  }

  if (((size_t)written < bufferSize) && (pid_pos == NULL) && (core_pattern[0] != '|')) {
    int core_uses_pid_file = ::open("/proc/sys/kernel/core_uses_pid", O_RDONLY);

    if (core_uses_pid_file != -1) {
      char core_uses_pid = 0;
      ssize_t ret = ::read(core_uses_pid_file, &core_uses_pid, 1);
      ::close(core_uses_pid_file);

      if (core_uses_pid == '1') {
        jio_snprintf(buffer + written, bufferSize - written,
                     ".%d", current_process_id());
      }
    }
  }

  return strlen(buffer);
}

bool os::start_debugging(char *buf, int buflen) {
  int len = (int)strlen(buf);
  char *p = &buf[len];

  jio_snprintf(p, buflen-len,
               "\n\n"
               "Do you want to debug the problem?\n\n"
               "To debug, run 'gdb /proc/%d/exe %d'; then switch to thread " UINTX_FORMAT " (" INTPTR_FORMAT ")\n"
               "Enter 'yes' to launch gdb automatically (PATH must include gdb)\n"
               "Otherwise, press RETURN to abort...",
               os::current_process_id(), os::current_process_id(),
               os::current_thread_id(), os::current_thread_id());

  bool yes = os::message_box("Unexpected Error", buf);

  if (yes) {
    // yes, user asked VM to launch debugger
    jio_snprintf(buf, sizeof(char)*buflen, "gdb /proc/%d/exe %d",
                 os::current_process_id(), os::current_process_id());

    os::fork_and_exec(buf);
    yes = false;
  }
  return yes;
}

// Java/Compiler thread:
//
//   Low memory addresses
// P0 +------------------------+
//    |                        |\  Java thread created by VM does not have glibc
//    |    glibc guard page    | - guard page, attached Java thread usually has
//    |                        |/  1 glibc guard page.
// P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
//    |                        |\
//    |  HotSpot Guard Pages   | - red, yellow and reserved pages
//    |                        |/
//    +------------------------+ StackOverflow::stack_reserved_zone_base()
//    |                        |\
//    |      Normal Stack      | -
//    |                        |/
// P2 +------------------------+ Thread::stack_base()
//
// Non-Java thread:
//
//   Low memory addresses
// P0 +------------------------+
//    |                        |\
//    |   glibc guard page     | - usually 1 page
//    |                        |/
// P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
//    |                        |\
//    |      Normal Stack      | -
//    |                        |/
// P2 +------------------------+ Thread::stack_base()
//
// ** P1 (aka bottom) and size (P2 = P1 + size) are the address and stack size
//    returned from pthread_attr_getstack().
// ** Due to an NPTL implementation error, Linux takes the glibc guard page out
//    of the stack size given in pthread_attr. We work around this for
//    threads created by the VM. (We adapt bottom to be P1 and size accordingly.)
//
#ifndef ZERO
static void current_stack_region(address * bottom, size_t * size) {
  if (os::is_primordial_thread()) {
    // primordial thread needs special handling because pthread_getattr_np()
    // may return bogus value.
    *bottom = os::Linux::initial_thread_stack_bottom();
    *size   = os::Linux::initial_thread_stack_size();
  } else {
    pthread_attr_t attr;

    int rslt = pthread_getattr_np(pthread_self(), &attr);

    // JVM needs to know exact stack location, abort if it fails
    if (rslt != 0) {
      if (rslt == ENOMEM) {
        vm_exit_out_of_memory(0, OOM_MMAP_ERROR, "pthread_getattr_np");
      } else {
        fatal("pthread_getattr_np failed with error = %d", rslt);
      }
    }

    if (pthread_attr_getstack(&attr, (void **)bottom, size) != 0) {
      fatal("Cannot locate current stack attributes!");
    }

    // Work around NPTL stack guard error.
    size_t guard_size = 0;
    rslt = pthread_attr_getguardsize(&attr, &guard_size);
    if (rslt != 0) {
      fatal("pthread_attr_getguardsize failed with error = %d", rslt);
    }
    *bottom += guard_size;
    *size   -= guard_size;

    pthread_attr_destroy(&attr);
  }
  assert(os::current_stack_pointer() >= *bottom &&
         os::current_stack_pointer() < *bottom + *size, "just checking");
}
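
// Example of the guard-size adjustment above (illustrative): if
// pthread_attr_getstack() reports bottom == B and size == S for a thread
// with a 4K glibc guard page, the usable stack is [B + 4K, B + S), so we
// report bottom = B + 4K and size = S - 4K.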

address os::current_stack_base() {
  address bottom;
  size_t size;
  current_stack_region(&bottom, &size);
  return (bottom + size);
}

size_t os::current_stack_size() {
  // This stack size includes the usable stack and HotSpot guard pages
  // (for the threads that have HotSpot guard pages).
  address bottom;
  size_t size;
  current_stack_region(&bottom, &size);
  return size;
}
#endif

static inline struct timespec get_mtime(const char* filename) {
  struct stat st;
  int ret = os::stat(filename, &st);
  assert(ret == 0, "failed to stat() file '%s': %s", filename, os::strerror(errno));
  return st.st_mtim;
}

int os::compare_file_modified_times(const char* file1, const char* file2) {
  struct timespec filetime1 = get_mtime(file1);
  struct timespec filetime2 = get_mtime(file2);
  int diff = filetime1.tv_sec - filetime2.tv_sec;
  if (diff == 0) {
    return filetime1.tv_nsec - filetime2.tv_nsec;
  }
  return diff;
}

bool os::supports_map_sync() {
  return true;
}

void os::print_memory_mappings(char* addr, size_t bytes, outputStream* st) {
  unsigned long long start = (unsigned long long)addr;
  unsigned long long end = start + bytes;
  FILE* f = ::fopen("/proc/self/maps", "r");
  int num_found = 0;
  if (f != NULL) {
    st->print("Range [%llx-%llx) contains: ", start, end);
    char line[512];
    while (fgets(line, sizeof(line), f) == line) {
      unsigned long long a1 = 0;
      unsigned long long a2 = 0;
      if (::sscanf(line, "%llx-%llx", &a1, &a2) == 2) {
        // Let's print out every range that touches ours.
        if ((a1 >= start && a1 < end) || // left leg in
            (a2 >= start && a2 < end) || // right leg in
            (a1 < start && a2 >= end)) { // superimposition
          num_found++;
          st->print("%s", line); // line includes \n
        }
      }
    }
    ::fclose(f);
    if (num_found == 0) {
      st->print("nothing.");
    }
    st->cr();
  }
}
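
// Example /proc/self/maps line matched by the sscanf above (illustrative):
//   7f3c80000000-7f3c80021000 rw-p 00000000 00:00 0
// Only the two leading hex addresses are parsed; the rest of the line is
// printed verbatim when the range overlaps the one being queried.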