Path: src/hotspot/cpu/s390/copy_s390.hpp
/*
 * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2020 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// Major contributions by LS

#ifndef CPU_S390_COPY_S390_HPP
#define CPU_S390_COPY_S390_HPP

// Inline functions for memory copy and fill.

// HeapWordSize (the size of class HeapWord) is 8 Bytes (the size of a
// pointer variable), since we always run the _LP64 model. As a consequence,
// HeapWord* memory ranges are always assumed to be doubleword-aligned,
// having a size which is an integer multiple of HeapWordSize.
//
// Dealing only with doubleword-aligned doubleword units has important
// positive performance and data access consequences. Many of the move
// instructions perform particularly well under these circumstances.
// Data access is "doubleword-concurrent", except for MVC and XC.
// Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
// by use of the special padding byte 0xb1, where required. For copying,
// we use padding byte 0xb0 to prevent the D-cache from being polluted.
//
// On z/Architecture, gcc optimizes memcpy into a series of MVC instructions.
// This is optimal, even if just one HeapWord is copied. However, MVC
// copying is not atomic, i.e. not "doubleword concurrent" by definition.
//
// If the -mmvcle compiler option is specified, memcpy translates into
// code such that the entire memory range is copied or preset with just
// one MVCLE instruction.
//
// *to = *from is transformed into a MVC instruction already with -O1.
// Thus, for atomic copy operations, (inline) assembler code is required
// to guarantee atomic data accesses.
//
// For large (len >= MVCLEThreshold) chunks of memory, we exploit
// special H/W support of z/Architecture:
// 1) copy short piece of memory to page-align address(es)
// 2) copy largest part (all contained full pages) of memory using mvcle instruction.
//    z/Architecture processors have special H/W support for page-aligned storage
//    where len is an int multiple of page size. In that case, up to 4 cache lines are
//    processed in parallel and L1 cache is not polluted.
// 3) copy the remaining piece of memory.
//
// Measurement classifications:
// very rare  - <=     10.000 calls AND <=     1.000 usec elapsed
// rare       - <=    100.000 calls AND <=    10.000 usec elapsed
// some       - <=  1.000.000 calls AND <=   100.000 usec elapsed
// freq       - <= 10.000.000 calls AND <= 1.000.000 usec elapsed
// very freq  - >  10.000.000 calls OR  >  1.000.000 usec elapsed

#undef USE_INLINE_ASM

static void copy_conjoint_jshorts_atomic(const jshort* from, jshort* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static void copy_conjoint_jints_atomic(const jint* from, jint* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static bool has_destructive_overlap(const char* from, char* to, size_t byte_count) {
  return (from < to) && ((to - from) < (ptrdiff_t)byte_count);
}

#ifdef USE_INLINE_ASM

//--------------------------------------------------------------
// Atomic copying. Atomicity is given by the minimum of source
// and target alignment. Refer to mail comm with Tim Slegel/IBM.
// Only usable for disjoint source and target.
//--------------------------------------------------------------
#define MOVE8_ATOMIC_4(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  asm(                                                                        \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */  \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */  \
    "MVC     0(32,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */  \
    : [to]       "+Q" (_to)       /* outputs   */                             \
    , [from]     "+Q" (_from)                                                 \
    , [toaddr]   "=a" (toaddr)                                                \
    , [fromaddr] "=a" (fromaddr)                                              \
    :                                                                         \
    : "cc"                        /* clobbered */                             \
  );                                                                          \
}
#define MOVE8_ATOMIC_3(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  asm(                                                                        \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */  \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */  \
    "MVC     0(24,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */  \
    : [to]       "+Q" (_to)       /* outputs   */                             \
    , [from]     "+Q" (_from)                                                 \
    , [toaddr]   "=a" (toaddr)                                                \
    , [fromaddr] "=a" (fromaddr)                                              \
    :                                                                         \
    : "cc"                        /* clobbered */                             \
  );                                                                          \
}
#define MOVE8_ATOMIC_2(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  asm(                                                                        \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */  \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */  \
    "MVC     0(16,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */  \
    : [to]       "+Q" (_to)       /* outputs   */                             \
    , [from]     "+Q" (_from)                                                 \
    , [toaddr]   "=a" (toaddr)                                                \
    , [fromaddr] "=a" (fromaddr)                                              \
    :                                                                         \
    : "cc"                        /* clobbered */                             \
  );                                                                          \
}
#define MOVE8_ATOMIC_1(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  asm(                                                                        \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */  \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */  \
    "MVC     0(8,%[toaddr]),0(%[fromaddr])  \n\t" /* move data            */  \
    : [to]       "+Q" (_to)       /* outputs   */                             \
    , [from]     "+Q" (_from)                                                 \
    , [toaddr]   "=a" (toaddr)                                                \
    , [fromaddr] "=a" (fromaddr)                                              \
    :                                                                         \
    : "cc"                        /* clobbered */                             \
  );                                                                          \
}
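
// Illustration only (hypothetical helper, kept inactive in this file's usual
// #if 0 style): how the MOVE8_ATOMIC_n macros above are intended to be
// dispatched for tiny doubleword copies. This mirrors their use in
// pd_aligned_disjoint_words further down.
#if 0
static void move8_atomic_sketch(HeapWord* to, const HeapWord* from, size_t count) {
  switch (count) {
    case 1: MOVE8_ATOMIC_1(to,from) break;   // one MVC of  8 bytes
    case 2: MOVE8_ATOMIC_2(to,from) break;   // one MVC of 16 bytes
    case 3: MOVE8_ATOMIC_3(to,from) break;   // one MVC of 24 bytes
    case 4: MOVE8_ATOMIC_4(to,from) break;   // one MVC of 32 bytes
    default: break;  // larger counts are handled by MVC_MULTI/MVCLE below
  }
}
#endif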
//--------------------------------------------------------------
// Atomic copying of 8-byte entities.
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 8-byte aligned.
//--------------------------------------------------------------
#define COPY8_ATOMIC_4(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      3,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LMG     0,3,0(3)            \n\t" /* load data            */             \
    "STMG    0,3,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */                            \
  );                                                                          \
}
#define COPY8_ATOMIC_3(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      2,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LMG     0,2,0(2)            \n\t" /* load data            */             \
    "STMG    0,2,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1", "r2"       /* clobbered */                            \
  );                                                                          \
}
#define COPY8_ATOMIC_2(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      1,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LMG     0,1,0(1)            \n\t" /* load data            */             \
    "STMG    0,1,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1"             /* clobbered */                            \
  );                                                                          \
}
#define COPY8_ATOMIC_1(_to,_from) {                                           \
  unsigned long addr;                                                         \
  asm(                                                                        \
    "LG      %[addr],%[from]     \n\t" /* address of from area */             \
    "LG      0,0(0,%[addr])      \n\t" /* load data            */             \
    "LG      %[addr],%[to]       \n\t" /* address of to area   */             \
    "STG     0,0(0,%[addr])      \n\t" /* store data           */             \
    : [to]   "+Q" (_to)          /* outputs   */                              \
    , [from] "+Q" (_from)                                                     \
    , [addr] "=a" (addr)                                                      \
    :                                                                         \
    : "cc", "r0"                   /* clobbered */                            \
  );                                                                          \
}
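
// Illustration only (hypothetical helper, inactive): a C model of why the
// conjoint/disjoint property does not matter for COPY8_ATOMIC_n. All n
// doublewords are loaded into registers (LMG) before any of them is stored
// (STMG), so even destructively overlapping areas are copied correctly.
// Assumes 1 <= n <= 4, as with the macros.
#if 0
static void copy8_atomic_model(HeapWord* to, const HeapWord* from, size_t n) {
  HeapWord tmp[4];                                      // models registers r0..r3
  for (size_t i = 0; i < n; i++) { tmp[i] = from[i]; }  // load phase  (LMG)
  for (size_t i = 0; i < n; i++) { to[i]  = tmp[i]; }   // store phase (STMG)
}
#endif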
//--------------------------------------------------------------
// Atomic copying of 4-byte entities.
// Exactly 4 (four) entities are copied.
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 4-byte aligned.
//--------------------------------------------------------------
#define COPY4_ATOMIC_4(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      3,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LM      0,3,0(3)            \n\t" /* load data            */             \
    "STM     0,3,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */                            \
  );                                                                          \
}
#define COPY4_ATOMIC_3(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      2,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LM      0,2,0(2)            \n\t" /* load data            */             \
    "STM     0,2,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1", "r2"       /* clobbered */                            \
  );                                                                          \
}
#define COPY4_ATOMIC_2(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      1,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LM      0,1,0(1)            \n\t" /* load data            */             \
    "STM     0,1,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1"             /* clobbered */                            \
  );                                                                          \
}
#define COPY4_ATOMIC_1(_to,_from) {                                           \
  unsigned long addr;                                                         \
  asm(                                                                        \
    "LG      %[addr],%[from]     \n\t" /* address of from area */             \
    "L       0,0(0,%[addr])      \n\t" /* load data            */             \
    "LG      %[addr],%[to]       \n\t" /* address of to area   */             \
    "ST      0,0(0,%[addr])      \n\t" /* store data           */             \
    : [to]   "+Q" (_to)          /* outputs   */                              \
    , [from] "+Q" (_from)                                                     \
    , [addr] "=a" (addr)                                                      \
    :                                                                         \
    : "cc", "r0"                   /* clobbered */                            \
  );                                                                          \
}

#if 0 // Waiting for gcc to support EXRL.
#define MVC_MEMCOPY(_to,_from,_len)                                           \
if (VM_Version::has_ExecuteExtensions()) {                                    \
  asm("\t"                                                                    \
      " LAY     1,-1(0,%[len])    \n\t" /* decr for MVC      */               \
      " EXRL    1,1f              \n\t" /* execute MVC instr */               \
      " BRC     15,2f             \n\t" /* skip template     */               \
      "1: MVC   0(%[len],%[to]),0(%[from]) \n\t"                              \
      "2: BCR   0,0               \n\t"                                       \
      : [to]   "+Q" (_to)         /* outputs   */                             \
      , [from] "+Q" (_from)                                                   \
      : [len]  "r"  (_len)        /* inputs    */                             \
      : "cc", "r1"                /* clobbered */                             \
  );                                                                          \
} else {                                                                      \
  asm("\t"                                                                    \
      " LARL    2,3f              \n\t"                                       \
      " LAY     1,-1(0,%[len])    \n\t" /* decr for MVC      */               \
      " EX      1,0(2)            \n\t" /* execute MVC instr */               \
      " BRC     15,4f             \n\t" /* skip template     */               \
      "3: MVC   0(%[len],%[to]),0(%[from]) \n\t"                              \
      "4: BCR   0,0               \n\t"                                       \
      : [to]   "+Q" (_to)         /* outputs   */                             \
      , [from] "+Q" (_from)                                                   \
      : [len]  "r"  (_len)        /* inputs    */                             \
      : "cc", "r1", "r2"          /* clobbered */                             \
  );                                                                          \
}
#else
#define MVC_MEMCOPY(_to,_from,_len)                                           \
{ unsigned long toaddr;   unsigned long tolen;                                \
  unsigned long fromaddr; unsigned long target;                               \
  asm("\t"                                                                    \
      " LTGR    %[tolen],%[len]        \n\t" /* decr for MVC        */        \
      " BRC     8,2f                   \n\t" /* do nothing for l=0  */        \
      " AGHI    %[tolen],-1            \n\t"                                  \
      " LG      %[toaddr],%[to]        \n\t"                                  \
      " LG      %[fromaddr],%[from]    \n\t"                                  \
      " LARL    %[target],1f           \n\t" /* addr of MVC instr   */        \
      " EX      %[tolen],0(%[target])  \n\t" /* execute MVC instr   */        \
      " BRC     15,2f                  \n\t" /* skip template       */        \
      "1: MVC   0(1,%[toaddr]),0(%[fromaddr]) \n\t"                           \
      "2: BCR   0,0                    \n\t" /* nop as branch target */       \
      : [to]       "+Q" (_to)          /* outputs   */                        \
      , [from]     "+Q" (_from)                                               \
      , [tolen]    "=a" (tolen)                                               \
      , [toaddr]   "=a" (toaddr)                                              \
      , [fromaddr] "=a" (fromaddr)                                            \
      , [target]   "=a" (target)                                              \
      : [len]      "r"  (_len)         /* inputs    */                        \
      : "cc"                           /* clobbered */                        \
  );                                                                          \
}
#endif
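
// Illustration only (hypothetical helper, inactive): a C model of what
// MVC_MEMCOPY above computes. EX/EXRL patch the length field of the MVC
// template at run time, and only the rightmost 8 bits of the length are
// used, so the single template covers lengths of 1 to 256 bytes; a length
// of 0 is a no-op.
#if 0
static void mvc_memcopy_model(void* to, const void* from, size_t len) {
  // Preconditions as for the macro: 0 <= len <= 256, no destructive overlap.
  unsigned char*       t = (unsigned char*)to;
  const unsigned char* f = (const unsigned char*)from;
  while (len-- > 0) { *t++ = *f++; }  // MVC moves bytes strictly left to right
}
#endif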
#if 0 // code snippet to be used for debugging
      /* ASSERT code BEGIN */                                                 \
      " LARL    %[len],5f         \n\t"                                       \
      " LARL    %[mta],4f         \n\t"                                       \
      " SLGR    %[len],%[mta]     \n\t"                                       \
      " CGHI    %[len],16         \n\t"                                       \
      " BRC     7,9f              \n\t" /* block size != 16 */                \
                                                                              \
      " LARL    %[len],1f         \n\t"                                       \
      " SLGR    %[len],%[mta]     \n\t"                                       \
      " CGHI    %[len],256        \n\t"                                       \
      " BRC     7,9f              \n\t" /* list len != 256 */                 \
                                                                              \
      " LGR     0,0               \n\t" /* artificial SIGILL */               \
      "9: BRC   7,-2              \n\t"                                       \
      " LARL    %[mta],1f         \n\t" /* restore MVC table begin */         \
      /* ASSERT code END */
#endif

// Optimized copying for data less than 4k
// - no destructive overlap
// - 0 <= _n_bytes <= 4096
// This macro needs to be gcc-compiled with -march=z990. Otherwise, the
// LAY instruction is not available.
#define MVC_MULTI(_to,_from,_n_bytes)                                         \
{ unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  unsigned long movetable;                                                    \
  unsigned long len;                                                          \
  asm("\t"                                                                    \
      " LTGFR   %[len],%[nby]          \n\t"                                  \
      " LG      %[ta],%[to]            \n\t" /* address of to area    */      \
      " BRC     8,1f                   \n\t" /* nothing to copy       */      \
                                                                              \
      " NILL    %[nby],255             \n\t" /* # bytes mod 256       */      \
      " LG      %[fa],%[from]          \n\t" /* address of from area  */      \
      " BRC     8,3f                   \n\t" /* no rest, skip copying */      \
                                                                              \
      " LARL    %[mta],2f              \n\t" /* MVC template addr     */      \
      " AHI     %[nby],-1              \n\t" /* adjust for EX MVC     */      \
                                                                              \
      " EX      %[nby],0(%[mta])       \n\t" /* only rightmost        */      \
                                             /* 8 bits of nby used    */      \
      /* Since nby is <= 4096 on entry to this code, we need no       */      \
      /* zero extension before using it in addr calc.                 */      \
      " LA      %[fa],1(%[nby],%[fa])  \n\t" /* adjust from addr      */      \
      " LA      %[ta],1(%[nby],%[ta])  \n\t" /* adjust to addr        */      \
                                                                              \
      "3: SRAG  %[nby],%[len],8        \n\t" /* # cache lines         */      \
      " LARL    %[mta],1f              \n\t" /* MVC table begin       */      \
      " BRC     8,1f                   \n\t" /* nothing to copy       */      \
                                                                              \
      /* Insert ASSERT code here if required. */                              \
                                                                              \
                                                                              \
      " LNGFR   %[nby],%[nby]          \n\t" /* negative offset into     */   \
      " SLLG    %[nby],%[nby],4        \n\t" /* MVC table 16-byte blocks */   \
      " BC      15,0(%[nby],%[mta])    \n\t" /* branch to block #ncl     */   \
                                                                              \
      "2: MVC   0(1,%[ta]),0(%[fa])    \n\t" /* MVC template */               \
                                                                              \
      "4: MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 4096 == l        */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "5: MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3840 <= l < 4096 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3584 <= l < 3840 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3328 <= l < 3584 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3072 <= l < 3328 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2816 <= l < 3072 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2560 <= l < 2816 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2304 <= l < 2560 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2048 <= l < 2304 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1792 <= l < 2048 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1536 <= l < 1792 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1280 <= l < 1536 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1024 <= l < 1280 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /*  768 <= l < 1024 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /*  512 <= l <  768 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /*  256 <= l <  512 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "1: BCR   0,0                    \n\t" /* nop as branch target */       \
      : [to]   "+Q" (_to)       /* outputs   */                               \
      , [from] "+Q" (_from)                                                   \
      , [ta]   "=a" (toaddr)                                                  \
      , [fa]   "=a" (fromaddr)                                                \
      , [mta]  "=a" (movetable)                                               \
      , [nby]  "+a" (_n_bytes)                                                \
      , [len]  "=a" (len)                                                     \
      :                                                                       \
      : "cc"                    /* clobbered */                               \
  );                                                                          \
}
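
// Illustration only (hypothetical helper, inactive): a C model of the control
// flow of MVC_MULTI above. The macro first copies the (n_bytes mod 256) rest
// via an EXecuted MVC, then branches into the MVC table so that exactly
// (n_bytes / 256) full 256-byte moves are executed.
#if 0
static void mvc_multi_model(void* to, const void* from, size_t n_bytes) {
  // Preconditions as for the macro: 0 <= n_bytes <= 4096, no destructive overlap.
  unsigned char*       t = (unsigned char*)to;
  const unsigned char* f = (const unsigned char*)from;
  size_t rest = n_bytes & 255;                              // NILL %[nby],255
  for (size_t i = 0; i < rest; i++) { t[i] = f[i]; }        // EXecuted MVC template
  t += rest;
  f += rest;
  for (size_t lines = n_bytes >> 8; lines > 0; lines--) {   // SRAG %[nby],%[len],8
    for (size_t i = 0; i < 256; i++) { t[i] = f[i]; }       // one table entry: MVC 0(256,...)
    t += 256;
    f += 256;
  }
}
#endif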
#define MVCLE_MEMCOPY(_to,_from,_len)                                         \
  asm(                                                                        \
      " LG      0,%[to]      \n\t" /* address of to area   */                 \
      " LG      2,%[from]    \n\t" /* address of from area */                 \
      " LGR     1,%[len]     \n\t" /* len of to area       */                 \
      " LGR     3,%[len]     \n\t" /* len of from area     */                 \
      "1: MVCLE 0,2,176      \n\t" /* copy storage, bypass cache (0xb0) */    \
      " BRC     1,1b         \n\t" /* retry if interrupted */                 \
      : [to]   "+Q" (_to)          /* outputs   */                            \
      , [from] "+Q" (_from)        /* outputs   */                            \
      : [len]  "r"  (_len)         /* inputs    */                            \
      : "cc", "r0", "r1", "r2", "r3" /* clobbered */                          \
  );

#define MVCLE_MEMINIT(_to,_val,_len)                                          \
  asm(                                                                        \
      " LG      0,%[to]       \n\t" /* address of to area   */                \
      " LGR     1,%[len]      \n\t" /* len of to area       */                \
      " XGR     3,3           \n\t" /* from area len = 0    */                \
      "1: MVCLE 0,2,0(%[val]) \n\t" /* init storage         */                \
      " BRC     1,1b          \n\t" /* retry if interrupted */                \
      : [to]  "+Q" (_to)            /* outputs   */                           \
      : [len] "r"  (_len)           /* inputs    */                           \
      , [val] "r"  (_val)           /* inputs    */                           \
      : "cc", "r0", "r1", "r3"      /* clobbered */                           \
  );
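
// Illustration only (hypothetical helper, inactive): a functional model of
// MVCLE_MEMINIT/MVCLE_MEMZERO above. With a source length of zero, MVCLE
// takes every destination byte from the padding byte, so the area is simply
// preset with that value (0 in the MVCLE_MEMZERO case).
#if 0
static void mvcle_meminit_model(void* to, unsigned char pad, size_t len) {
  unsigned char* t = (unsigned char*)to;
  while (len-- > 0) { *t++ = pad; }  // every byte comes from the padding byte
}
#endif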
#define MVCLE_MEMZERO(_to,_len)                                               \
  asm(                                                                        \
      " LG      0,%[to]      \n\t" /* address of to area   */                 \
      " LGR     1,%[len]     \n\t" /* len of to area        */                \
      " XGR     3,3          \n\t" /* from area len = 0     */                \
      "1: MVCLE 0,2,0        \n\t" /* clear storage         */                \
      " BRC     1,1b         \n\t" /* retry if interrupted  */                \
      : [to]  "+Q" (_to)           /* outputs   */                            \
      : [len] "r"  (_len)          /* inputs    */                            \
      : "cc", "r0", "r1", "r3"     /* clobbered */                            \
  );

// Clear a stretch of memory, 0 <= _len <= 256.
// There is no alignment prereq.
// There is no test for len out of the range specified above.
#define XC_MEMZERO_256(_to,_len)                                              \
{ unsigned long toaddr; unsigned long tolen;                                  \
  unsigned long target;                                                       \
  asm("\t"                                                                    \
      " LTGR    %[tolen],%[len]       \n\t" /* decr for EX XC       */        \
      " BRC     8,2f                  \n\t" /* do nothing for l=0   */        \
      " AGHI    %[tolen],-1           \n\t" /* adjust for EX XC     */        \
      " LARL    %[target],1f          \n\t" /* addr of XC instr     */        \
      " LG      %[toaddr],%[to]       \n\t" /* addr of data area    */        \
      " EX      %[tolen],0(%[target]) \n\t" /* execute XC instr     */        \
      " BRC     15,2f                 \n\t" /* skip template        */        \
      "1: XC    0(1,%[toaddr]),0(%[toaddr]) \n\t"                             \
      "2: BCR   0,0                   \n\t" /* nop as branch target */        \
      : [to]     "+Q" (_to)     /* outputs   */                               \
      , [tolen]  "=a" (tolen)                                                 \
      , [toaddr] "=a" (toaddr)                                                \
      , [target] "=a" (target)                                                \
      : [len]    "r"  (_len)    /* inputs    */                               \
      : "cc"                    /* clobbered */                               \
  );                                                                          \
}

// Clear a stretch of memory, 256 < _len.
// XC_MEMZERO_256 may be used to clear shorter areas.
//
// The code
// - first zeroes a few bytes to align on a HeapWord.
//   This step is currently inactive because all calls seem
//   to have their data aligned on HeapWord boundaries.
// - then zeroes a few HeapWords to align on a cache line.
// - then zeroes entire cache lines in a loop.
// - then zeroes the remaining (partial) cache line.
// (A plain C model of this sequence is sketched below, after the
// USE_INLINE_ASM block.)
#if 1
#define XC_MEMZERO_ANY(_to,_len)                                              \
{ unsigned long toaddr; unsigned long tolen;                                  \
  unsigned long len8;   unsigned long len256;                                 \
  unsigned long target; unsigned long lenx;                                   \
  asm("\t"                                                                    \
      " LTGR    %[tolen],%[len]            \n\t" /*                      */   \
      " BRC     8,2f                       \n\t" /* do nothing for l=0   */   \
      " LG      %[toaddr],%[to]            \n\t" /* addr of data area    */   \
      " LARL    %[target],1f               \n\t" /* addr of XC instr     */   \
      " "                                                                     \
      " LCGR    %[len256],%[toaddr]        \n\t" /* cache line alignment */   \
      " NILL    %[len256],0xff             \n\t"                              \
      " BRC     8,4f                       \n\t" /* already aligned      */   \
      " NILH    %[len256],0x00             \n\t" /* zero extend          */   \
      " LLGFR   %[len256],%[len256]        \n\t"                              \
      " LAY     %[lenx],-1(,%[len256])     \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"                        \
      " SGR     %[tolen],%[len256]         \n\t" /* adjust len           */   \
      " "                                                                     \
      "4: SRAG  %[lenx],%[tolen],8         \n\t" /* # cache lines        */   \
      " BRC     8,6f                       \n\t" /* no full cache lines  */   \
      "5: XC    0(256,%[toaddr]),0(%[toaddr]) \n\t"                           \
      " LA      %[toaddr],256(,%[toaddr])  \n\t"                              \
      " BRCTG   %[lenx],5b                 \n\t" /* iterate              */   \
      " "                                                                     \
      "6: NILL  %[tolen],0xff              \n\t" /* leftover bytes       */   \
      " BRC     8,2f                       \n\t" /* done if none         */   \
      " LAY     %[lenx],-1(,%[tolen])      \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " BRC     15,2f                      \n\t" /* skip template        */   \
      " "                                                                     \
      "1: XC    0(1,%[toaddr]),0(%[toaddr]) \n\t"                             \
      "2: BCR   0,0                        \n\t" /* nop as branch target */   \
      : [to]     "+Q" (_to)     /* outputs   */                               \
      , [lenx]   "=a" (lenx)                                                  \
      , [len256] "=a" (len256)                                                \
      , [tolen]  "=a" (tolen)                                                 \
      , [toaddr] "=a" (toaddr)                                                \
      , [target] "=a" (target)                                                \
      : [len]    "r"  (_len)    /* inputs    */                               \
      : "cc"                    /* clobbered */                               \
  );                                                                          \
}
#else
#define XC_MEMZERO_ANY(_to,_len)                                              \
{ unsigned long toaddr; unsigned long tolen;                                  \
  unsigned long len8;   unsigned long len256;                                 \
  unsigned long target; unsigned long lenx;                                   \
  asm("\t"                                                                    \
      " LTGR    %[tolen],%[len]            \n\t" /*                      */   \
      " BRC     8,2f                       \n\t" /* do nothing for l=0   */   \
      " LG      %[toaddr],%[to]            \n\t" /* addr of data area    */   \
      " LARL    %[target],1f               \n\t" /* addr of XC instr     */   \
      " "                                                                     \
      " LCGR    %[len8],%[toaddr]          \n\t" /* HeapWord alignment   */   \
      " NILL    %[len8],0x07               \n\t"                              \
      " BRC     8,3f                       \n\t" /* already aligned      */   \
      " NILH    %[len8],0x00               \n\t" /* zero extend          */   \
      " LLGFR   %[len8],%[len8]            \n\t"                              \
      " LAY     %[lenx],-1(,%[len8])       \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " LA      %[toaddr],0(%[len8],%[toaddr]) \n\t"                          \
      " SGR     %[tolen],%[len8]           \n\t" /* adjust len           */   \
      " "                                                                     \
      "3: LCGR  %[len256],%[toaddr]        \n\t" /* cache line alignment */   \
      " NILL    %[len256],0xff             \n\t"                              \
      " BRC     8,4f                       \n\t" /* already aligned      */   \
      " NILH    %[len256],0x00             \n\t" /* zero extend          */   \
      " LLGFR   %[len256],%[len256]        \n\t"                              \
      " LAY     %[lenx],-1(,%[len256])     \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"                        \
      " SGR     %[tolen],%[len256]         \n\t" /* adjust len           */   \
      " "                                                                     \
      "4: SRAG  %[lenx],%[tolen],8         \n\t" /* # cache lines        */   \
      " BRC     8,6f                       \n\t" /* no full cache lines  */   \
      "5: XC    0(256,%[toaddr]),0(%[toaddr]) \n\t"                           \
      " LA      %[toaddr],256(,%[toaddr])  \n\t"                              \
      " BRCTG   %[lenx],5b                 \n\t" /* iterate              */   \
      " "                                                                     \
      "6: NILL  %[tolen],0xff              \n\t" /* leftover bytes       */   \
      " BRC     8,2f                       \n\t" /* done if none         */   \
      " LAY     %[lenx],-1(,%[tolen])      \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " BRC     15,2f                      \n\t" /* skip template        */   \
      " "                                                                     \
      "1: XC    0(1,%[toaddr]),0(%[toaddr]) \n\t"                             \
      "2: BCR   0,0                        \n\t" /* nop as branch target */   \
      : [to]     "+Q" (_to)     /* outputs   */                               \
      , [lenx]   "=a" (lenx)                                                  \
      , [len8]   "=a" (len8)                                                  \
      , [len256] "=a" (len256)                                                \
      , [tolen]  "=a" (tolen)                                                 \
      , [toaddr] "=a" (toaddr)                                                \
      , [target] "=a" (target)                                                \
      : [len]    "r"  (_len)    /* inputs    */                               \
      : "cc"                    /* clobbered */                               \
  );                                                                          \
}
#endif
#endif // USE_INLINE_ASM
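
// Illustration only (hypothetical helper, inactive): a C model of the
// XC_MEMZERO_ANY sequence above. It zeroes up to the next 256-byte (cache
// line) boundary, then whole cache lines, then the remaining partial line.
// Assumes len > 256, as required for that macro.
#if 0
static void xc_memzero_any_model(void* to, size_t len) {
  unsigned char* t = (unsigned char*)to;
  size_t align = (256 - ((unsigned long)t & 255)) & 255;  // bytes up to the next cache line
  for (size_t i = 0; i < align; i++) { t[i] = 0; }        // EXecuted XC, partial line
  t   += align;
  len -= align;
  while (len >= 256) {                                    // full cache lines
    for (size_t i = 0; i < 256; i++) { t[i] = 0; }        // XC 0(256,...),0(...)
    t   += 256;
    len -= 256;
  }
  for (size_t i = 0; i < len; i++) { t[i] = 0; }          // leftover bytes
}
#endif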

//*************************************//
//   D I S J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_disjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very frequent, some tests frequent.

  // Copy HeapWord (=DW) aligned storage. Use MVCLE in inline-asm code.
  // MVCLE guarantees DW concurrent (i.e. atomic) accesses if both the addresses of the operands
  // are DW aligned and the length is an integer multiple of a DW. Should always be true here.
  //
  // No special exploit needed. H/W discovers suitable situations itself.
  //
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) copy short piece of memory to page-align address(es)
  // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) copy the remaining piece of memory.
  //
#ifdef USE_INLINE_ASM
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count*HeapWordSize;

  // Optimized copying for data less than 4k
  switch (count) {
    case 0: return;
    case 1: MOVE8_ATOMIC_1(to,from)
            return;
    case 2: MOVE8_ATOMIC_2(to,from)
            return;
    // case 3: MOVE8_ATOMIC_3(to,from)
    //         return;
    // case 4: MOVE8_ATOMIC_4(to,from)
    //         return;
    default:
      if (len_bytes <= 4096) {
        MVC_MULTI(to,from,len_bytes)
        return;
      }
      // else
      MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
      return;
  }
#else
  // Fallback code.
  switch (count) {
    case 0:
      return;

    case 1:
      *to = *from;
      return;

    case 2:
      *to++ = *from++;
      *to = *from;
      return;

    case 3:
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    case 4:
      *to++ = *from++;
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    default:
      while (count-- > 0)
        *(to++) = *(from++);
      return;
  }
#endif
}

static void pd_disjoint_words_atomic(const HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: < 4k calls.
  assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}

static void pd_disjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very rare.
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}


//*************************************//
//   C O N J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_conjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: between some and lower end of frequent.

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from)
              return;
      case 3: COPY8_ATOMIC_3(to,from)
              return;
      case 2: COPY8_ATOMIC_2(to,from)
              return;
      case 1: COPY8_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  return;
#else
  // Fallback code.
  if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) {
    HeapWord t1, t2, t3;
    switch (count) {
      case 0:
        return;

      case 1:
        *to = *from;
        return;

      case 2:
        t1 = *(from+1);
        *to = *from;
        *(to+1) = t1;
        return;

      case 3:
        t1 = *(from+1);
        t2 = *(from+2);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        return;

      case 4:
        t1 = *(from+1);
        t2 = *(from+2);
        t3 = *(from+3);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        *(to+3) = t3;
        return;

      default:
        from += count;
        to   += count;
        while (count-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_disjoint_words(from, to, count);
#endif
}
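
// Illustration only (hypothetical helper, inactive): the overlap rule applied
// by the conjoint routines above and below. A destructively overlapping pair
// (from < to < from + byte_count) must be copied highest address first;
// any other layout may be copied forwards or handed to the MVCLE-based
// disjoint path.
#if 0
static void conjoint_copy_model(const HeapWord* from, HeapWord* to, size_t count) {
  if (has_destructive_overlap((char*)from, (char*)to, count*HeapWordSize)) {
    while (count-- > 0) { to[count] = from[count]; }         // copy backwards
  } else {
    for (size_t i = 0; i < count; i++) { to[i] = from[i]; }  // copy forwards
  }
}
#endif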

static void pd_conjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_conjoint_words(from, to, count);
}

static void pd_conjoint_bytes(const void* from, void* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in))
    (void)memmove(to, from, count_in);
  else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  if (has_destructive_overlap((char*)from, (char*)to, count))
    (void)memmove(to, from, count);
  else
    (void)memcpy(to, from, count);
#endif
}

//**************************************************//
//   C O N J O I N T   A T O M I C   C O P Y I N G  //
//**************************************************//

static void pd_conjoint_bytes_atomic(const void* from, void* to, size_t count) {
  // Call arraycopy stubs to do the job.
  pd_conjoint_bytes(from, to, count); // bytes are always accessed atomically.
}

static void pd_conjoint_jshorts_atomic(const jshort* from, jshort* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((const char*)from, (char*)to, count_in*BytesPerShort)) {
    // Use optimizations from shared code where no z-specific optimization exists.
    copy_conjoint_jshorts_atomic(from, to, count);
  } else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerShort;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jshorts_atomic(from, to, count);
#endif
}

static void pd_conjoint_jints_atomic(const jint* from, jint* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((const char*)from, (char*)to, count_in*BytesPerInt)) {
    switch (count_in) {
      case 4: COPY4_ATOMIC_4(to,from)
              return;
      case 3: COPY4_ATOMIC_3(to,from)
              return;
      case 2: COPY4_ATOMIC_2(to,from)
              return;
      case 1: COPY4_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        // Use optimizations from shared code where no z-specific optimization exists.
        copy_conjoint_jints_atomic(from, to, count_in);
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerInt;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jints_atomic(from, to, count);
#endif
}

static void pd_conjoint_jlongs_atomic(const jlong* from, jlong* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    if (count_in < 8) {
      from += count_in;
      to   += count_in;
      while (count_in-- > 0)
        *(--to) = *(--from); // Copy backwards, areas overlap destructively.
      return;
    }
    // else
    from += count_in-1;
    to   += count_in-1;
    if (count_in&0x01) {
      *(to--) = *(from--);
      count_in--;
    }
    for (; count_in>0; count_in-=2) {
      *to     = *from;
      *(to-1) = *(from-1);
      to   -= 2;
      from -= 2;
    }
  }
  else
    pd_aligned_disjoint_words((const HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
#endif
}

static void pd_conjoint_oops_atomic(const oop* from, oop* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerOop;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    from += count_in;
    to   += count_in;
    while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively.
    return;
  }
  // else
  pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
  return;
#endif
}

static void pd_arrayof_conjoint_bytes(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_bytes_atomic(from, to, count);
}

static void pd_arrayof_conjoint_jshorts(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jshorts_atomic((const jshort*)from, (jshort*)to, count);
}

static void pd_arrayof_conjoint_jints(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jints_atomic((const jint*)from, (jint*)to, count);
}

static void pd_arrayof_conjoint_jlongs(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jlongs_atomic((const jlong*)from, (jlong*)to, count);
}

static void pd_arrayof_conjoint_oops(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_oops_atomic((const oop*)from, (oop*)to, count);
}

//**********************************************//
//   M E M O R Y   I N I T I A L I S A T I O N  //
//**********************************************//

static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
  // JVM2008: very rare, only in some tests.
#ifdef USE_INLINE_ASM
  // Initialize storage to a given value. Use memset instead of copy loop.
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) init short piece of memory to page-align address
  // 2) init largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) init the remaining piece of memory.
  // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
  // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.

  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  MVCLE_MEMINIT(to_bytes, value, len_bytes)

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, value, count);
#endif
}

static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
  // Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
  // JVM2008: < 4k calls.
  if (value == 0) {
    pd_zero_to_words(tohw, count);
    return;
  }
  if (value == ~(juint)(0)) {
    pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
    return;
  }
  julong* to = (julong*) tohw;
  // Replicate the 32-bit value into both halves of a 64-bit word,
  // e.g. 0xDEADBEEF -> 0xDEADBEEFDEADBEEF.
  julong  v  = ((julong) value << 32) | value;
  while (count-- > 0) {
    *to++ = v;
  }
}

static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  // JVM2008: very frequent, but virtually all calls are with value == 0.
  pd_fill_to_words(tohw, count, value);
}

//**********************************//
//   M E M O R Y   C L E A R I N G  //
//**********************************//

// Delegate to pd_zero_to_bytes. It is also HeapWord-atomic.
// Distinguish between simple and large zero_to_words.
static void pd_zero_to_words(HeapWord* tohw, size_t count) {
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

static void pd_zero_to_bytes(void* to, size_t count) {
  // JVM2008: some calls (generally), some tests frequent.
#ifdef USE_INLINE_ASM
  // Even zero_to_bytes() requires HeapWord-atomic, or, at least, sequential
  // zeroing of the memory. MVCLE is not fit for that job:
  //   "As observed by other CPUs and by the channel subsystem,
  //    that portion of the first operand which is filled
  //    with the padding byte is not necessarily stored into in
  //    a left-to-right direction and may appear to be stored
  //    into more than once."
  // Therefore, the implementation was changed to use (multiple) XC instructions.

  const long line_size = 256;
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  if (len_bytes <= line_size) {
    XC_MEMZERO_256(to_bytes, len_bytes);
  } else {
    XC_MEMZERO_ANY(to_bytes, len_bytes);
  }

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, 0, count);
#endif
}

#endif // CPU_S390_COPY_S390_HPP