Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
PojavLauncherTeam
GitHub Repository: PojavLauncherTeam/jdk17u
Path: blob/master/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
64440 views
1
/*
2
* Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
*
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation.
8
*
9
* This code is distributed in the hope that it will be useful, but WITHOUT
10
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12
* version 2 for more details (a copy is included in the LICENSE file that
13
* accompanied this code).
14
*
15
* You should have received a copy of the GNU General Public License version
16
* 2 along with this work; if not, write to the Free Software Foundation,
17
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18
*
19
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20
* or visit www.oracle.com if you need additional information or have any
21
* questions.
22
*
23
*/
24
25
#include "precompiled.hpp"
26
#include "asm/assembler.hpp"
27
#include "asm/assembler.inline.hpp"
28
#include "oops/methodData.hpp"
29
#include "opto/c2_MacroAssembler.hpp"
30
#include "opto/intrinsicnode.hpp"
31
#include "opto/opcodes.hpp"
32
#include "opto/subnode.hpp"
33
#include "runtime/biasedLocking.hpp"
34
#include "runtime/objectMonitor.hpp"
35
#include "runtime/stubRoutines.hpp"
36
37
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
38
switch (vlen_in_bytes) {
39
case 4: // fall-through
40
case 8: // fall-through
41
case 16: return Assembler::AVX_128bit;
42
case 32: return Assembler::AVX_256bit;
43
case 64: return Assembler::AVX_512bit;
44
45
default: {
46
ShouldNotReachHere();
47
return Assembler::AVX_NoVec;
48
}
49
}
50
}
51
52
void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
53
guarantee(PostLoopMultiversioning, "must be");
54
Assembler::movl(dst, 1);
55
Assembler::shlxl(dst, dst, src);
56
Assembler::decl(dst);
57
Assembler::kmovdl(mask, dst);
58
Assembler::movl(dst, src);
59
}
60
61
void C2_MacroAssembler::restorevectmask(KRegister mask) {
62
guarantee(PostLoopMultiversioning, "must be");
63
Assembler::knotwl(mask, k0);
64
}
65
66
#if INCLUDE_RTM_OPT
67
68
// Update rtm_counters based on abort status
69
// input: abort_status
70
// rtm_counters (RTMLockingCounters*)
71
// flags are killed
72
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
73
74
atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
75
if (PrintPreciseRTMLockingStatistics) {
76
for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
77
Label check_abort;
78
testl(abort_status, (1<<i));
79
jccb(Assembler::equal, check_abort);
80
atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
81
bind(check_abort);
82
}
83
}
84
}
85
86
// Branch if (random & (count-1) != 0), count is 2^n
87
// tmp, scr and flags are killed
88
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
89
assert(tmp == rax, "");
90
assert(scr == rdx, "");
91
rdtsc(); // modifies EDX:EAX
92
andptr(tmp, count-1);
93
jccb(Assembler::notZero, brLabel);
94
}
95
96
// Perform abort ratio calculation, set no_rtm bit if high ratio
97
// input: rtm_counters_Reg (RTMLockingCounters* address)
98
// tmpReg, rtm_counters_Reg and flags are killed
99
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
100
Register rtm_counters_Reg,
101
RTMLockingCounters* rtm_counters,
102
Metadata* method_data) {
103
Label L_done, L_check_always_rtm1, L_check_always_rtm2;
104
105
if (RTMLockingCalculationDelay > 0) {
106
// Delay calculation
107
movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
108
testptr(tmpReg, tmpReg);
109
jccb(Assembler::equal, L_done);
110
}
111
// Abort ratio calculation only if abort_count > RTMAbortThreshold
112
// Aborted transactions = abort_count * 100
113
// All transactions = total_count * RTMTotalCountIncrRate
114
// Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
115
116
movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
117
cmpptr(tmpReg, RTMAbortThreshold);
118
jccb(Assembler::below, L_check_always_rtm2);
119
imulptr(tmpReg, tmpReg, 100);
120
121
Register scrReg = rtm_counters_Reg;
122
movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
123
imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
124
imulptr(scrReg, scrReg, RTMAbortRatio);
125
cmpptr(tmpReg, scrReg);
126
jccb(Assembler::below, L_check_always_rtm1);
127
if (method_data != NULL) {
128
// set rtm_state to "no rtm" in MDO
129
mov_metadata(tmpReg, method_data);
130
lock();
131
orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
132
}
133
jmpb(L_done);
134
bind(L_check_always_rtm1);
135
// Reload RTMLockingCounters* address
136
lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
137
bind(L_check_always_rtm2);
138
movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
139
cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
140
jccb(Assembler::below, L_done);
141
if (method_data != NULL) {
142
// set rtm_state to "always rtm" in MDO
143
mov_metadata(tmpReg, method_data);
144
lock();
145
orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
146
}
147
bind(L_done);
148
}
149
150
// Update counters and perform abort ratio calculation
151
// input: abort_status_Reg
152
// rtm_counters_Reg, flags are killed
153
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
154
Register rtm_counters_Reg,
155
RTMLockingCounters* rtm_counters,
156
Metadata* method_data,
157
bool profile_rtm) {
158
159
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
160
// update rtm counters based on rax value at abort
161
// reads abort_status_Reg, updates flags
162
lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
163
rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
164
if (profile_rtm) {
165
// Save abort status because abort_status_Reg is used by following code.
166
if (RTMRetryCount > 0) {
167
push(abort_status_Reg);
168
}
169
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
170
rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
171
// restore abort status
172
if (RTMRetryCount > 0) {
173
pop(abort_status_Reg);
174
}
175
}
176
}
177
178
// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
179
// inputs: retry_count_Reg
180
// : abort_status_Reg
181
// output: retry_count_Reg decremented by 1
182
// flags are killed
183
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
184
Label doneRetry;
185
assert(abort_status_Reg == rax, "");
186
// The abort reason bits are in eax (see all states in rtmLocking.hpp)
187
// 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
188
// if reason is in 0x6 and retry count != 0 then retry
189
andptr(abort_status_Reg, 0x6);
190
jccb(Assembler::zero, doneRetry);
191
testl(retry_count_Reg, retry_count_Reg);
192
jccb(Assembler::zero, doneRetry);
193
pause();
194
decrementl(retry_count_Reg);
195
jmp(retryLabel);
196
bind(doneRetry);
197
}
198
199
// Spin and retry if lock is busy,
200
// inputs: box_Reg (monitor address)
201
// : retry_count_Reg
202
// output: retry_count_Reg decremented by 1
203
// : clear z flag if retry count exceeded
204
// tmp_Reg, scr_Reg, flags are killed
205
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
206
Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
207
Label SpinLoop, SpinExit, doneRetry;
208
int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
209
210
testl(retry_count_Reg, retry_count_Reg);
211
jccb(Assembler::zero, doneRetry);
212
decrementl(retry_count_Reg);
213
movptr(scr_Reg, RTMSpinLoopCount);
214
215
bind(SpinLoop);
216
pause();
217
decrementl(scr_Reg);
218
jccb(Assembler::lessEqual, SpinExit);
219
movptr(tmp_Reg, Address(box_Reg, owner_offset));
220
testptr(tmp_Reg, tmp_Reg);
221
jccb(Assembler::notZero, SpinLoop);
222
223
bind(SpinExit);
224
jmp(retryLabel);
225
bind(doneRetry);
226
incrementl(retry_count_Reg); // clear z flag
227
}
228
229
// Use RTM for normal stack locks
230
// Input: objReg (object to lock)
231
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
232
Register retry_on_abort_count_Reg,
233
RTMLockingCounters* stack_rtm_counters,
234
Metadata* method_data, bool profile_rtm,
235
Label& DONE_LABEL, Label& IsInflated) {
236
assert(UseRTMForStackLocks, "why call this otherwise?");
237
assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
238
assert(tmpReg == rax, "");
239
assert(scrReg == rdx, "");
240
Label L_rtm_retry, L_decrement_retry, L_on_abort;
241
242
if (RTMRetryCount > 0) {
243
movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
244
bind(L_rtm_retry);
245
}
246
movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
247
testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
248
jcc(Assembler::notZero, IsInflated);
249
250
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
251
Label L_noincrement;
252
if (RTMTotalCountIncrRate > 1) {
253
// tmpReg, scrReg and flags are killed
254
branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
255
}
256
assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
257
atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
258
bind(L_noincrement);
259
}
260
xbegin(L_on_abort);
261
movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
262
andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
263
cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked
264
jcc(Assembler::equal, DONE_LABEL); // all done if unlocked
265
266
Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
267
if (UseRTMXendForLockBusy) {
268
xend();
269
movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
270
jmp(L_decrement_retry);
271
}
272
else {
273
xabort(0);
274
}
275
bind(L_on_abort);
276
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
277
rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
278
}
279
bind(L_decrement_retry);
280
if (RTMRetryCount > 0) {
281
// retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
282
rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
283
}
284
}
285
286
// Use RTM for inflating locks
287
// inputs: objReg (object to lock)
288
// boxReg (on-stack box address (displaced header location) - KILLED)
289
// tmpReg (ObjectMonitor address + markWord::monitor_value)
290
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
291
Register scrReg, Register retry_on_busy_count_Reg,
292
Register retry_on_abort_count_Reg,
293
RTMLockingCounters* rtm_counters,
294
Metadata* method_data, bool profile_rtm,
295
Label& DONE_LABEL) {
296
assert(UseRTMLocking, "why call this otherwise?");
297
assert(tmpReg == rax, "");
298
assert(scrReg == rdx, "");
299
Label L_rtm_retry, L_decrement_retry, L_on_abort;
300
int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
301
302
// Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
303
movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
304
movptr(boxReg, tmpReg); // Save ObjectMonitor address
305
306
if (RTMRetryCount > 0) {
307
movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy
308
movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
309
bind(L_rtm_retry);
310
}
311
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
312
Label L_noincrement;
313
if (RTMTotalCountIncrRate > 1) {
314
// tmpReg, scrReg and flags are killed
315
branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
316
}
317
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
318
atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
319
bind(L_noincrement);
320
}
321
xbegin(L_on_abort);
322
movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
323
movptr(tmpReg, Address(tmpReg, owner_offset));
324
testptr(tmpReg, tmpReg);
325
jcc(Assembler::zero, DONE_LABEL);
326
if (UseRTMXendForLockBusy) {
327
xend();
328
jmp(L_decrement_retry);
329
}
330
else {
331
xabort(0);
332
}
333
bind(L_on_abort);
334
Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
335
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
336
rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
337
}
338
if (RTMRetryCount > 0) {
339
// retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
340
rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
341
}
342
343
movptr(tmpReg, Address(boxReg, owner_offset)) ;
344
testptr(tmpReg, tmpReg) ;
345
jccb(Assembler::notZero, L_decrement_retry) ;
346
347
// Appears unlocked - try to swing _owner from null to non-null.
348
// Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
349
#ifdef _LP64
350
Register threadReg = r15_thread;
351
#else
352
get_thread(scrReg);
353
Register threadReg = scrReg;
354
#endif
355
lock();
356
cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
357
358
if (RTMRetryCount > 0) {
359
// success done else retry
360
jccb(Assembler::equal, DONE_LABEL) ;
361
bind(L_decrement_retry);
362
// Spin and retry if lock is busy.
363
rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
364
}
365
else {
366
bind(L_decrement_retry);
367
}
368
}
369
370
#endif // INCLUDE_RTM_OPT
371
372
// fast_lock and fast_unlock used by C2
373
374
// Because the transitions from emitted code to the runtime
375
// monitorenter/exit helper stubs are so slow it's critical that
376
// we inline both the stack-locking fast path and the inflated fast path.
377
//
378
// See also: cmpFastLock and cmpFastUnlock.
379
//
380
// What follows is a specialized inline transliteration of the code
381
// in enter() and exit(). If we're concerned about I$ bloat another
382
// option would be to emit TrySlowEnter and TrySlowExit methods
383
// at startup-time. These methods would accept arguments as
384
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
385
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
386
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
387
// In practice, however, the # of lock sites is bounded and is usually small.
388
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
389
// if the processor uses simple bimodal branch predictors keyed by EIP
390
// Since the helper routines would be called from multiple synchronization
391
// sites.
392
//
393
// An even better approach would be write "MonitorEnter()" and "MonitorExit()"
394
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
395
// to those specialized methods. That'd give us a mostly platform-independent
396
// implementation that the JITs could optimize and inline at their pleasure.
397
// Done correctly, the only time we'd need to cross to native could would be
398
// to park() or unpark() threads. We'd also need a few more unsafe operators
399
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
400
// (b) explicit barriers or fence operations.
401
//
402
// TODO:
403
//
404
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
405
// This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
406
// Given TLAB allocation, Self is usually manifested in a register, so passing it into
407
// the lock operators would typically be faster than reifying Self.
408
//
409
// * Ideally I'd define the primitives as:
410
// fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
411
// fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
412
// Unfortunately ADLC bugs prevent us from expressing the ideal form.
413
// Instead, we're stuck with a rather awkward and brittle register assignments below.
414
// Furthermore the register assignments are overconstrained, possibly resulting in
415
// sub-optimal code near the synchronization site.
416
//
417
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
418
// Alternately, use a better sp-proximity test.
419
//
420
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
421
// Either one is sufficient to uniquely identify a thread.
422
// TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
423
//
424
// * Intrinsify notify() and notifyAll() for the common cases where the
425
// object is locked by the calling thread but the waitlist is empty.
426
// avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
427
//
428
// * use jccb and jmpb instead of jcc and jmp to improve code density.
429
// But beware of excessive branch density on AMD Opterons.
430
//
431
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
432
// or failure of the fast path. If the fast path fails then we pass
433
// control to the slow path, typically in C. In fast_lock and
434
// fast_unlock we often branch to DONE_LABEL, just to find that C2
435
// will emit a conditional branch immediately after the node.
436
// So we have branches to branches and lots of ICC.ZF games.
437
// Instead, it might be better to have C2 pass a "FailureLabel"
438
// into fast_lock and fast_unlock. In the case of success, control
439
// will drop through the node. ICC.ZF is undefined at exit.
440
// In the case of failure, the node will branch directly to the
441
// FailureLabel
442
443
444
// obj: object to lock
445
// box: on-stack box address (displaced header location) - KILLED
446
// rax,: tmp -- KILLED
447
// scr: tmp -- KILLED
448
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
449
Register scrReg, Register cx1Reg, Register cx2Reg,
450
BiasedLockingCounters* counters,
451
RTMLockingCounters* rtm_counters,
452
RTMLockingCounters* stack_rtm_counters,
453
Metadata* method_data,
454
bool use_rtm, bool profile_rtm) {
455
// Ensure the register assignments are disjoint
456
assert(tmpReg == rax, "");
457
458
if (use_rtm) {
459
assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
460
} else {
461
assert(cx2Reg == noreg, "");
462
assert_different_registers(objReg, boxReg, tmpReg, scrReg);
463
}
464
465
if (counters != NULL) {
466
atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
467
}
468
469
// Possible cases that we'll encounter in fast_lock
470
// ------------------------------------------------
471
// * Inflated
472
// -- unlocked
473
// -- Locked
474
// = by self
475
// = by other
476
// * biased
477
// -- by Self
478
// -- by other
479
// * neutral
480
// * stack-locked
481
// -- by self
482
// = sp-proximity test hits
483
// = sp-proximity test generates false-negative
484
// -- by other
485
//
486
487
Label IsInflated, DONE_LABEL;
488
489
if (DiagnoseSyncOnValueBasedClasses != 0) {
490
load_klass(tmpReg, objReg, cx1Reg);
491
movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
492
testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
493
jcc(Assembler::notZero, DONE_LABEL);
494
}
495
496
// it's stack-locked, biased or neutral
497
// TODO: optimize away redundant LDs of obj->mark and improve the markword triage
498
// order to reduce the number of conditional branches in the most common cases.
499
// Beware -- there's a subtle invariant that fetch of the markword
500
// at [FETCH], below, will never observe a biased encoding (*101b).
501
// If this invariant is not held we risk exclusion (safety) failure.
502
if (UseBiasedLocking && !UseOptoBiasInlining) {
503
biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
504
}
505
506
#if INCLUDE_RTM_OPT
507
if (UseRTMForStackLocks && use_rtm) {
508
rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
509
stack_rtm_counters, method_data, profile_rtm,
510
DONE_LABEL, IsInflated);
511
}
512
#endif // INCLUDE_RTM_OPT
513
514
movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
515
testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
516
jccb(Assembler::notZero, IsInflated);
517
518
// Attempt stack-locking ...
519
orptr (tmpReg, markWord::unlocked_value);
520
movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
521
lock();
522
cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
523
if (counters != NULL) {
524
cond_inc32(Assembler::equal,
525
ExternalAddress((address)counters->fast_path_entry_count_addr()));
526
}
527
jcc(Assembler::equal, DONE_LABEL); // Success
528
529
// Recursive locking.
530
// The object is stack-locked: markword contains stack pointer to BasicLock.
531
// Locked by current thread if difference with current SP is less than one page.
532
subptr(tmpReg, rsp);
533
// Next instruction set ZFlag == 1 (Success) if difference is less then one page.
534
andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
535
movptr(Address(boxReg, 0), tmpReg);
536
if (counters != NULL) {
537
cond_inc32(Assembler::equal,
538
ExternalAddress((address)counters->fast_path_entry_count_addr()));
539
}
540
jmp(DONE_LABEL);
541
542
bind(IsInflated);
543
// The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
544
545
#if INCLUDE_RTM_OPT
546
// Use the same RTM locking code in 32- and 64-bit VM.
547
if (use_rtm) {
548
rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
549
rtm_counters, method_data, profile_rtm, DONE_LABEL);
550
} else {
551
#endif // INCLUDE_RTM_OPT
552
553
#ifndef _LP64
554
// The object is inflated.
555
556
// boxReg refers to the on-stack BasicLock in the current frame.
557
// We'd like to write:
558
// set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
559
// This is convenient but results a ST-before-CAS penalty. The following CAS suffers
560
// additional latency as we have another ST in the store buffer that must drain.
561
562
// avoid ST-before-CAS
563
// register juggle because we need tmpReg for cmpxchgptr below
564
movptr(scrReg, boxReg);
565
movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
566
567
// Optimistic form: consider XORL tmpReg,tmpReg
568
movptr(tmpReg, NULL_WORD);
569
570
// Appears unlocked - try to swing _owner from null to non-null.
571
// Ideally, I'd manifest "Self" with get_thread and then attempt
572
// to CAS the register containing Self into m->Owner.
573
// But we don't have enough registers, so instead we can either try to CAS
574
// rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
575
// we later store "Self" into m->Owner. Transiently storing a stack address
576
// (rsp or the address of the box) into m->owner is harmless.
577
// Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
578
lock();
579
cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
580
movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
581
// If we weren't able to swing _owner from NULL to the BasicLock
582
// then take the slow path.
583
jccb (Assembler::notZero, DONE_LABEL);
584
// update _owner from BasicLock to thread
585
get_thread (scrReg); // beware: clobbers ICCs
586
movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
587
xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success
588
589
// If the CAS fails we can either retry or pass control to the slow path.
590
// We use the latter tactic.
591
// Pass the CAS result in the icc.ZFlag into DONE_LABEL
592
// If the CAS was successful ...
593
// Self has acquired the lock
594
// Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
595
// Intentional fall-through into DONE_LABEL ...
596
#else // _LP64
597
// It's inflated and we use scrReg for ObjectMonitor* in this section.
598
movq(scrReg, tmpReg);
599
xorq(tmpReg, tmpReg);
600
lock();
601
cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
602
// Unconditionally set box->_displaced_header = markWord::unused_mark().
603
// Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
604
movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
605
// Propagate ICC.ZF from CAS above into DONE_LABEL.
606
jcc(Assembler::equal, DONE_LABEL); // CAS above succeeded; propagate ZF = 1 (success)
607
608
cmpptr(r15_thread, rax); // Check if we are already the owner (recursive lock)
609
jcc(Assembler::notEqual, DONE_LABEL); // If not recursive, ZF = 0 at this point (fail)
610
incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
611
xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
612
#endif // _LP64
613
#if INCLUDE_RTM_OPT
614
} // use_rtm()
615
#endif
616
// DONE_LABEL is a hot target - we'd really like to place it at the
617
// start of cache line by padding with NOPs.
618
// See the AMD and Intel software optimization manuals for the
619
// most efficient "long" NOP encodings.
620
// Unfortunately none of our alignment mechanisms suffice.
621
bind(DONE_LABEL);
622
623
// At DONE_LABEL the icc ZFlag is set as follows ...
624
// fast_unlock uses the same protocol.
625
// ZFlag == 1 -> Success
626
// ZFlag == 0 -> Failure - force control through the slow path
627
}
628
629
// obj: object to unlock
630
// box: box address (displaced header location), killed. Must be EAX.
631
// tmp: killed, cannot be obj nor box.
632
//
633
// Some commentary on balanced locking:
634
//
635
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
636
// Methods that don't have provably balanced locking are forced to run in the
637
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
638
// The interpreter provides two properties:
639
// I1: At return-time the interpreter automatically and quietly unlocks any
640
// objects acquired the current activation (frame). Recall that the
641
// interpreter maintains an on-stack list of locks currently held by
642
// a frame.
643
// I2: If a method attempts to unlock an object that is not held by the
644
// the frame the interpreter throws IMSX.
645
//
646
// Lets say A(), which has provably balanced locking, acquires O and then calls B().
647
// B() doesn't have provably balanced locking so it runs in the interpreter.
648
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
649
// is still locked by A().
650
//
651
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
652
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
653
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
654
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
655
// Arguably given that the spec legislates the JNI case as undefined our implementation
656
// could reasonably *avoid* checking owner in fast_unlock().
657
// In the interest of performance we elide m->Owner==Self check in unlock.
658
// A perfectly viable alternative is to elide the owner check except when
659
// Xcheck:jni is enabled.
660
661
void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
662
assert(boxReg == rax, "");
663
assert_different_registers(objReg, boxReg, tmpReg);
664
665
Label DONE_LABEL, Stacked, CheckSucc;
666
667
// Critically, the biased locking test must have precedence over
668
// and appear before the (box->dhw == 0) recursive stack-lock test.
669
if (UseBiasedLocking && !UseOptoBiasInlining) {
670
biased_locking_exit(objReg, tmpReg, DONE_LABEL);
671
}
672
673
#if INCLUDE_RTM_OPT
674
if (UseRTMForStackLocks && use_rtm) {
675
assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
676
Label L_regular_unlock;
677
movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
678
andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
679
cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked
680
jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock
681
xend(); // otherwise end...
682
jmp(DONE_LABEL); // ... and we're done
683
bind(L_regular_unlock);
684
}
685
#endif
686
687
cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
688
jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock
689
movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
690
testptr(tmpReg, markWord::monitor_value); // Inflated?
691
jccb (Assembler::zero, Stacked);
692
693
// It's inflated.
694
#if INCLUDE_RTM_OPT
695
if (use_rtm) {
696
Label L_regular_inflated_unlock;
697
int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
698
movptr(boxReg, Address(tmpReg, owner_offset));
699
testptr(boxReg, boxReg);
700
jccb(Assembler::notZero, L_regular_inflated_unlock);
701
xend();
702
jmpb(DONE_LABEL);
703
bind(L_regular_inflated_unlock);
704
}
705
#endif
706
707
// Despite our balanced locking property we still check that m->_owner == Self
708
// as java routines or native JNI code called by this thread might
709
// have released the lock.
710
// Refer to the comments in synchronizer.cpp for how we might encode extra
711
// state in _succ so we can avoid fetching EntryList|cxq.
712
//
713
// If there's no contention try a 1-0 exit. That is, exit without
714
// a costly MEMBAR or CAS. See synchronizer.cpp for details on how
715
// we detect and recover from the race that the 1-0 exit admits.
716
//
717
// Conceptually fast_unlock() must execute a STST|LDST "release" barrier
718
// before it STs null into _owner, releasing the lock. Updates
719
// to data protected by the critical section must be visible before
720
// we drop the lock (and thus before any other thread could acquire
721
// the lock and observe the fields protected by the lock).
722
// IA32's memory-model is SPO, so STs are ordered with respect to
723
// each other and there's no need for an explicit barrier (fence).
724
// See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
725
#ifndef _LP64
726
get_thread (boxReg);
727
728
// Note that we could employ various encoding schemes to reduce
729
// the number of loads below (currently 4) to just 2 or 3.
730
// Refer to the comments in synchronizer.cpp.
731
// In practice the chain of fetches doesn't seem to impact performance, however.
732
xorptr(boxReg, boxReg);
733
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
734
jccb (Assembler::notZero, DONE_LABEL);
735
movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
736
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
737
jccb (Assembler::notZero, CheckSucc);
738
movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
739
jmpb (DONE_LABEL);
740
741
bind (Stacked);
742
// It's not inflated and it's not recursively stack-locked and it's not biased.
743
// It must be stack-locked.
744
// Try to reset the header to displaced header.
745
// The "box" value on the stack is stable, so we can reload
746
// and be assured we observe the same value as above.
747
movptr(tmpReg, Address(boxReg, 0));
748
lock();
749
cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
750
// Intention fall-thru into DONE_LABEL
751
752
// DONE_LABEL is a hot target - we'd really like to place it at the
753
// start of cache line by padding with NOPs.
754
// See the AMD and Intel software optimization manuals for the
755
// most efficient "long" NOP encodings.
756
// Unfortunately none of our alignment mechanisms suffice.
757
bind (CheckSucc);
758
#else // _LP64
759
// It's inflated
760
Label LNotRecursive, LSuccess, LGoSlowPath;
761
762
cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
763
jccb(Assembler::equal, LNotRecursive);
764
765
// Recursive inflated unlock
766
decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
767
jmpb(LSuccess);
768
769
bind(LNotRecursive);
770
movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
771
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
772
jccb (Assembler::notZero, CheckSucc);
773
// Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
774
movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
775
jmpb (DONE_LABEL);
776
777
// Try to avoid passing control into the slow_path ...
778
bind (CheckSucc);
779
780
// The following optional optimization can be elided if necessary
781
// Effectively: if (succ == null) goto slow path
782
// The code reduces the window for a race, however,
783
// and thus benefits performance.
784
cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
785
jccb (Assembler::zero, LGoSlowPath);
786
787
xorptr(boxReg, boxReg);
788
// Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
789
movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
790
791
// Memory barrier/fence
792
// Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
793
// Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
794
// This is faster on Nehalem and AMD Shanghai/Barcelona.
795
// See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
796
// We might also restructure (ST Owner=0;barrier;LD _Succ) to
797
// (mov box,0; xchgq box, &m->Owner; LD _succ) .
798
lock(); addl(Address(rsp, 0), 0);
799
800
cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
801
jccb (Assembler::notZero, LSuccess);
802
803
// Rare inopportune interleaving - race.
804
// The successor vanished in the small window above.
805
// The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
806
// We need to ensure progress and succession.
807
// Try to reacquire the lock.
808
// If that fails then the new owner is responsible for succession and this
809
// thread needs to take no further action and can exit via the fast path (success).
810
// If the re-acquire succeeds then pass control into the slow path.
811
// As implemented, this latter mode is horrible because we generated more
812
// coherence traffic on the lock *and* artifically extended the critical section
813
// length while by virtue of passing control into the slow path.
814
815
// box is really RAX -- the following CMPXCHG depends on that binding
816
// cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
817
lock();
818
cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
819
// There's no successor so we tried to regrab the lock.
820
// If that didn't work, then another thread grabbed the
821
// lock so we're done (and exit was a success).
822
jccb (Assembler::notEqual, LSuccess);
823
// Intentional fall-through into slow path
824
825
bind (LGoSlowPath);
826
orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
827
jmpb (DONE_LABEL);
828
829
bind (LSuccess);
830
testl (boxReg, 0); // set ICC.ZF=1 to indicate success
831
jmpb (DONE_LABEL);
832
833
bind (Stacked);
834
movptr(tmpReg, Address (boxReg, 0)); // re-fetch
835
lock();
836
cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
837
838
#endif
839
bind(DONE_LABEL);
840
}
841
842
//-------------------------------------------------------------------------------------------
843
// Generic instructions support for use in .ad files C2 code generation
844
845
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
846
if (dst != src) {
847
movdqu(dst, src);
848
}
849
if (opcode == Op_AbsVD) {
850
andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
851
} else {
852
assert((opcode == Op_NegVD),"opcode should be Op_NegD");
853
xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
854
}
855
}
856
857
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
858
if (opcode == Op_AbsVD) {
859
vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
860
} else {
861
assert((opcode == Op_NegVD),"opcode should be Op_NegD");
862
vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
863
}
864
}
865
866
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
867
if (dst != src) {
868
movdqu(dst, src);
869
}
870
if (opcode == Op_AbsVF) {
871
andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
872
} else {
873
assert((opcode == Op_NegVF),"opcode should be Op_NegF");
874
xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
875
}
876
}
877
878
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
879
if (opcode == Op_AbsVF) {
880
vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
881
} else {
882
assert((opcode == Op_NegVF),"opcode should be Op_NegF");
883
vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
884
}
885
}
886
887
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
888
assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
889
assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
890
891
if (opcode == Op_MinV) {
892
if (elem_bt == T_BYTE) {
893
pminsb(dst, src);
894
} else if (elem_bt == T_SHORT) {
895
pminsw(dst, src);
896
} else if (elem_bt == T_INT) {
897
pminsd(dst, src);
898
} else {
899
assert(elem_bt == T_LONG, "required");
900
assert(tmp == xmm0, "required");
901
assert_different_registers(dst, src, tmp);
902
movdqu(xmm0, dst);
903
pcmpgtq(xmm0, src);
904
blendvpd(dst, src); // xmm0 as mask
905
}
906
} else { // opcode == Op_MaxV
907
if (elem_bt == T_BYTE) {
908
pmaxsb(dst, src);
909
} else if (elem_bt == T_SHORT) {
910
pmaxsw(dst, src);
911
} else if (elem_bt == T_INT) {
912
pmaxsd(dst, src);
913
} else {
914
assert(elem_bt == T_LONG, "required");
915
assert(tmp == xmm0, "required");
916
assert_different_registers(dst, src, tmp);
917
movdqu(xmm0, src);
918
pcmpgtq(xmm0, dst);
919
blendvpd(dst, src); // xmm0 as mask
920
}
921
}
922
}
923
924
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
925
XMMRegister dst, XMMRegister src1, XMMRegister src2,
926
int vlen_enc) {
927
assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
928
929
if (opcode == Op_MinV) {
930
if (elem_bt == T_BYTE) {
931
vpminsb(dst, src1, src2, vlen_enc);
932
} else if (elem_bt == T_SHORT) {
933
vpminsw(dst, src1, src2, vlen_enc);
934
} else if (elem_bt == T_INT) {
935
vpminsd(dst, src1, src2, vlen_enc);
936
} else {
937
assert(elem_bt == T_LONG, "required");
938
if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
939
vpminsq(dst, src1, src2, vlen_enc);
940
} else {
941
assert_different_registers(dst, src1, src2);
942
vpcmpgtq(dst, src1, src2, vlen_enc);
943
vblendvpd(dst, src1, src2, dst, vlen_enc);
944
}
945
}
946
} else { // opcode == Op_MaxV
947
if (elem_bt == T_BYTE) {
948
vpmaxsb(dst, src1, src2, vlen_enc);
949
} else if (elem_bt == T_SHORT) {
950
vpmaxsw(dst, src1, src2, vlen_enc);
951
} else if (elem_bt == T_INT) {
952
vpmaxsd(dst, src1, src2, vlen_enc);
953
} else {
954
assert(elem_bt == T_LONG, "required");
955
if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
956
vpmaxsq(dst, src1, src2, vlen_enc);
957
} else {
958
assert_different_registers(dst, src1, src2);
959
vpcmpgtq(dst, src1, src2, vlen_enc);
960
vblendvpd(dst, src2, src1, dst, vlen_enc);
961
}
962
}
963
}
964
}
965
966
// Float/Double min max
967
968
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
969
XMMRegister dst, XMMRegister a, XMMRegister b,
970
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
971
int vlen_enc) {
972
assert(UseAVX > 0, "required");
973
assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
974
opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
975
assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
976
assert_different_registers(a, b, tmp, atmp, btmp);
977
978
bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
979
bool is_double_word = is_double_word_type(elem_bt);
980
981
if (!is_double_word && is_min) {
982
vblendvps(atmp, a, b, a, vlen_enc);
983
vblendvps(btmp, b, a, a, vlen_enc);
984
vminps(tmp, atmp, btmp, vlen_enc);
985
vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
986
vblendvps(dst, tmp, atmp, btmp, vlen_enc);
987
} else if (!is_double_word && !is_min) {
988
vblendvps(btmp, b, a, b, vlen_enc);
989
vblendvps(atmp, a, b, b, vlen_enc);
990
vmaxps(tmp, atmp, btmp, vlen_enc);
991
vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
992
vblendvps(dst, tmp, atmp, btmp, vlen_enc);
993
} else if (is_double_word && is_min) {
994
vblendvpd(atmp, a, b, a, vlen_enc);
995
vblendvpd(btmp, b, a, a, vlen_enc);
996
vminpd(tmp, atmp, btmp, vlen_enc);
997
vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
998
vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
999
} else {
1000
assert(is_double_word && !is_min, "sanity");
1001
vblendvpd(btmp, b, a, b, vlen_enc);
1002
vblendvpd(atmp, a, b, b, vlen_enc);
1003
vmaxpd(tmp, atmp, btmp, vlen_enc);
1004
vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1005
vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1006
}
1007
}
1008
1009
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1010
XMMRegister dst, XMMRegister a, XMMRegister b,
1011
KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1012
int vlen_enc) {
1013
assert(UseAVX > 2, "required");
1014
assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1015
opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1016
assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1017
assert_different_registers(dst, a, b, atmp, btmp);
1018
1019
bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1020
bool is_double_word = is_double_word_type(elem_bt);
1021
bool merge = true;
1022
1023
if (!is_double_word && is_min) {
1024
evpmovd2m(ktmp, a, vlen_enc);
1025
evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1026
evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1027
vminps(dst, atmp, btmp, vlen_enc);
1028
evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1029
evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1030
} else if (!is_double_word && !is_min) {
1031
evpmovd2m(ktmp, b, vlen_enc);
1032
evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1033
evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1034
vmaxps(dst, atmp, btmp, vlen_enc);
1035
evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1036
evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1037
} else if (is_double_word && is_min) {
1038
evpmovq2m(ktmp, a, vlen_enc);
1039
evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1040
evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1041
vminpd(dst, atmp, btmp, vlen_enc);
1042
evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1043
evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1044
} else {
1045
assert(is_double_word && !is_min, "sanity");
1046
evpmovq2m(ktmp, b, vlen_enc);
1047
evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1048
evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1049
vmaxpd(dst, atmp, btmp, vlen_enc);
1050
evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1051
evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1052
}
1053
}
1054
1055
// Float/Double signum
1056
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1057
XMMRegister zero, XMMRegister one,
1058
Register scratch) {
1059
assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1060
1061
Label DONE_LABEL;
1062
1063
if (opcode == Op_SignumF) {
1064
assert(UseSSE > 0, "required");
1065
ucomiss(dst, zero);
1066
jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1067
jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1068
movflt(dst, one);
1069
jcc(Assembler::above, DONE_LABEL);
1070
xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1071
} else if (opcode == Op_SignumD) {
1072
assert(UseSSE > 1, "required");
1073
ucomisd(dst, zero);
1074
jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1075
jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1076
movdbl(dst, one);
1077
jcc(Assembler::above, DONE_LABEL);
1078
xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1079
}
1080
1081
bind(DONE_LABEL);
1082
}
1083
1084
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1085
if (sign) {
1086
pmovsxbw(dst, src);
1087
} else {
1088
pmovzxbw(dst, src);
1089
}
1090
}
1091
1092
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1093
if (sign) {
1094
vpmovsxbw(dst, src, vector_len);
1095
} else {
1096
vpmovzxbw(dst, src, vector_len);
1097
}
1098
}
1099
1100
void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1101
if (sign) {
1102
vpmovsxbd(dst, src, vector_len);
1103
} else {
1104
vpmovzxbd(dst, src, vector_len);
1105
}
1106
}
1107
1108
void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1109
if (sign) {
1110
vpmovsxwd(dst, src, vector_len);
1111
} else {
1112
vpmovzxwd(dst, src, vector_len);
1113
}
1114
}
1115
1116
void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1117
int shift, int vector_len) {
1118
if (opcode == Op_RotateLeftV) {
1119
if (etype == T_INT) {
1120
evprold(dst, src, shift, vector_len);
1121
} else {
1122
assert(etype == T_LONG, "expected type T_LONG");
1123
evprolq(dst, src, shift, vector_len);
1124
}
1125
} else {
1126
assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1127
if (etype == T_INT) {
1128
evprord(dst, src, shift, vector_len);
1129
} else {
1130
assert(etype == T_LONG, "expected type T_LONG");
1131
evprorq(dst, src, shift, vector_len);
1132
}
1133
}
1134
}
1135
1136
void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1137
XMMRegister shift, int vector_len) {
1138
if (opcode == Op_RotateLeftV) {
1139
if (etype == T_INT) {
1140
evprolvd(dst, src, shift, vector_len);
1141
} else {
1142
assert(etype == T_LONG, "expected type T_LONG");
1143
evprolvq(dst, src, shift, vector_len);
1144
}
1145
} else {
1146
assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1147
if (etype == T_INT) {
1148
evprorvd(dst, src, shift, vector_len);
1149
} else {
1150
assert(etype == T_LONG, "expected type T_LONG");
1151
evprorvq(dst, src, shift, vector_len);
1152
}
1153
}
1154
}
1155
1156
void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1157
if (opcode == Op_RShiftVI) {
1158
psrad(dst, shift);
1159
} else if (opcode == Op_LShiftVI) {
1160
pslld(dst, shift);
1161
} else {
1162
assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1163
psrld(dst, shift);
1164
}
1165
}
1166
1167
void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1168
switch (opcode) {
1169
case Op_RShiftVI: psrad(dst, shift); break;
1170
case Op_LShiftVI: pslld(dst, shift); break;
1171
case Op_URShiftVI: psrld(dst, shift); break;
1172
1173
default: assert(false, "%s", NodeClassNames[opcode]);
1174
}
1175
}
1176
1177
void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1178
if (opcode == Op_RShiftVI) {
1179
vpsrad(dst, nds, shift, vector_len);
1180
} else if (opcode == Op_LShiftVI) {
1181
vpslld(dst, nds, shift, vector_len);
1182
} else {
1183
assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1184
vpsrld(dst, nds, shift, vector_len);
1185
}
1186
}
1187
1188
void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1189
switch (opcode) {
1190
case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1191
case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1192
case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1193
1194
default: assert(false, "%s", NodeClassNames[opcode]);
1195
}
1196
}
1197
1198
void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1199
switch (opcode) {
1200
case Op_RShiftVB: // fall-through
1201
case Op_RShiftVS: psraw(dst, shift); break;
1202
1203
case Op_LShiftVB: // fall-through
1204
case Op_LShiftVS: psllw(dst, shift); break;
1205
1206
case Op_URShiftVS: // fall-through
1207
case Op_URShiftVB: psrlw(dst, shift); break;
1208
1209
default: assert(false, "%s", NodeClassNames[opcode]);
1210
}
1211
}
1212
1213
void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1214
switch (opcode) {
1215
case Op_RShiftVB: // fall-through
1216
case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1217
1218
case Op_LShiftVB: // fall-through
1219
case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1220
1221
case Op_URShiftVS: // fall-through
1222
case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1223
1224
default: assert(false, "%s", NodeClassNames[opcode]);
1225
}
1226
}
1227
1228
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1229
switch (opcode) {
1230
case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1231
case Op_LShiftVL: psllq(dst, shift); break;
1232
case Op_URShiftVL: psrlq(dst, shift); break;
1233
1234
default: assert(false, "%s", NodeClassNames[opcode]);
1235
}
1236
}
1237
1238
void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1239
if (opcode == Op_RShiftVL) {
1240
psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1241
} else if (opcode == Op_LShiftVL) {
1242
psllq(dst, shift);
1243
} else {
1244
assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1245
psrlq(dst, shift);
1246
}
1247
}
1248
1249
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1250
switch (opcode) {
1251
case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1252
case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1253
case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1254
1255
default: assert(false, "%s", NodeClassNames[opcode]);
1256
}
1257
}
1258
1259
void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1260
if (opcode == Op_RShiftVL) {
1261
evpsraq(dst, nds, shift, vector_len);
1262
} else if (opcode == Op_LShiftVL) {
1263
vpsllq(dst, nds, shift, vector_len);
1264
} else {
1265
assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1266
vpsrlq(dst, nds, shift, vector_len);
1267
}
1268
}
1269
1270
void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1271
switch (opcode) {
1272
case Op_RShiftVB: // fall-through
1273
case Op_RShiftVS: // fall-through
1274
case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1275
1276
case Op_LShiftVB: // fall-through
1277
case Op_LShiftVS: // fall-through
1278
case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1279
1280
case Op_URShiftVB: // fall-through
1281
case Op_URShiftVS: // fall-through
1282
case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1283
1284
default: assert(false, "%s", NodeClassNames[opcode]);
1285
}
1286
}
1287
1288
void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1289
switch (opcode) {
1290
case Op_RShiftVB: // fall-through
1291
case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1292
1293
case Op_LShiftVB: // fall-through
1294
case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1295
1296
case Op_URShiftVB: // fall-through
1297
case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1298
1299
default: assert(false, "%s", NodeClassNames[opcode]);
1300
}
1301
}
1302
1303
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1304
assert(UseAVX >= 2, "required");
1305
switch (opcode) {
1306
case Op_RShiftVL: {
1307
if (UseAVX > 2) {
1308
assert(tmp == xnoreg, "not used");
1309
if (!VM_Version::supports_avx512vl()) {
1310
vlen_enc = Assembler::AVX_512bit;
1311
}
1312
evpsravq(dst, src, shift, vlen_enc);
1313
} else {
1314
vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1315
vpsrlvq(dst, src, shift, vlen_enc);
1316
vpsrlvq(tmp, tmp, shift, vlen_enc);
1317
vpxor(dst, dst, tmp, vlen_enc);
1318
vpsubq(dst, dst, tmp, vlen_enc);
1319
}
1320
break;
1321
}
1322
case Op_LShiftVL: {
1323
assert(tmp == xnoreg, "not used");
1324
vpsllvq(dst, src, shift, vlen_enc);
1325
break;
1326
}
1327
case Op_URShiftVL: {
1328
assert(tmp == xnoreg, "not used");
1329
vpsrlvq(dst, src, shift, vlen_enc);
1330
break;
1331
}
1332
default: assert(false, "%s", NodeClassNames[opcode]);
1333
}
1334
}
1335
1336
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1337
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1338
assert(opcode == Op_LShiftVB ||
1339
opcode == Op_RShiftVB ||
1340
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1341
bool sign = (opcode != Op_URShiftVB);
1342
assert(vector_len == 0, "required");
1343
vextendbd(sign, dst, src, 1);
1344
vpmovzxbd(vtmp, shift, 1);
1345
varshiftd(opcode, dst, dst, vtmp, 1);
1346
vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1347
vextracti128_high(vtmp, dst);
1348
vpackusdw(dst, dst, vtmp, 0);
1349
}
1350
1351
// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1352
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1353
assert(opcode == Op_LShiftVB ||
1354
opcode == Op_RShiftVB ||
1355
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1356
bool sign = (opcode != Op_URShiftVB);
1357
int ext_vector_len = vector_len + 1;
1358
vextendbw(sign, dst, src, ext_vector_len);
1359
vpmovzxbw(vtmp, shift, ext_vector_len);
1360
varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1361
vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1362
if (vector_len == 0) {
1363
vextracti128_high(vtmp, dst);
1364
vpackuswb(dst, dst, vtmp, vector_len);
1365
} else {
1366
vextracti64x4_high(vtmp, dst);
1367
vpackuswb(dst, dst, vtmp, vector_len);
1368
vpermq(dst, dst, 0xD8, vector_len);
1369
}
1370
}
1371
1372
void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1373
switch(typ) {
1374
case T_BYTE:
1375
pinsrb(dst, val, idx);
1376
break;
1377
case T_SHORT:
1378
pinsrw(dst, val, idx);
1379
break;
1380
case T_INT:
1381
pinsrd(dst, val, idx);
1382
break;
1383
case T_LONG:
1384
pinsrq(dst, val, idx);
1385
break;
1386
default:
1387
assert(false,"Should not reach here.");
1388
break;
1389
}
1390
}
1391
1392
void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1393
switch(typ) {
1394
case T_BYTE:
1395
vpinsrb(dst, src, val, idx);
1396
break;
1397
case T_SHORT:
1398
vpinsrw(dst, src, val, idx);
1399
break;
1400
case T_INT:
1401
vpinsrd(dst, src, val, idx);
1402
break;
1403
case T_LONG:
1404
vpinsrq(dst, src, val, idx);
1405
break;
1406
default:
1407
assert(false,"Should not reach here.");
1408
break;
1409
}
1410
}
1411
1412
void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1413
switch(typ) {
1414
case T_INT:
1415
vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1416
break;
1417
case T_FLOAT:
1418
vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1419
break;
1420
case T_LONG:
1421
vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1422
break;
1423
case T_DOUBLE:
1424
vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1425
break;
1426
default:
1427
assert(false,"Should not reach here.");
1428
break;
1429
}
1430
}
1431
1432
void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1433
switch(typ) {
1434
case T_INT:
1435
evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1436
break;
1437
case T_FLOAT:
1438
evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1439
break;
1440
case T_LONG:
1441
evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1442
break;
1443
case T_DOUBLE:
1444
evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1445
break;
1446
default:
1447
assert(false,"Should not reach here.");
1448
break;
1449
}
1450
}
1451
1452
void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1453
switch(typ) {
1454
case T_INT:
1455
evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1456
break;
1457
case T_FLOAT:
1458
evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1459
break;
1460
case T_LONG:
1461
evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1462
break;
1463
case T_DOUBLE:
1464
evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1465
break;
1466
default:
1467
assert(false,"Should not reach here.");
1468
break;
1469
}
1470
}
1471
1472
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1473
if (vlen_in_bytes <= 16) {
1474
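// The incoming mask holds a 0/1 byte per element; 0 - src turns each 1 into
// 0xFF and keeps 0 as 0x00, and the sign-extending moves below widen that
// all-zeros/all-ones byte up to the requested element width.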
pxor (dst, dst);
1475
psubb(dst, src);
1476
switch (elem_bt) {
1477
case T_BYTE: /* nothing to do */ break;
1478
case T_SHORT: pmovsxbw(dst, dst); break;
1479
case T_INT: pmovsxbd(dst, dst); break;
1480
case T_FLOAT: pmovsxbd(dst, dst); break;
1481
case T_LONG: pmovsxbq(dst, dst); break;
1482
case T_DOUBLE: pmovsxbq(dst, dst); break;
1483
1484
default: assert(false, "%s", type2name(elem_bt));
1485
}
1486
} else {
1487
assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1488
int vlen_enc = vector_length_encoding(vlen_in_bytes);
1489
1490
vpxor (dst, dst, dst, vlen_enc);
1491
vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1492
1493
switch (elem_bt) {
1494
case T_BYTE: /* nothing to do */ break;
1495
case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
1496
case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
1497
case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
1498
case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
1499
case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1500
1501
default: assert(false, "%s", type2name(elem_bt));
1502
}
1503
}
1504
}
1505
1506
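// Loads the first vlen_in_bytes entries of the iota constant (consecutive byte
// indices 0, 1, 2, ...) using the narrowest load that covers the requested size.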
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1507
ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1508
if (vlen_in_bytes == 4) {
1509
movdl(dst, addr);
1510
} else if (vlen_in_bytes == 8) {
1511
movq(dst, addr);
1512
} else if (vlen_in_bytes == 16) {
1513
movdqu(dst, addr, scratch);
1514
} else if (vlen_in_bytes == 32) {
1515
vmovdqu(dst, addr, scratch);
1516
} else {
1517
assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1518
evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1519
}
1520
}
1521
1522
// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1523
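// The integral reductions below all follow the same pattern: repeatedly fold the
// upper half of the vector onto the lower half (vextract*/pshufd, or horizontal
// adds for the integer-add case) applying the reduction op at each step, then
// combine the surviving lane with the incoming scalar in src1 and move the result
// to the destination GPR. reduce_operation_128/256 supply the per-type lane-wise op.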
1524
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1525
int vector_len = Assembler::AVX_128bit;
1526
1527
switch (opcode) {
1528
case Op_AndReductionV: pand(dst, src); break;
1529
case Op_OrReductionV: por (dst, src); break;
1530
case Op_XorReductionV: pxor(dst, src); break;
1531
case Op_MinReductionV:
1532
switch (typ) {
1533
case T_BYTE: pminsb(dst, src); break;
1534
case T_SHORT: pminsw(dst, src); break;
1535
case T_INT: pminsd(dst, src); break;
1536
case T_LONG: assert(UseAVX > 2, "required");
1537
vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1538
default: assert(false, "wrong type");
1539
}
1540
break;
1541
case Op_MaxReductionV:
1542
switch (typ) {
1543
case T_BYTE: pmaxsb(dst, src); break;
1544
case T_SHORT: pmaxsw(dst, src); break;
1545
case T_INT: pmaxsd(dst, src); break;
1546
case T_LONG: assert(UseAVX > 2, "required");
1547
vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1548
default: assert(false, "wrong type");
1549
}
1550
break;
1551
case Op_AddReductionVF: addss(dst, src); break;
1552
case Op_AddReductionVD: addsd(dst, src); break;
1553
case Op_AddReductionVI:
1554
switch (typ) {
1555
case T_BYTE: paddb(dst, src); break;
1556
case T_SHORT: paddw(dst, src); break;
1557
case T_INT: paddd(dst, src); break;
1558
default: assert(false, "wrong type");
1559
}
1560
break;
1561
case Op_AddReductionVL: paddq(dst, src); break;
1562
case Op_MulReductionVF: mulss(dst, src); break;
1563
case Op_MulReductionVD: mulsd(dst, src); break;
1564
case Op_MulReductionVI:
1565
switch (typ) {
1566
case T_SHORT: pmullw(dst, src); break;
1567
case T_INT: pmulld(dst, src); break;
1568
default: assert(false, "wrong type");
1569
}
1570
break;
1571
case Op_MulReductionVL: assert(UseAVX > 2, "required");
1572
vpmullq(dst, dst, src, vector_len); break;
1573
default: assert(false, "wrong opcode");
1574
}
1575
}
1576
1577
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1578
int vector_len = Assembler::AVX_256bit;
1579
1580
switch (opcode) {
1581
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
1582
case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
1583
case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
1584
case Op_MinReductionV:
1585
switch (typ) {
1586
case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
1587
case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
1588
case T_INT: vpminsd(dst, src1, src2, vector_len); break;
1589
case T_LONG: assert(UseAVX > 2, "required");
1590
vpminsq(dst, src1, src2, vector_len); break;
1591
default: assert(false, "wrong type");
1592
}
1593
break;
1594
case Op_MaxReductionV:
1595
switch (typ) {
1596
case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
1597
case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
1598
case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
1599
case T_LONG: assert(UseAVX > 2, "required");
1600
vpmaxsq(dst, src1, src2, vector_len); break;
1601
default: assert(false, "wrong type");
1602
}
1603
break;
1604
case Op_AddReductionVI:
1605
switch (typ) {
1606
case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
1607
case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
1608
case T_INT: vpaddd(dst, src1, src2, vector_len); break;
1609
default: assert(false, "wrong type");
1610
}
1611
break;
1612
case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1613
case Op_MulReductionVI:
1614
switch (typ) {
1615
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
1616
case T_INT: vpmulld(dst, src1, src2, vector_len); break;
1617
default: assert(false, "wrong type");
1618
}
1619
break;
1620
case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1621
default: assert(false, "wrong opcode");
1622
}
1623
}
1624
1625
void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1626
XMMRegister dst, XMMRegister src,
1627
XMMRegister vtmp1, XMMRegister vtmp2) {
1628
switch (opcode) {
1629
case Op_AddReductionVF:
1630
case Op_MulReductionVF:
1631
reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1632
break;
1633
1634
case Op_AddReductionVD:
1635
case Op_MulReductionVD:
1636
reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1637
break;
1638
1639
default: assert(false, "wrong opcode");
1640
}
1641
}
1642
1643
void C2_MacroAssembler::reduceB(int opcode, int vlen,
1644
Register dst, Register src1, XMMRegister src2,
1645
XMMRegister vtmp1, XMMRegister vtmp2) {
1646
switch (vlen) {
1647
case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1648
case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1649
case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1650
case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1651
1652
default: assert(false, "wrong vector length");
1653
}
1654
}
1655
1656
void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1657
Register dst, Register src1, XMMRegister src2,
1658
XMMRegister vtmp1, XMMRegister vtmp2) {
1659
switch (vlen) {
1660
case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1661
case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1662
case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1663
case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1664
1665
default: assert(false, "wrong vector length");
1666
}
1667
}
1668
1669
void C2_MacroAssembler::reduceS(int opcode, int vlen,
1670
Register dst, Register src1, XMMRegister src2,
1671
XMMRegister vtmp1, XMMRegister vtmp2) {
1672
switch (vlen) {
1673
case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1674
case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1675
case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1676
case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1677
1678
default: assert(false, "wrong vector length");
1679
}
1680
}
1681
1682
void C2_MacroAssembler::reduceI(int opcode, int vlen,
1683
Register dst, Register src1, XMMRegister src2,
1684
XMMRegister vtmp1, XMMRegister vtmp2) {
1685
switch (vlen) {
1686
case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1687
case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1688
case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1689
case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1690
1691
default: assert(false, "wrong vector length");
1692
}
1693
}
1694
1695
#ifdef _LP64
1696
void C2_MacroAssembler::reduceL(int opcode, int vlen,
1697
Register dst, Register src1, XMMRegister src2,
1698
XMMRegister vtmp1, XMMRegister vtmp2) {
1699
switch (vlen) {
1700
case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1701
case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1702
case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1703
1704
default: assert(false, "wrong vector length");
1705
}
1706
}
1707
#endif // _LP64
1708
1709
void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1710
switch (vlen) {
1711
case 2:
1712
assert(vtmp2 == xnoreg, "");
1713
reduce2F(opcode, dst, src, vtmp1);
1714
break;
1715
case 4:
1716
assert(vtmp2 == xnoreg, "");
1717
reduce4F(opcode, dst, src, vtmp1);
1718
break;
1719
case 8:
1720
reduce8F(opcode, dst, src, vtmp1, vtmp2);
1721
break;
1722
case 16:
1723
reduce16F(opcode, dst, src, vtmp1, vtmp2);
1724
break;
1725
default: assert(false, "wrong vector length");
1726
}
1727
}
1728
1729
void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1730
switch (vlen) {
1731
case 2:
1732
assert(vtmp2 == xnoreg, "");
1733
reduce2D(opcode, dst, src, vtmp1);
1734
break;
1735
case 4:
1736
reduce4D(opcode, dst, src, vtmp1, vtmp2);
1737
break;
1738
case 8:
1739
reduce8D(opcode, dst, src, vtmp1, vtmp2);
1740
break;
1741
default: assert(false, "wrong vector length");
1742
}
1743
}
1744
1745
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1746
if (opcode == Op_AddReductionVI) {
1747
if (vtmp1 != src2) {
1748
movdqu(vtmp1, src2);
1749
}
1750
phaddd(vtmp1, vtmp1);
1751
} else {
1752
pshufd(vtmp1, src2, 0x1);
1753
reduce_operation_128(T_INT, opcode, vtmp1, src2);
1754
}
1755
movdl(vtmp2, src1);
1756
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1757
movdl(dst, vtmp1);
1758
}
1759
1760
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1761
if (opcode == Op_AddReductionVI) {
1762
if (vtmp1 != src2) {
1763
movdqu(vtmp1, src2);
1764
}
1765
phaddd(vtmp1, src2);
1766
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1767
} else {
1768
pshufd(vtmp2, src2, 0xE);
1769
reduce_operation_128(T_INT, opcode, vtmp2, src2);
1770
reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1771
}
1772
}
1773
1774
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1775
if (opcode == Op_AddReductionVI) {
1776
vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1777
vextracti128_high(vtmp2, vtmp1);
1778
vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1779
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1780
} else {
1781
vextracti128_high(vtmp1, src2);
1782
reduce_operation_128(T_INT, opcode, vtmp1, src2);
1783
reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1784
}
1785
}
1786
1787
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1788
vextracti64x4_high(vtmp2, src2);
1789
reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1790
reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1791
}
1792
1793
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1794
pshufd(vtmp2, src2, 0x1);
1795
reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1796
movdqu(vtmp1, vtmp2);
1797
psrldq(vtmp1, 2);
1798
reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1799
movdqu(vtmp2, vtmp1);
1800
psrldq(vtmp2, 1);
1801
reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1802
movdl(vtmp2, src1);
1803
pmovsxbd(vtmp1, vtmp1);
1804
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1805
pextrb(dst, vtmp1, 0x0);
1806
movsbl(dst, dst);
1807
}
1808
1809
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1810
pshufd(vtmp1, src2, 0xE);
1811
reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1812
reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1813
}
1814
1815
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1816
vextracti128_high(vtmp2, src2);
1817
reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1818
reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1819
}
1820
1821
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1822
vextracti64x4_high(vtmp1, src2);
1823
reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1824
reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1825
}
1826
1827
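// There is no packed byte multiply, so the byte multiply-reductions sign-extend
// the bytes to shorts (pmovsxbw/vpmovsxbw) and reuse the short reduction; the
// low byte of each short product is the correct byte product.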
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1828
pmovsxbw(vtmp2, src2);
1829
reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1830
}
1831
1832
void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1833
if (UseAVX > 1) {
1834
int vector_len = Assembler::AVX_256bit;
1835
vpmovsxbw(vtmp1, src2, vector_len);
1836
reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1837
} else {
1838
pmovsxbw(vtmp2, src2);
1839
reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1840
pshufd(vtmp2, src2, 0x1);
1841
pmovsxbw(vtmp2, src2);
1842
reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1843
}
1844
}
1845
1846
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1847
if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1848
int vector_len = Assembler::AVX_512bit;
1849
vpmovsxbw(vtmp1, src2, vector_len);
1850
reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1851
} else {
1852
assert(UseAVX >= 2,"Should not reach here.");
1853
mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1854
vextracti128_high(vtmp2, src2);
1855
mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1856
}
1857
}
1858
1859
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1860
mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1861
vextracti64x4_high(vtmp2, src2);
1862
mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1863
}
1864
1865
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1866
if (opcode == Op_AddReductionVI) {
1867
if (vtmp1 != src2) {
1868
movdqu(vtmp1, src2);
1869
}
1870
phaddw(vtmp1, vtmp1);
1871
phaddw(vtmp1, vtmp1);
1872
} else {
1873
pshufd(vtmp2, src2, 0x1);
1874
reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1875
movdqu(vtmp1, vtmp2);
1876
psrldq(vtmp1, 2);
1877
reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1878
}
1879
movdl(vtmp2, src1);
1880
pmovsxwd(vtmp1, vtmp1);
1881
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1882
pextrw(dst, vtmp1, 0x0);
1883
movswl(dst, dst);
1884
}
1885
1886
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1887
if (opcode == Op_AddReductionVI) {
1888
if (vtmp1 != src2) {
1889
movdqu(vtmp1, src2);
1890
}
1891
phaddw(vtmp1, src2);
1892
} else {
1893
pshufd(vtmp1, src2, 0xE);
1894
reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1895
}
1896
reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1897
}
1898
1899
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1900
if (opcode == Op_AddReductionVI) {
1901
int vector_len = Assembler::AVX_256bit;
1902
vphaddw(vtmp2, src2, src2, vector_len);
1903
vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1904
} else {
1905
vextracti128_high(vtmp2, src2);
1906
reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1907
}
1908
reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1909
}
1910
1911
void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1912
int vector_len = Assembler::AVX_256bit;
1913
vextracti64x4_high(vtmp1, src2);
1914
reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1915
reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1916
}
1917
1918
#ifdef _LP64
1919
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1920
pshufd(vtmp2, src2, 0xE);
1921
reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1922
movdq(vtmp1, src1);
1923
reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1924
movdq(dst, vtmp1);
1925
}
1926
1927
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1928
vextracti128_high(vtmp1, src2);
1929
reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1930
reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1931
}
1932
1933
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1934
vextracti64x4_high(vtmp2, src2);
1935
reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1936
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1937
}
1938
1939
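// Builds a k-register mask with the low 'len' bits set: start from all ones and
// let BZHI clear every bit at position >= len (e.g. len == 5 yields 0b11111).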
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1940
assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1941
mov64(temp, -1L);
1942
bzhiq(temp, temp, len);
1943
kmovql(dst, temp);
1944
}
1945
#endif // _LP64
1946
1947
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1948
reduce_operation_128(T_FLOAT, opcode, dst, src);
1949
pshufd(vtmp, src, 0x1);
1950
reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1951
}
1952
1953
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1954
reduce2F(opcode, dst, src, vtmp);
1955
pshufd(vtmp, src, 0x2);
1956
reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1957
pshufd(vtmp, src, 0x3);
1958
reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1959
}
1960
1961
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1962
reduce4F(opcode, dst, src, vtmp2);
1963
vextractf128_high(vtmp2, src);
1964
reduce4F(opcode, dst, vtmp2, vtmp1);
1965
}
1966
1967
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1968
reduce8F(opcode, dst, src, vtmp1, vtmp2);
1969
vextracti64x4_high(vtmp1, src);
1970
reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1971
}
1972
1973
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1974
reduce_operation_128(T_DOUBLE, opcode, dst, src);
1975
pshufd(vtmp, src, 0xE);
1976
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1977
}
1978
1979
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1980
reduce2D(opcode, dst, src, vtmp2);
1981
vextractf128_high(vtmp2, src);
1982
reduce2D(opcode, dst, vtmp2, vtmp1);
1983
}
1984
1985
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1986
reduce4D(opcode, dst, src, vtmp1, vtmp2);
1987
vextracti64x4_high(vtmp1, src);
1988
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1989
}
1990
1991
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1992
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1993
}
1994
1995
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1996
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1997
}
1998
1999
2000
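// Float min/max reduction: log2(vlen) folding steps, each one bringing the upper
// half of the working vector down (vextracti64x4 / vextracti128 for the 512- and
// 256-bit halves, vpermilps within the last 128 bits) and combining the halves
// with vminmax_fp. If dst already holds a partial result (is_dst_valid), one
// extra vminmax_fp folds it in at the end.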
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2001
XMMRegister dst, XMMRegister src,
2002
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2003
XMMRegister xmm_0, XMMRegister xmm_1) {
2004
int permconst[] = {1, 14};
2005
XMMRegister wsrc = src;
2006
XMMRegister wdst = xmm_0;
2007
XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2008
2009
int vlen_enc = Assembler::AVX_128bit;
2010
if (vlen == 16) {
2011
vlen_enc = Assembler::AVX_256bit;
2012
}
2013
2014
for (int i = log2(vlen) - 1; i >=0; i--) {
2015
if (i == 0 && !is_dst_valid) {
2016
wdst = dst;
2017
}
2018
if (i == 3) {
2019
vextracti64x4_high(wtmp, wsrc);
2020
} else if (i == 2) {
2021
vextracti128_high(wtmp, wsrc);
2022
} else { // i = [0,1]
2023
vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2024
}
2025
vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2026
wsrc = wdst;
2027
vlen_enc = Assembler::AVX_128bit;
2028
}
2029
if (is_dst_valid) {
2030
vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2031
}
2032
}
2033
2034
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2035
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2036
XMMRegister xmm_0, XMMRegister xmm_1) {
2037
XMMRegister wsrc = src;
2038
XMMRegister wdst = xmm_0;
2039
XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2040
int vlen_enc = Assembler::AVX_128bit;
2041
if (vlen == 8) {
2042
vlen_enc = Assembler::AVX_256bit;
2043
}
2044
for (int i = log2(vlen) - 1; i >=0; i--) {
2045
if (i == 0 && !is_dst_valid) {
2046
wdst = dst;
2047
}
2048
if (i == 1) {
2049
vextracti128_high(wtmp, wsrc);
2050
} else if (i == 2) {
2051
vextracti64x4_high(wtmp, wsrc);
2052
} else {
2053
assert(i == 0, "%d", i);
2054
vpermilpd(wtmp, wsrc, 1, vlen_enc);
2055
}
2056
vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2057
wsrc = wdst;
2058
vlen_enc = Assembler::AVX_128bit;
2059
}
2060
if (is_dst_valid) {
2061
vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2062
}
2063
}
2064
2065
void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2066
switch (bt) {
2067
case T_BYTE: pextrb(dst, src, idx); break;
2068
case T_SHORT: pextrw(dst, src, idx); break;
2069
case T_INT: pextrd(dst, src, idx); break;
2070
case T_LONG: pextrq(dst, src, idx); break;
2071
2072
default:
2073
assert(false,"Should not reach here.");
2074
break;
2075
}
2076
}
2077
2078
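// Returns the 128-bit lane of src that contains element 'elemindex': lane 0 is
// returned in place, higher lanes are first extracted into dst.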
XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2079
int esize = type2aelembytes(typ);
2080
int elem_per_lane = 16/esize;
2081
int lane = elemindex / elem_per_lane;
2082
int eindex = elemindex % elem_per_lane;
2083
2084
if (lane >= 2) {
2085
assert(UseAVX > 2, "required");
2086
vextractf32x4(dst, src, lane & 3);
2087
return dst;
2088
} else if (lane > 0) {
2089
assert(UseAVX > 0, "required");
2090
vextractf128(dst, src, lane);
2091
return dst;
2092
} else {
2093
return src;
2094
}
2095
}
2096
2097
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2098
int esize = type2aelembytes(typ);
2099
int elem_per_lane = 16/esize;
2100
int eindex = elemindex % elem_per_lane;
2101
assert(is_integral_type(typ),"required");
2102
2103
if (eindex == 0) {
2104
if (typ == T_LONG) {
2105
movq(dst, src);
2106
} else {
2107
movdl(dst, src);
2108
if (typ == T_BYTE)
2109
movsbl(dst, dst);
2110
else if (typ == T_SHORT)
2111
movswl(dst, dst);
2112
}
2113
} else {
2114
extract(typ, dst, src, eindex);
2115
}
2116
}
2117
2118
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2119
int esize = type2aelembytes(typ);
2120
int elem_per_lane = 16/esize;
2121
int eindex = elemindex % elem_per_lane;
2122
assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2123
2124
if (eindex == 0) {
2125
movq(dst, src);
2126
} else {
2127
if (typ == T_FLOAT) {
2128
if (UseAVX == 0) {
2129
movdqu(dst, src);
2130
pshufps(dst, dst, eindex);
2131
} else {
2132
vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2133
}
2134
} else {
2135
if (UseAVX == 0) {
2136
movdqu(dst, src);
2137
psrldq(dst, eindex*esize);
2138
} else {
2139
vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2140
}
2141
movq(dst, dst);
2142
}
2143
}
2144
// Zero upper bits
2145
if (typ == T_FLOAT) {
2146
if (UseAVX == 0) {
2147
assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2148
movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2149
pand(dst, vtmp);
2150
} else {
2151
assert((tmp != noreg), "required.");
2152
vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2153
}
2154
}
2155
}
2156
2157
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2158
switch(typ) {
2159
case T_BYTE:
2160
case T_BOOLEAN:
2161
evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2162
break;
2163
case T_SHORT:
2164
case T_CHAR:
2165
evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2166
break;
2167
case T_INT:
2168
case T_FLOAT:
2169
evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2170
break;
2171
case T_LONG:
2172
case T_DOUBLE:
2173
evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2174
break;
2175
default:
2176
assert(false,"Should not reach here.");
2177
break;
2178
}
2179
}
2180
2181
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2182
switch(typ) {
2183
case T_BOOLEAN:
2184
case T_BYTE:
2185
evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2186
break;
2187
case T_CHAR:
2188
case T_SHORT:
2189
evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2190
break;
2191
case T_INT:
2192
case T_FLOAT:
2193
evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2194
break;
2195
case T_LONG:
2196
case T_DOUBLE:
2197
evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2198
break;
2199
default:
2200
assert(false,"Should not reach here.");
2201
break;
2202
}
2203
}
2204
2205
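// Unsigned element compare: the packed integer compares used here are signed, so
// both operands are zero-extended to the next wider element type, compared there
// (zero-extended values are non-negative, which makes the signed compare exact),
// and the resulting lane masks are packed back down to the original element size.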
void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2206
int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2207
int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2208
switch (typ) {
2209
case T_BYTE:
2210
vpmovzxbw(vtmp1, src1, vlen_enc);
2211
vpmovzxbw(vtmp2, src2, vlen_enc);
2212
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2213
vpacksswb(dst, dst, dst, vlen_enc);
2214
break;
2215
case T_SHORT:
2216
vpmovzxwd(vtmp1, src1, vlen_enc);
2217
vpmovzxwd(vtmp2, src2, vlen_enc);
2218
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2219
vpackssdw(dst, dst, dst, vlen_enc);
2220
break;
2221
case T_INT:
2222
vpmovzxdq(vtmp1, src1, vlen_enc);
2223
vpmovzxdq(vtmp2, src2, vlen_enc);
2224
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2225
vpermilps(dst, dst, 8, vlen_enc);
2226
break;
2227
default:
2228
assert(false, "Should not reach here");
2229
}
2230
if (vlen_in_bytes == 16) {
2231
vpermpd(dst, dst, 0x8, vlen_enc);
2232
}
2233
}
2234
2235
void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2236
XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2237
int vlen_enc = vector_length_encoding(vlen_in_bytes);
2238
switch (typ) {
2239
case T_BYTE:
2240
vpmovzxbw(vtmp1, src1, vlen_enc);
2241
vpmovzxbw(vtmp2, src2, vlen_enc);
2242
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2243
vextracti128(vtmp1, src1, 1);
2244
vextracti128(vtmp2, src2, 1);
2245
vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2246
vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2247
vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2248
vpacksswb(dst, dst, vtmp3, vlen_enc);
2249
vpermpd(dst, dst, 0xd8, vlen_enc);
2250
break;
2251
case T_SHORT:
2252
vpmovzxwd(vtmp1, src1, vlen_enc);
2253
vpmovzxwd(vtmp2, src2, vlen_enc);
2254
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2255
vextracti128(vtmp1, src1, 1);
2256
vextracti128(vtmp2, src2, 1);
2257
vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2258
vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2259
vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2260
vpackssdw(dst, dst, vtmp3, vlen_enc);
2261
vpermpd(dst, dst, 0xd8, vlen_enc);
2262
break;
2263
case T_INT:
2264
vpmovzxdq(vtmp1, src1, vlen_enc);
2265
vpmovzxdq(vtmp2, src2, vlen_enc);
2266
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2267
vpshufd(dst, dst, 8, vlen_enc);
2268
vpermq(dst, dst, 8, vlen_enc);
2269
vextracti128(vtmp1, src1, 1);
2270
vextracti128(vtmp2, src2, 1);
2271
vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2272
vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2273
vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2274
vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2275
vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2276
vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2277
break;
2278
default:
2279
assert(false, "Should not reach here");
2280
}
2281
}
2282
2283
void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2284
switch(typ) {
2285
case T_BYTE:
2286
evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2287
break;
2288
case T_SHORT:
2289
evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2290
break;
2291
case T_INT:
2292
case T_FLOAT:
2293
evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2294
break;
2295
case T_LONG:
2296
case T_DOUBLE:
2297
evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2298
break;
2299
default:
2300
assert(false,"Should not reach here.");
2301
break;
2302
}
2303
}
2304
2305
void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2306
XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2307
switch(vlen) {
2308
case 4:
2309
assert(vtmp1 != xnoreg, "required.");
2310
// Broadcast lower 32 bits to 128 bits before ptest
2311
pshufd(vtmp1, src1, 0x0);
2312
if (bt == BoolTest::overflow) {
2313
assert(vtmp2 != xnoreg, "required.");
2314
pshufd(vtmp2, src2, 0x0);
2315
} else {
2316
assert(vtmp2 == xnoreg, "required.");
2317
vtmp2 = src2;
2318
}
2319
ptest(vtmp1, vtmp2);
2320
break;
2321
case 8:
2322
assert(vtmp1 != xnoreg, "required.");
2323
// Broadcast lower 64 bits to 128 bits before ptest
2324
pshufd(vtmp1, src1, 0x4);
2325
if (bt == BoolTest::overflow) {
2326
assert(vtmp2 != xnoreg, "required.");
2327
pshufd(vtmp2, src2, 0x4);
2328
} else {
2329
assert(vtmp2 == xnoreg, "required.");
2330
vtmp2 = src2;
2331
}
2332
ptest(vtmp1, vtmp2);
2333
break;
2334
case 16:
2335
assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2336
ptest(src1, src2);
2337
break;
2338
case 32:
2339
assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2340
vptest(src1, src2, Assembler::AVX_256bit);
2341
break;
2342
case 64:
2343
{
2344
assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2345
evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2346
if (bt == BoolTest::ne) {
2347
ktestql(mask, mask);
2348
} else {
2349
assert(bt == BoolTest::overflow, "required");
2350
kortestql(mask, mask);
2351
}
2352
}
2353
break;
2354
default:
2355
assert(false,"Should not reach here.");
2356
break;
2357
}
2358
}
2359
2360
//-------------------------------------------------------------------------------------------
2361
2362
// IndexOf for constant substrings with size >= 8 chars
2363
// which don't need to be loaded through stack.
2364
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2365
Register cnt1, Register cnt2,
2366
int int_cnt2, Register result,
2367
XMMRegister vec, Register tmp,
2368
int ae) {
2369
ShortBranchVerifier sbv(this);
2370
assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2371
assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2372
2373
// This method uses the pcmpestri instruction with bound registers
2374
// inputs:
2375
// xmm - substring
2376
// rax - substring length (elements count)
2377
// mem - scanned string
2378
// rdx - string length (elements count)
2379
// 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2380
// 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2381
// outputs:
2382
// rcx - matched index in string
2383
assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2384
int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2385
int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2386
Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2387
Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2388
2389
Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2390
RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2391
MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2392
2393
// Note, inline_string_indexOf() generates checks:
2394
// if (substr.count > string.count) return -1;
2395
// if (substr.count == 0) return 0;
2396
assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2397
2398
// Load substring.
2399
if (ae == StrIntrinsicNode::UL) {
2400
pmovzxbw(vec, Address(str2, 0));
2401
} else {
2402
movdqu(vec, Address(str2, 0));
2403
}
2404
movl(cnt2, int_cnt2);
2405
movptr(result, str1); // string addr
2406
2407
if (int_cnt2 > stride) {
2408
jmpb(SCAN_TO_SUBSTR);
2409
2410
// Reload substr for rescan, this code
2411
// is executed only for large substrings (> 8 chars)
2412
bind(RELOAD_SUBSTR);
2413
if (ae == StrIntrinsicNode::UL) {
2414
pmovzxbw(vec, Address(str2, 0));
2415
} else {
2416
movdqu(vec, Address(str2, 0));
2417
}
2418
negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2419
2420
bind(RELOAD_STR);
2421
// We came here after the beginning of the substring was
2422
// matched but the rest of it was not, so we need to search
2423
// again. Start from the next element after the previous match.
2424
2425
// cnt2 is number of substring remaining elements and
2426
// cnt1 is number of string remaining elements when cmp failed.
2427
// Restored cnt1 = cnt1 - cnt2 + int_cnt2
2428
subl(cnt1, cnt2);
2429
addl(cnt1, int_cnt2);
2430
movl(cnt2, int_cnt2); // Now restore cnt2
2431
2432
decrementl(cnt1); // Shift to next element
2433
cmpl(cnt1, cnt2);
2434
jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2435
2436
addptr(result, (1<<scale1));
2437
2438
} // (int_cnt2 > 8)
2439
2440
// Scan string for start of substr in 16-byte vectors
2441
bind(SCAN_TO_SUBSTR);
2442
pcmpestri(vec, Address(result, 0), mode);
2443
jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2444
subl(cnt1, stride);
2445
jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2446
cmpl(cnt1, cnt2);
2447
jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2448
addptr(result, 16);
2449
jmpb(SCAN_TO_SUBSTR);
2450
2451
// Found a potential substr
2452
bind(FOUND_CANDIDATE);
2453
// Matched whole vector if first element matched (tmp(rcx) == 0).
2454
if (int_cnt2 == stride) {
2455
jccb(Assembler::overflow, RET_FOUND); // OF == 1
2456
} else { // int_cnt2 > 8
2457
jccb(Assembler::overflow, FOUND_SUBSTR);
2458
}
2459
// After pcmpestri tmp(rcx) contains matched element index
2460
// Compute start addr of substr
2461
lea(result, Address(result, tmp, scale1));
2462
2463
// Make sure string is still long enough
2464
subl(cnt1, tmp);
2465
cmpl(cnt1, cnt2);
2466
if (int_cnt2 == stride) {
2467
jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2468
} else { // int_cnt2 > 8
2469
jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2470
}
2471
// Left less than substring.
2472
2473
bind(RET_NOT_FOUND);
2474
movl(result, -1);
2475
jmp(EXIT);
2476
2477
if (int_cnt2 > stride) {
2478
// This code is optimized for the case when whole substring
2479
// is matched if its head is matched.
2480
bind(MATCH_SUBSTR_HEAD);
2481
pcmpestri(vec, Address(result, 0), mode);
2482
// Reload only string if it does not match
2483
jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2484
2485
Label CONT_SCAN_SUBSTR;
2486
// Compare the rest of substring (> 8 chars).
2487
bind(FOUND_SUBSTR);
2488
// First 8 chars are already matched.
2489
negptr(cnt2);
2490
addptr(cnt2, stride);
2491
2492
bind(SCAN_SUBSTR);
2493
subl(cnt1, stride);
2494
cmpl(cnt2, -stride); // Do not read beyond substring
2495
jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2496
// Back-up strings to avoid reading beyond substring:
2497
// cnt1 = cnt1 - cnt2 + 8
2498
addl(cnt1, cnt2); // cnt2 is negative
2499
addl(cnt1, stride);
2500
movl(cnt2, stride); negptr(cnt2);
2501
bind(CONT_SCAN_SUBSTR);
2502
if (int_cnt2 < (int)G) {
2503
int tail_off1 = int_cnt2<<scale1;
2504
int tail_off2 = int_cnt2<<scale2;
2505
if (ae == StrIntrinsicNode::UL) {
2506
pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2507
} else {
2508
movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2509
}
2510
pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2511
} else {
2512
// calculate index in register to avoid integer overflow (int_cnt2*2)
2513
movl(tmp, int_cnt2);
2514
addptr(tmp, cnt2);
2515
if (ae == StrIntrinsicNode::UL) {
2516
pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2517
} else {
2518
movdqu(vec, Address(str2, tmp, scale2, 0));
2519
}
2520
pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2521
}
2522
// Need to reload strings pointers if not matched whole vector
2523
jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2524
addptr(cnt2, stride);
2525
jcc(Assembler::negative, SCAN_SUBSTR);
2526
// Fall through if found full substring
2527
2528
} // (int_cnt2 > 8)
2529
2530
bind(RET_FOUND);
2531
// Found result if we matched full small substring.
2532
// Compute substr offset
2533
subptr(result, str1);
2534
if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2535
shrl(result, 1); // index
2536
}
2537
bind(EXIT);
2538
2539
} // string_indexofC8
2540
2541
// Small strings are loaded through stack if they cross page boundary.
2542
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2543
Register cnt1, Register cnt2,
2544
int int_cnt2, Register result,
2545
XMMRegister vec, Register tmp,
2546
int ae) {
2547
ShortBranchVerifier sbv(this);
2548
assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2549
assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2550
2551
//
2552
// int_cnt2 is length of small (< 8 chars) constant substring
2553
// or (-1) for non constant substring in which case its length
2554
// is in cnt2 register.
2555
//
2556
// Note, inline_string_indexOf() generates checks:
2557
// if (substr.count > string.count) return -1;
2558
// if (substr.count == 0) return 0;
2559
//
2560
int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2561
assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2562
// This method uses the pcmpestri instruction with bound registers
2563
// inputs:
2564
// xmm - substring
2565
// rax - substring length (elements count)
2566
// mem - scanned string
2567
// rdx - string length (elements count)
2568
// 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2569
// 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2570
// outputs:
2571
// rcx - matched index in string
2572
assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2573
int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2574
Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2575
Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2576
2577
Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2578
RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2579
FOUND_CANDIDATE;
2580
2581
{ //========================================================
2582
// We don't know where these strings are located
2583
// and we can't read beyond them. Load them through stack.
2584
Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2585
2586
movptr(tmp, rsp); // save old SP
2587
2588
if (int_cnt2 > 0) { // small (< 8 chars) constant substring
2589
if (int_cnt2 == (1>>scale2)) { // One byte
2590
assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2591
load_unsigned_byte(result, Address(str2, 0));
2592
movdl(vec, result); // move 32 bits
2593
} else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
2594
// Not enough header space in 32-bit VM: 12+3 = 15.
2595
movl(result, Address(str2, -1));
2596
shrl(result, 8);
2597
movdl(vec, result); // move 32 bits
2598
} else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
2599
load_unsigned_short(result, Address(str2, 0));
2600
movdl(vec, result); // move 32 bits
2601
} else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2602
movdl(vec, Address(str2, 0)); // move 32 bits
2603
} else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2604
movq(vec, Address(str2, 0)); // move 64 bits
2605
} else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2606
// Array header size is 12 bytes in 32-bit VM
2607
// + 6 bytes for 3 chars == 18 bytes,
2608
// enough space to load vec and shift.
2609
assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2610
if (ae == StrIntrinsicNode::UL) {
2611
int tail_off = int_cnt2-8;
2612
pmovzxbw(vec, Address(str2, tail_off));
2613
psrldq(vec, -2*tail_off);
2614
}
2615
else {
2616
int tail_off = int_cnt2*(1<<scale2);
2617
movdqu(vec, Address(str2, tail_off-16));
2618
psrldq(vec, 16-tail_off);
2619
}
2620
}
2621
} else { // not constant substring
2622
cmpl(cnt2, stride);
2623
jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2624
2625
// We can read beyond string if str+16 does not cross page boundary
2626
// since heaps are aligned and mapped by pages.
2627
assert(os::vm_page_size() < (int)G, "default page should be small");
2628
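// (str2 & (page_size-1)) <= page_size-16 means the 16-byte substring load cannot
// touch the next page, so the over-read is safe and the stack copy can be skipped.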
movl(result, str2); // We need only low 32 bits
2629
andl(result, (os::vm_page_size()-1));
2630
cmpl(result, (os::vm_page_size()-16));
2631
jccb(Assembler::belowEqual, CHECK_STR);
2632
2633
// Move small strings to stack to allow load 16 bytes into vec.
2634
subptr(rsp, 16);
2635
int stk_offset = wordSize-(1<<scale2);
2636
push(cnt2);
2637
2638
bind(COPY_SUBSTR);
2639
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2640
load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2641
movb(Address(rsp, cnt2, scale2, stk_offset), result);
2642
} else if (ae == StrIntrinsicNode::UU) {
2643
load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2644
movw(Address(rsp, cnt2, scale2, stk_offset), result);
2645
}
2646
decrement(cnt2);
2647
jccb(Assembler::notZero, COPY_SUBSTR);
2648
2649
pop(cnt2);
2650
movptr(str2, rsp); // New substring address
2651
} // non constant
2652
2653
bind(CHECK_STR);
2654
cmpl(cnt1, stride);
2655
jccb(Assembler::aboveEqual, BIG_STRINGS);
2656
2657
// Check cross page boundary.
2658
movl(result, str1); // We need only low 32 bits
2659
andl(result, (os::vm_page_size()-1));
2660
cmpl(result, (os::vm_page_size()-16));
2661
jccb(Assembler::belowEqual, BIG_STRINGS);
2662
2663
subptr(rsp, 16);
2664
int stk_offset = -(1<<scale1);
2665
if (int_cnt2 < 0) { // not constant
2666
push(cnt2);
2667
stk_offset += wordSize;
2668
}
2669
movl(cnt2, cnt1);
2670
2671
bind(COPY_STR);
2672
if (ae == StrIntrinsicNode::LL) {
2673
load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2674
movb(Address(rsp, cnt2, scale1, stk_offset), result);
2675
} else {
2676
load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2677
movw(Address(rsp, cnt2, scale1, stk_offset), result);
2678
}
2679
decrement(cnt2);
2680
jccb(Assembler::notZero, COPY_STR);
2681
2682
if (int_cnt2 < 0) { // not constant
2683
pop(cnt2);
2684
}
2685
movptr(str1, rsp); // New string address
2686
2687
bind(BIG_STRINGS);
2688
// Load substring.
2689
if (int_cnt2 < 0) { // -1
2690
if (ae == StrIntrinsicNode::UL) {
2691
pmovzxbw(vec, Address(str2, 0));
2692
} else {
2693
movdqu(vec, Address(str2, 0));
2694
}
2695
push(cnt2); // substr count
2696
push(str2); // substr addr
2697
push(str1); // string addr
2698
} else {
2699
// Small (< 8 chars) constant substrings are loaded already.
2700
movl(cnt2, int_cnt2);
2701
}
2702
push(tmp); // original SP
2703
2704
} // Finished loading
2705
2706
//========================================================
2707
// Start search
2708
//
2709
2710
movptr(result, str1); // string addr
2711
2712
if (int_cnt2 < 0) { // Only for non constant substring
2713
jmpb(SCAN_TO_SUBSTR);
2714
2715
// SP saved at sp+0
2716
// String saved at sp+1*wordSize
2717
// Substr saved at sp+2*wordSize
2718
// Substr count saved at sp+3*wordSize
2719
2720
// Reload substr for rescan, this code
2721
// is executed only for large substrings (> 8 chars)
2722
bind(RELOAD_SUBSTR);
2723
movptr(str2, Address(rsp, 2*wordSize));
2724
movl(cnt2, Address(rsp, 3*wordSize));
2725
if (ae == StrIntrinsicNode::UL) {
2726
pmovzxbw(vec, Address(str2, 0));
2727
} else {
2728
movdqu(vec, Address(str2, 0));
2729
}
2730
// We came here after the beginning of the substring was
2731
// matched but the rest of it was not, so we need to search
2732
// again. Start from the next element after the previous match.
2733
subptr(str1, result); // Restore counter
2734
if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2735
shrl(str1, 1);
2736
}
2737
addl(cnt1, str1);
2738
decrementl(cnt1); // Shift to next element
2739
cmpl(cnt1, cnt2);
2740
jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2741
2742
addptr(result, (1<<scale1));
2743
} // non constant
2744
2745
// Scan string for start of substr in 16-byte vectors
2746
bind(SCAN_TO_SUBSTR);
2747
assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2748
pcmpestri(vec, Address(result, 0), mode);
2749
jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2750
subl(cnt1, stride);
2751
jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2752
cmpl(cnt1, cnt2);
2753
jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2754
addptr(result, 16);
2755
2756
bind(ADJUST_STR);
2757
cmpl(cnt1, stride); // Do not read beyond string
2758
jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2759
// Back-up string to avoid reading beyond string.
2760
lea(result, Address(result, cnt1, scale1, -16));
2761
movl(cnt1, stride);
2762
jmpb(SCAN_TO_SUBSTR);
2763
2764
// Found a potential substr
2765
bind(FOUND_CANDIDATE);
2766
// After pcmpestri tmp(rcx) contains matched element index
2767
2768
// Make sure string is still long enough
2769
subl(cnt1, tmp);
2770
cmpl(cnt1, cnt2);
2771
jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2772
// Left less than substring.
2773
2774
bind(RET_NOT_FOUND);
2775
movl(result, -1);
2776
jmp(CLEANUP);
2777
2778
bind(FOUND_SUBSTR);
2779
// Compute start addr of substr
2780
lea(result, Address(result, tmp, scale1));
2781
if (int_cnt2 > 0) { // Constant substring
2782
// Repeat search for small substring (< 8 chars)
2783
// from new point without reloading substring.
2784
// Have to check that we don't read beyond string.
2785
cmpl(tmp, stride-int_cnt2);
2786
jccb(Assembler::greater, ADJUST_STR);
2787
// Fall through if matched whole substring.
2788
} else { // non constant
2789
assert(int_cnt2 == -1, "should be != 0");
2790
2791
addl(tmp, cnt2);
2792
// Found result if we matched whole substring.
2793
cmpl(tmp, stride);
2794
jcc(Assembler::lessEqual, RET_FOUND);
2795
2796
// Repeat search for small substring (<= 8 chars)
2797
// from new point 'str1' without reloading substring.
2798
cmpl(cnt2, stride);
2799
// Have to check that we don't read beyond string.
2800
jccb(Assembler::lessEqual, ADJUST_STR);
2801
2802
Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2803
// Compare the rest of substring (> 8 chars).
2804
movptr(str1, result);
2805
2806
cmpl(tmp, cnt2);
2807
// First 8 chars are already matched.
2808
jccb(Assembler::equal, CHECK_NEXT);
2809
2810
bind(SCAN_SUBSTR);
2811
pcmpestri(vec, Address(str1, 0), mode);
2812
// Need to reload strings pointers if not matched whole vector
2813
jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2814
2815
bind(CHECK_NEXT);
2816
subl(cnt2, stride);
2817
jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2818
addptr(str1, 16);
2819
if (ae == StrIntrinsicNode::UL) {
2820
addptr(str2, 8);
2821
} else {
2822
addptr(str2, 16);
2823
}
2824
subl(cnt1, stride);
2825
cmpl(cnt2, stride); // Do not read beyond substring
2826
jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2827
// Back-up strings to avoid reading beyond substring.
2828
2829
if (ae == StrIntrinsicNode::UL) {
2830
lea(str2, Address(str2, cnt2, scale2, -8));
2831
lea(str1, Address(str1, cnt2, scale1, -16));
2832
} else {
2833
lea(str2, Address(str2, cnt2, scale2, -16));
2834
lea(str1, Address(str1, cnt2, scale1, -16));
2835
}
2836
subl(cnt1, cnt2);
2837
movl(cnt2, stride);
2838
addl(cnt1, stride);
2839
bind(CONT_SCAN_SUBSTR);
2840
if (ae == StrIntrinsicNode::UL) {
2841
pmovzxbw(vec, Address(str2, 0));
2842
} else {
2843
movdqu(vec, Address(str2, 0));
2844
}
2845
jmp(SCAN_SUBSTR);
2846
2847
bind(RET_FOUND_LONG);
2848
movptr(str1, Address(rsp, wordSize));
2849
} // non constant
2850
2851
bind(RET_FOUND);
2852
// Compute substr offset
2853
subptr(result, str1);
2854
if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2855
shrl(result, 1); // index
2856
}
2857
bind(CLEANUP);
2858
pop(rsp); // restore SP
2859
2860
} // string_indexof
2861
2862
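// Finds the first occurrence of a 16-bit char: the char is broadcast to every lane
// and compared 16 chars (AVX2) or 8 chars (SSE) at a time; ptest/vptest of the
// compare result against an all-zero register leaves CF set only when no lane
// matched, and pmovmskb + bsf turn the first matching lane into a byte offset.
// Any tail that does not fill a vector is scanned one char at a time.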
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2863
XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2864
ShortBranchVerifier sbv(this);
2865
assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2866
2867
int stride = 8;
2868
2869
Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2870
SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2871
RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2872
FOUND_SEQ_CHAR, DONE_LABEL;
2873
2874
movptr(result, str1);
2875
if (UseAVX >= 2) {
2876
cmpl(cnt1, stride);
2877
jcc(Assembler::less, SCAN_TO_CHAR);
2878
cmpl(cnt1, 2*stride);
2879
jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2880
movdl(vec1, ch);
2881
vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2882
vpxor(vec2, vec2);
2883
movl(tmp, cnt1);
2884
andl(tmp, 0xFFFFFFF0); //vector count (in chars)
2885
andl(cnt1,0x0000000F); //tail count (in chars)
2886
2887
bind(SCAN_TO_16_CHAR_LOOP);
2888
vmovdqu(vec3, Address(result, 0));
2889
vpcmpeqw(vec3, vec3, vec1, 1);
2890
vptest(vec2, vec3);
2891
jcc(Assembler::carryClear, FOUND_CHAR);
2892
addptr(result, 32);
2893
subl(tmp, 2*stride);
2894
jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2895
jmp(SCAN_TO_8_CHAR);
2896
bind(SCAN_TO_8_CHAR_INIT);
2897
movdl(vec1, ch);
2898
pshuflw(vec1, vec1, 0x00);
2899
pshufd(vec1, vec1, 0);
2900
pxor(vec2, vec2);
2901
}
2902
bind(SCAN_TO_8_CHAR);
2903
cmpl(cnt1, stride);
2904
jcc(Assembler::less, SCAN_TO_CHAR);
2905
if (UseAVX < 2) {
2906
movdl(vec1, ch);
2907
pshuflw(vec1, vec1, 0x00);
2908
pshufd(vec1, vec1, 0);
2909
pxor(vec2, vec2);
2910
}
2911
movl(tmp, cnt1);
2912
andl(tmp, 0xFFFFFFF8); //vector count (in chars)
2913
andl(cnt1,0x00000007); //tail count (in chars)
2914
2915
bind(SCAN_TO_8_CHAR_LOOP);
2916
movdqu(vec3, Address(result, 0));
2917
pcmpeqw(vec3, vec1);
2918
ptest(vec2, vec3);
2919
jcc(Assembler::carryClear, FOUND_CHAR);
2920
addptr(result, 16);
2921
subl(tmp, stride);
2922
jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2923
bind(SCAN_TO_CHAR);
2924
testl(cnt1, cnt1);
2925
jcc(Assembler::zero, RET_NOT_FOUND);
2926
bind(SCAN_TO_CHAR_LOOP);
2927
load_unsigned_short(tmp, Address(result, 0));
2928
cmpl(ch, tmp);
2929
jccb(Assembler::equal, FOUND_SEQ_CHAR);
2930
addptr(result, 2);
2931
subl(cnt1, 1);
2932
jccb(Assembler::zero, RET_NOT_FOUND);
2933
jmp(SCAN_TO_CHAR_LOOP);
2934
2935
bind(RET_NOT_FOUND);
2936
movl(result, -1);
2937
jmpb(DONE_LABEL);
2938
2939
bind(FOUND_CHAR);
2940
if (UseAVX >= 2) {
2941
vpmovmskb(tmp, vec3);
2942
} else {
2943
pmovmskb(tmp, vec3);
2944
}
2945
bsfl(ch, tmp);
2946
addptr(result, ch);
2947
2948
bind(FOUND_SEQ_CHAR);
2949
subptr(result, str1);
2950
shrl(result, 1);
2951
2952
bind(DONE_LABEL);
2953
} // string_indexof_char
2954
2955
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  // vector count (in chars)
    andl(cnt1, 0x0000001F); // tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  // vector count (in bytes)
  andl(cnt1, 0x0000000F); // tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char

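// Note on the 'ae' argument used below: it is a StrIntrinsicNode encoding pair
// for the two inputs - LL means both are Latin-1 (one byte per element), UU
// means both are UTF-16 (two bytes per element), and LU/UL are the mixed cases.
// The mixed paths always read str1 as bytes and str2 as chars; string_compare
// compensates for the UL ordering by negating the final result.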
// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

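// The comparison below follows String.compareTo-style semantics. Illustrative
// scalar sketch (elements widened to int; not the library source):
//   int min = Math.min(cnt1, cnt2);
//   for (int i = 0; i < min; i++) {
//     int diff = str1[i] - str2[i];
//     if (diff != 0) return diff;
//   }
//   return cnt1 - cnt2;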
// Compare strings, used for char[] and byte[].
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));   // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and set up scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3

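    // pcmpmask is the PCMPESTRI imm8 control (Intel SDM encoding): bits 1:0
    // select the element format (01 = unsigned words; cleared to 00 = unsigned
    // bytes for LL below), bits 3:2 = 10 ("equal each", element-wise compare),
    // and bits 5:4 = 01 (negative polarity, so rcx reports the first mismatch
    // rather than the first match).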
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Set up to compare 16-char (32-byte) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  // stride << scale1;
      adr_stride2 = 16; // stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path: compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Set up the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

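    // On 64-bit targets with AVX-512 BW/VL available and AVX3Threshold == 0,
    // the block below compares 64 bytes per iteration using an opmask register;
    // any mismatch branches to COMPARE_WIDE_VECTORS_LOOP_FAILED to locate the
    // differing element.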
#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1); // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0 bits
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0 bits
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2);     // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    } // if (VM_Version::supports_avx512vlbw())
#endif // _LP64

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1     - substring
    //     rax      - negative string length (elements count)
    //     mem      - scanned string
    //     rdx      - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //                + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx      - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1 (rcx) contains the mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2); // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  } // if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}

// Search for a non-ASCII character (negative byte value) in a byte array,
// return true if it has any and false otherwise.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
// @IntrinsicCandidate
// private static boolean hasNegatives(byte[] ba, int off, int len) {
//   for (int i = off; i < off + len; i++) {
//     if (ba[i] < 0) {
//       return true;
//     }
//   }
//   return false;
// }
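// Strategy: with AVX-512 (AVX3Threshold == 0) test 64 bytes per iteration via an
// opmask compare against zero; otherwise test sign bits 32 bytes at a time with
// AVX2 or 16 bytes with SSE4.2 using the 0x80808080 mask, then finish with
// 4-byte, 2-byte and 1-byte tails.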
void C2_MacroAssembler::has_negatives(Register ary1, Register len,
                                      Register result, Register tmp1,
                                      XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  // len == 0
  testl(len, len);
  jcc(Assembler::zero, FALSE_LABEL);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail;
    Register tmp3_aliased = len;

    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
    andl(len, ~(64 - 1)); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, TRUE_LABEL);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, FALSE_LABEL);

    // ~(~0 << len) applied up to two times (for 32-bit scenario)
#ifdef _LP64
    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(mask2, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general-purpose register, thus we move the
    // data required to compose 64 1's into the instruction stream.
    // We emit a 64-byte-wide series of elements from 0..63 which is later used
    // as compare targets with the tail count contained in the tmp1 register.
    // The result is a k register holding tmp1 consecutive 1 bits, counting
    // from the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f); // tail count (in bytes)
      andl(len, 0xffffffe0);    // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f); // tail count (in bytes)
      andl(len, 0xfffffff0);    // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1); // tail byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
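
// arrays_equals is used both for Arrays.equals-style whole-array comparison
// (is_array_equ == true: reference check, null checks, length check, then an
// element compare) and for comparing 'limit' elements of two raw substrings.
// 'result' is set to 1 when the ranges are equal and 0 otherwise; is_char
// selects char[] vs byte[] element size for the whole-array case.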
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1); // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); // tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
      addptr(limit, 64); // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64); // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    } // if (VM_Version::supports_avx512vlbw())
#endif // _LP64
    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f); // tail count (in bytes)
    andl(limit, 0xfffffff0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1); // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

#ifdef _LP64
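// Reduce a vector of byte-sized boolean lanes (0 or 1) to a scalar result.
// Negating the lanes (0 - 1 == 0xFF) moves the truth value into each byte's
// sign bit, which evpmovb2m (below) or vpmovmskb (AVX variant) collects into a
// scalar bit mask; popcnt/bsr/bsf then yield the true count, the index of the
// last set lane, or the index of the first set lane (masklen when none is set
// for FirstTrue, -1 for LastTrue).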
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, KRegister ktmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx512vlbw(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  evpmovb2m(ktmp, xtmp, vec_enc);
  kmovql(tmp, ktmp);
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

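// AVX/AVX2 variant of the reduction above for targets without AVX-512 BW/VL:
// vpmovmskb gathers the lane sign bits into a general-purpose register and the
// value is masked down to 'masklen' valid lanes before the same popcnt/bsr/bsf
// selection.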
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  vpmovmskb(tmp, xtmp, vec_enc);
  if (masklen < 64) {
    andq(tmp, (((jlong)1 << masklen) - 1));
  }
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
#endif
