Path: src/hotspot/cpu/s390/copy_s390.hpp
/*
 * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2020 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// Major contributions by LS

#ifndef CPU_S390_COPY_S390_HPP
#define CPU_S390_COPY_S390_HPP

// Inline functions for memory copy and fill.

// HeapWordSize (the size of class HeapWord) is 8 Bytes (the size of a
// pointer variable), since we always run the _LP64 model. As a consequence,
// HeapWord* memory ranges are always assumed to be doubleword-aligned,
// having a size which is an integer multiple of HeapWordSize.
//
// Dealing only with doubleword-aligned doubleword units has important
// positive performance and data access consequences. Many of the move
// instructions perform particularly well under these circumstances.
// Data access is "doubleword-concurrent", except for MVC and XC.
// Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
// by use of the special padding byte 0xb1, where required. For copying,
// we use padding byte 0xb0 to prevent the D-cache from being polluted.
//
// On z/Architecture, gcc optimizes memcpy into a series of MVC instructions.
// This is optimal, even if just one HeapWord is copied. However, MVC
// copying is not atomic, i.e. not "doubleword concurrent" by definition.
//
// If the -mmvcle compiler option is specified, memcpy translates into
// code such that the entire memory range is copied or preset with just
// one MVCLE instruction.
//
// *to = *from is transformed into a MVC instruction already with -O1.
// Thus, for atomic copy operations, (inline) assembler code is required
// to guarantee atomic data accesses.
//
// For large (len >= MVCLEThreshold) chunks of memory, we exploit
// special H/W support of z/Architecture:
// 1) copy short piece of memory to page-align address(es)
// 2) copy largest part (all contained full pages) of memory using mvcle instruction.
//    z/Architecture processors have special H/W support for page-aligned storage
//    where len is an int multiple of page size. In that case, up to 4 cache lines are
//    processed in parallel and L1 cache is not polluted.
// 3) copy the remaining piece of memory.
//
// Measurement classifications:
// very rare  - <=     10.000 calls AND <=     1.000 usec elapsed
// rare       - <=    100.000 calls AND <=    10.000 usec elapsed
// some       - <=  1.000.000 calls AND <=   100.000 usec elapsed
// freq       - <= 10.000.000 calls AND <= 1.000.000 usec elapsed
// very freq  - >  10.000.000 calls OR  >  1.000.000 usec elapsed

#undef USE_INLINE_ASM

static void copy_conjoint_jshorts_atomic(const jshort* from, jshort* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static void copy_conjoint_jints_atomic(const jint* from, jint* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static bool has_destructive_overlap(const char* from, char* to, size_t byte_count) {
  return (from < to) && ((to - from) < (ptrdiff_t)byte_count);
}

#ifdef USE_INLINE_ASM

//--------------------------------------------------------------
// Atomic copying. Atomicity is given by the minimum of source
// and target alignment. Refer to mail comm with Tim Slegel/IBM.
// Only usable for disjoint source and target.
//--------------------------------------------------------------
#define MOVE8_ATOMIC_4(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  asm(                                                                        \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */  \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */  \
    "MVC     0(32,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */  \
    : [to]       "+Q" (_to)       /* outputs   */                             \
    , [from]     "+Q" (_from)                                                 \
    , [toaddr]   "=a" (toaddr)                                                \
    , [fromaddr] "=a" (fromaddr)                                              \
    :                                                                         \
    : "cc"                        /* clobbered */                             \
  );                                                                          \
}
#define MOVE8_ATOMIC_3(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  asm(                                                                        \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */  \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */  \
    "MVC     0(24,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */  \
    : [to]       "+Q" (_to)       /* outputs   */                             \
    , [from]     "+Q" (_from)                                                 \
    , [toaddr]   "=a" (toaddr)                                                \
    , [fromaddr] "=a" (fromaddr)                                              \
    :                                                                         \
    : "cc"                        /* clobbered */                             \
  );                                                                          \
}
#define MOVE8_ATOMIC_2(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  asm(                                                                        \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */  \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */  \
    "MVC     0(16,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */  \
    : [to]       "+Q" (_to)       /* outputs   */                             \
    , [from]     "+Q" (_from)                                                 \
    , [toaddr]   "=a" (toaddr)                                                \
    , [fromaddr] "=a" (fromaddr)                                              \
    :                                                                         \
    : "cc"                        /* clobbered */                             \
  );                                                                          \
}
#define MOVE8_ATOMIC_1(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  asm(                                                                        \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */  \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */  \
    "MVC     0(8,%[toaddr]),0(%[fromaddr])  \n\t" /* move data            */  \
    : [to]       "+Q" (_to)       /* outputs   */                             \
    , [from]     "+Q" (_from)                                                 \
    , [toaddr]   "=a" (toaddr)                                                \
    , [fromaddr] "=a" (fromaddr)                                              \
    :                                                                         \
    : "cc"                        /* clobbered */                             \
  );                                                                          \
}
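
// Illustration only (hypothetical helper, kept inactive in this file's usual
// #if 0 style): how the MOVE8_ATOMIC_n macros above are intended to be
// dispatched for tiny doubleword copies. This mirrors their use in
// pd_aligned_disjoint_words further down.
#if 0
static void move8_atomic_sketch(HeapWord* to, const HeapWord* from, size_t count) {
  switch (count) {
    case 1: MOVE8_ATOMIC_1(to,from) break;   // one MVC of  8 bytes
    case 2: MOVE8_ATOMIC_2(to,from) break;   // one MVC of 16 bytes
    case 3: MOVE8_ATOMIC_3(to,from) break;   // one MVC of 24 bytes
    case 4: MOVE8_ATOMIC_4(to,from) break;   // one MVC of 32 bytes
    default: break;  // larger counts are handled by MVC_MULTI/MVCLE below
  }
}
#endif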
//--------------------------------------------------------------
// Atomic copying of 8-byte entities.
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 8-byte aligned.
//--------------------------------------------------------------
#define COPY8_ATOMIC_4(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      3,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LMG     0,3,0(3)            \n\t" /* load data            */             \
    "STMG    0,3,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */                            \
  );                                                                          \
}
#define COPY8_ATOMIC_3(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      2,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LMG     0,2,0(2)            \n\t" /* load data            */             \
    "STMG    0,2,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1", "r2"       /* clobbered */                            \
  );                                                                          \
}
#define COPY8_ATOMIC_2(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      1,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LMG     0,1,0(1)            \n\t" /* load data            */             \
    "STMG    0,1,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1"             /* clobbered */                            \
  );                                                                          \
}
#define COPY8_ATOMIC_1(_to,_from) {                                           \
  unsigned long addr;                                                         \
  asm(                                                                        \
    "LG      %[addr],%[from]     \n\t" /* address of from area */             \
    "LG      0,0(0,%[addr])      \n\t" /* load data            */             \
    "LG      %[addr],%[to]       \n\t" /* address of to area   */             \
    "STG     0,0(0,%[addr])      \n\t" /* store data           */             \
    : [to]   "+Q" (_to)          /* outputs   */                              \
    , [from] "+Q" (_from)                                                     \
    , [addr] "=a" (addr)                                                      \
    :                                                                         \
    : "cc", "r0"                   /* clobbered */                            \
  );                                                                          \
}
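
// Illustration only (hypothetical helper, inactive): a C model of why the
// conjoint/disjoint property does not matter for COPY8_ATOMIC_n. All n
// doublewords are loaded into registers (LMG) before any of them is stored
// (STMG), so even destructively overlapping areas are copied correctly.
// Assumes 1 <= n <= 4, as with the macros.
#if 0
static void copy8_atomic_model(HeapWord* to, const HeapWord* from, size_t n) {
  HeapWord tmp[4];                                      // models registers r0..r3
  for (size_t i = 0; i < n; i++) { tmp[i] = from[i]; }  // load phase  (LMG)
  for (size_t i = 0; i < n; i++) { to[i]  = tmp[i]; }   // store phase (STMG)
}
#endif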
//--------------------------------------------------------------
// Atomic copying of 4-byte entities.
// Exactly 4 (four) entities are copied.
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 4-byte aligned.
//--------------------------------------------------------------
#define COPY4_ATOMIC_4(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      3,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LM      0,3,0(3)            \n\t" /* load data            */             \
    "STM     0,3,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */                            \
  );                                                                          \
}
#define COPY4_ATOMIC_3(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      2,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LM      0,2,0(2)            \n\t" /* load data            */             \
    "STM     0,2,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1", "r2"       /* clobbered */                            \
  );                                                                          \
}
#define COPY4_ATOMIC_2(_to,_from) {                                           \
  unsigned long toaddr;                                                       \
  asm(                                                                        \
    "LG      1,%[from]           \n\t" /* address of from area */             \
    "LG      %[toaddr],%[to]     \n\t" /* address of to area   */             \
    "LM      0,1,0(1)            \n\t" /* load data            */             \
    "STM     0,1,0(%[toaddr])    \n\t" /* store data           */             \
    : [to]     "+Q" (_to)        /* outputs   */                              \
    , [from]   "+Q" (_from)                                                   \
    , [toaddr] "=a" (toaddr)                                                  \
    :                                                                         \
    : "cc", "r0", "r1"             /* clobbered */                            \
  );                                                                          \
}
#define COPY4_ATOMIC_1(_to,_from) {                                           \
  unsigned long addr;                                                         \
  asm(                                                                        \
    "LG      %[addr],%[from]     \n\t" /* address of from area */             \
    "L       0,0(0,%[addr])      \n\t" /* load data            */             \
    "LG      %[addr],%[to]       \n\t" /* address of to area   */             \
    "ST      0,0(0,%[addr])      \n\t" /* store data           */             \
    : [to]   "+Q" (_to)          /* outputs   */                              \
    , [from] "+Q" (_from)                                                     \
    , [addr] "=a" (addr)                                                      \
    :                                                                         \
    : "cc", "r0"                   /* clobbered */                            \
  );                                                                          \
}

#if 0 // Waiting for gcc to support EXRL.
#define MVC_MEMCOPY(_to,_from,_len)                                           \
if (VM_Version::has_ExecuteExtensions()) {                                    \
  asm("\t"                                                                    \
      " LAY     1,-1(0,%[len])    \n\t" /* decr for MVC      */               \
      " EXRL    1,1f              \n\t" /* execute MVC instr */               \
      " BRC     15,2f             \n\t" /* skip template     */               \
      "1: MVC   0(%[len],%[to]),0(%[from]) \n\t"                              \
      "2: BCR   0,0               \n\t"                                       \
      : [to]   "+Q" (_to)         /* outputs   */                             \
      , [from] "+Q" (_from)                                                   \
      : [len]  "r"  (_len)        /* inputs    */                             \
      : "cc", "r1"                /* clobbered */                             \
  );                                                                          \
} else {                                                                      \
  asm("\t"                                                                    \
      " LARL    2,3f              \n\t"                                       \
      " LAY     1,-1(0,%[len])    \n\t" /* decr for MVC      */               \
      " EX      1,0(2)            \n\t" /* execute MVC instr */               \
      " BRC     15,4f             \n\t" /* skip template     */               \
      "3: MVC   0(%[len],%[to]),0(%[from]) \n\t"                              \
      "4: BCR   0,0               \n\t"                                       \
      : [to]   "+Q" (_to)         /* outputs   */                             \
      , [from] "+Q" (_from)                                                   \
      : [len]  "r"  (_len)        /* inputs    */                             \
      : "cc", "r1", "r2"          /* clobbered */                             \
  );                                                                          \
}
#else
#define MVC_MEMCOPY(_to,_from,_len)                                           \
{ unsigned long toaddr;   unsigned long tolen;                                \
  unsigned long fromaddr; unsigned long target;                               \
  asm("\t"                                                                    \
      " LTGR    %[tolen],%[len]        \n\t" /* decr for MVC        */        \
      " BRC     8,2f                   \n\t" /* do nothing for l=0  */        \
      " AGHI    %[tolen],-1            \n\t"                                  \
      " LG      %[toaddr],%[to]        \n\t"                                  \
      " LG      %[fromaddr],%[from]    \n\t"                                  \
      " LARL    %[target],1f           \n\t" /* addr of MVC instr   */        \
      " EX      %[tolen],0(%[target])  \n\t" /* execute MVC instr   */        \
      " BRC     15,2f                  \n\t" /* skip template       */        \
      "1: MVC   0(1,%[toaddr]),0(%[fromaddr]) \n\t"                           \
      "2: BCR   0,0                    \n\t" /* nop as branch target */       \
      : [to]       "+Q" (_to)          /* outputs   */                        \
      , [from]     "+Q" (_from)                                               \
      , [tolen]    "=a" (tolen)                                               \
      , [toaddr]   "=a" (toaddr)                                              \
      , [fromaddr] "=a" (fromaddr)                                            \
      , [target]   "=a" (target)                                              \
      : [len]      "r"  (_len)         /* inputs    */                        \
      : "cc"                           /* clobbered */                        \
  );                                                                          \
}
#endif
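
// Illustration only (hypothetical helper, inactive): a C model of what
// MVC_MEMCOPY above computes. EX/EXRL patch the length field of the MVC
// template at run time, and only the rightmost 8 bits of the length are
// used, so the single template covers lengths of 1 to 256 bytes; a length
// of 0 is a no-op.
#if 0
static void mvc_memcopy_model(void* to, const void* from, size_t len) {
  // Preconditions as for the macro: 0 <= len <= 256, no destructive overlap.
  unsigned char*       t = (unsigned char*)to;
  const unsigned char* f = (const unsigned char*)from;
  while (len-- > 0) { *t++ = *f++; }  // MVC moves bytes strictly left to right
}
#endif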
#if 0 // code snippet to be used for debugging
      /* ASSERT code BEGIN */                                                 \
      " LARL    %[len],5f         \n\t"                                       \
      " LARL    %[mta],4f         \n\t"                                       \
      " SLGR    %[len],%[mta]     \n\t"                                       \
      " CGHI    %[len],16         \n\t"                                       \
      " BRC     7,9f              \n\t" /* block size != 16 */                \
                                                                              \
      " LARL    %[len],1f         \n\t"                                       \
      " SLGR    %[len],%[mta]     \n\t"                                       \
      " CGHI    %[len],256        \n\t"                                       \
      " BRC     7,9f              \n\t" /* list len != 256 */                 \
                                                                              \
      " LGR     0,0               \n\t" /* artificial SIGILL */               \
      "9: BRC   7,-2              \n\t"                                       \
      " LARL    %[mta],1f         \n\t" /* restore MVC table begin */         \
      /* ASSERT code END */
#endif

// Optimized copying for data less than 4k
// - no destructive overlap
// - 0 <= _n_bytes <= 4096
// This macro needs to be gcc-compiled with -march=z990. Otherwise, the
// LAY instruction is not available.
#define MVC_MULTI(_to,_from,_n_bytes)                                         \
{ unsigned long toaddr;                                                       \
  unsigned long fromaddr;                                                     \
  unsigned long movetable;                                                    \
  unsigned long len;                                                          \
  asm("\t"                                                                    \
      " LTGFR   %[len],%[nby]          \n\t"                                  \
      " LG      %[ta],%[to]            \n\t" /* address of to area    */      \
      " BRC     8,1f                   \n\t" /* nothing to copy       */      \
                                                                              \
      " NILL    %[nby],255             \n\t" /* # bytes mod 256       */      \
      " LG      %[fa],%[from]          \n\t" /* address of from area  */      \
      " BRC     8,3f                   \n\t" /* no rest, skip copying */      \
                                                                              \
      " LARL    %[mta],2f              \n\t" /* MVC template addr     */      \
      " AHI     %[nby],-1              \n\t" /* adjust for EX MVC     */      \
                                                                              \
      " EX      %[nby],0(%[mta])       \n\t" /* only rightmost        */      \
                                             /* 8 bits of nby used    */      \
      /* Since nby is <= 4096 on entry to this code, we need no       */      \
      /* zero extension before using it in addr calc.                 */      \
      " LA      %[fa],1(%[nby],%[fa])  \n\t" /* adjust from addr      */      \
      " LA      %[ta],1(%[nby],%[ta])  \n\t" /* adjust to addr        */      \
                                                                              \
      "3: SRAG  %[nby],%[len],8        \n\t" /* # cache lines         */      \
      " LARL    %[mta],1f              \n\t" /* MVC table begin       */      \
      " BRC     8,1f                   \n\t" /* nothing to copy       */      \
                                                                              \
      /* Insert ASSERT code here if required. */                              \
                                                                              \
                                                                              \
      " LNGFR   %[nby],%[nby]          \n\t" /* negative offset into     */   \
      " SLLG    %[nby],%[nby],4        \n\t" /* MVC table 16-byte blocks */   \
      " BC      15,0(%[nby],%[mta])    \n\t" /* branch to block #ncl     */   \
                                                                              \
      "2: MVC   0(1,%[ta]),0(%[fa])    \n\t" /* MVC template */               \
                                                                              \
      "4: MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 4096 == l        */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "5: MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3840 <= l < 4096 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3584 <= l < 3840 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3328 <= l < 3584 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3072 <= l < 3328 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2816 <= l < 3072 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2560 <= l < 2816 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2304 <= l < 2560 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2048 <= l < 2304 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1792 <= l < 2048 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1536 <= l < 1792 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1280 <= l < 1536 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1024 <= l < 1280 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /*  768 <= l < 1024 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /*  512 <= l <  768 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /*  256 <= l <  512 */           \
      "   LAY   %[ta],256(0,%[ta])     \n\t"                                  \
      "   LA    %[fa],256(0,%[fa])     \n\t"                                  \
      "1: BCR   0,0                    \n\t" /* nop as branch target */       \
      : [to]   "+Q" (_to)       /* outputs   */                               \
      , [from] "+Q" (_from)                                                   \
      , [ta]   "=a" (toaddr)                                                  \
      , [fa]   "=a" (fromaddr)                                                \
      , [mta]  "=a" (movetable)                                               \
      , [nby]  "+a" (_n_bytes)                                                \
      , [len]  "=a" (len)                                                     \
      :                                                                       \
      : "cc"                    /* clobbered */                               \
  );                                                                          \
}
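
// Illustration only (hypothetical helper, inactive): a C model of the control
// flow of MVC_MULTI above. The macro first copies the (n_bytes mod 256) rest
// via an EXecuted MVC, then branches into the MVC table so that exactly
// (n_bytes / 256) full 256-byte moves are executed.
#if 0
static void mvc_multi_model(void* to, const void* from, size_t n_bytes) {
  // Preconditions as for the macro: 0 <= n_bytes <= 4096, no destructive overlap.
  unsigned char*       t = (unsigned char*)to;
  const unsigned char* f = (const unsigned char*)from;
  size_t rest = n_bytes & 255;                              // NILL %[nby],255
  for (size_t i = 0; i < rest; i++) { t[i] = f[i]; }        // EXecuted MVC template
  t += rest;
  f += rest;
  for (size_t lines = n_bytes >> 8; lines > 0; lines--) {   // SRAG %[nby],%[len],8
    for (size_t i = 0; i < 256; i++) { t[i] = f[i]; }       // one table entry: MVC 0(256,...)
    t += 256;
    f += 256;
  }
}
#endif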
#define MVCLE_MEMCOPY(_to,_from,_len)                                         \
  asm(                                                                        \
      " LG      0,%[to]      \n\t" /* address of to area   */                 \
      " LG      2,%[from]    \n\t" /* address of from area */                 \
      " LGR     1,%[len]     \n\t" /* len of to area       */                 \
      " LGR     3,%[len]     \n\t" /* len of from area     */                 \
      "1: MVCLE 0,2,176      \n\t" /* copy storage, bypass cache (0xb0) */    \
      " BRC     1,1b         \n\t" /* retry if interrupted */                 \
      : [to]   "+Q" (_to)          /* outputs   */                            \
      , [from] "+Q" (_from)        /* outputs   */                            \
      : [len]  "r"  (_len)         /* inputs    */                            \
      : "cc", "r0", "r1", "r2", "r3" /* clobbered */                          \
  );

#define MVCLE_MEMINIT(_to,_val,_len)                                          \
  asm(                                                                        \
      " LG      0,%[to]       \n\t" /* address of to area   */                \
      " LGR     1,%[len]      \n\t" /* len of to area       */                \
      " XGR     3,3           \n\t" /* from area len = 0    */                \
      "1: MVCLE 0,2,0(%[val]) \n\t" /* init storage         */                \
      " BRC     1,1b          \n\t" /* retry if interrupted */                \
      : [to]  "+Q" (_to)            /* outputs   */                           \
      : [len] "r"  (_len)           /* inputs    */                           \
      , [val] "r"  (_val)           /* inputs    */                           \
      : "cc", "r0", "r1", "r3"      /* clobbered */                           \
  );
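
// Illustration only (hypothetical helper, inactive): a functional model of
// MVCLE_MEMINIT/MVCLE_MEMZERO above. With a source length of zero, MVCLE
// takes every destination byte from the padding byte, so the area is simply
// preset with that value (0 in the MVCLE_MEMZERO case).
#if 0
static void mvcle_meminit_model(void* to, unsigned char pad, size_t len) {
  unsigned char* t = (unsigned char*)to;
  while (len-- > 0) { *t++ = pad; }  // every byte comes from the padding byte
}
#endif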
#define MVCLE_MEMZERO(_to,_len)                                               \
  asm(                                                                        \
      " LG      0,%[to]      \n\t" /* address of to area   */                 \
      " LGR     1,%[len]     \n\t" /* len of to area        */                \
      " XGR     3,3          \n\t" /* from area len = 0     */                \
      "1: MVCLE 0,2,0        \n\t" /* clear storage         */                \
      " BRC     1,1b         \n\t" /* retry if interrupted  */                \
      : [to]  "+Q" (_to)           /* outputs   */                            \
      : [len] "r"  (_len)          /* inputs    */                            \
      : "cc", "r0", "r1", "r3"     /* clobbered */                            \
  );

// Clear a stretch of memory, 0 <= _len <= 256.
// There is no alignment prereq.
// There is no test for len out of the range specified above.
#define XC_MEMZERO_256(_to,_len)                                              \
{ unsigned long toaddr; unsigned long tolen;                                  \
  unsigned long target;                                                       \
  asm("\t"                                                                    \
      " LTGR    %[tolen],%[len]       \n\t" /* decr for EX XC       */        \
      " BRC     8,2f                  \n\t" /* do nothing for l=0   */        \
      " AGHI    %[tolen],-1           \n\t" /* adjust for EX XC     */        \
      " LARL    %[target],1f          \n\t" /* addr of XC instr     */        \
      " LG      %[toaddr],%[to]       \n\t" /* addr of data area    */        \
      " EX      %[tolen],0(%[target]) \n\t" /* execute XC instr     */        \
      " BRC     15,2f                 \n\t" /* skip template        */        \
      "1: XC    0(1,%[toaddr]),0(%[toaddr]) \n\t"                             \
      "2: BCR   0,0                   \n\t" /* nop as branch target */        \
      : [to]     "+Q" (_to)     /* outputs   */                               \
      , [tolen]  "=a" (tolen)                                                 \
      , [toaddr] "=a" (toaddr)                                                \
      , [target] "=a" (target)                                                \
      : [len]    "r"  (_len)    /* inputs    */                               \
      : "cc"                    /* clobbered */                               \
  );                                                                          \
}

// Clear a stretch of memory, 256 < _len.
// XC_MEMZERO_256 may be used to clear shorter areas.
//
// The code
// - first zeroes a few bytes to align on a HeapWord.
//   This step is currently inactive because all calls seem
//   to have their data aligned on HeapWord boundaries.
// - then zeroes a few HeapWords to align on a cache line.
// - then zeroes entire cache lines in a loop.
// - then zeroes the remaining (partial) cache line.
// (A plain C model of this sequence is sketched below, after the
// USE_INLINE_ASM block.)
#if 1
#define XC_MEMZERO_ANY(_to,_len)                                              \
{ unsigned long toaddr; unsigned long tolen;                                  \
  unsigned long len8;   unsigned long len256;                                 \
  unsigned long target; unsigned long lenx;                                   \
  asm("\t"                                                                    \
      " LTGR    %[tolen],%[len]            \n\t" /*                      */   \
      " BRC     8,2f                       \n\t" /* do nothing for l=0   */   \
      " LG      %[toaddr],%[to]            \n\t" /* addr of data area    */   \
      " LARL    %[target],1f               \n\t" /* addr of XC instr     */   \
      " "                                                                     \
      " LCGR    %[len256],%[toaddr]        \n\t" /* cache line alignment */   \
      " NILL    %[len256],0xff             \n\t"                              \
      " BRC     8,4f                       \n\t" /* already aligned      */   \
      " NILH    %[len256],0x00             \n\t" /* zero extend          */   \
      " LLGFR   %[len256],%[len256]        \n\t"                              \
      " LAY     %[lenx],-1(,%[len256])     \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"                        \
      " SGR     %[tolen],%[len256]         \n\t" /* adjust len           */   \
      " "                                                                     \
      "4: SRAG  %[lenx],%[tolen],8         \n\t" /* # cache lines        */   \
      " BRC     8,6f                       \n\t" /* no full cache lines  */   \
      "5: XC    0(256,%[toaddr]),0(%[toaddr]) \n\t"                           \
      " LA      %[toaddr],256(,%[toaddr])  \n\t"                              \
      " BRCTG   %[lenx],5b                 \n\t" /* iterate              */   \
      " "                                                                     \
      "6: NILL  %[tolen],0xff              \n\t" /* leftover bytes       */   \
      " BRC     8,2f                       \n\t" /* done if none         */   \
      " LAY     %[lenx],-1(,%[tolen])      \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " BRC     15,2f                      \n\t" /* skip template        */   \
      " "                                                                     \
      "1: XC    0(1,%[toaddr]),0(%[toaddr]) \n\t"                             \
      "2: BCR   0,0                        \n\t" /* nop as branch target */   \
      : [to]     "+Q" (_to)     /* outputs   */                               \
      , [lenx]   "=a" (lenx)                                                  \
      , [len256] "=a" (len256)                                                \
      , [tolen]  "=a" (tolen)                                                 \
      , [toaddr] "=a" (toaddr)                                                \
      , [target] "=a" (target)                                                \
      : [len]    "r"  (_len)    /* inputs    */                               \
      : "cc"                    /* clobbered */                               \
  );                                                                          \
}
#else
#define XC_MEMZERO_ANY(_to,_len)                                              \
{ unsigned long toaddr; unsigned long tolen;                                  \
  unsigned long len8;   unsigned long len256;                                 \
  unsigned long target; unsigned long lenx;                                   \
  asm("\t"                                                                    \
      " LTGR    %[tolen],%[len]            \n\t" /*                      */   \
      " BRC     8,2f                       \n\t" /* do nothing for l=0   */   \
      " LG      %[toaddr],%[to]            \n\t" /* addr of data area    */   \
      " LARL    %[target],1f               \n\t" /* addr of XC instr     */   \
      " "                                                                     \
      " LCGR    %[len8],%[toaddr]          \n\t" /* HeapWord alignment   */   \
      " NILL    %[len8],0x07               \n\t"                              \
      " BRC     8,3f                       \n\t" /* already aligned      */   \
      " NILH    %[len8],0x00               \n\t" /* zero extend          */   \
      " LLGFR   %[len8],%[len8]            \n\t"                              \
      " LAY     %[lenx],-1(,%[len8])       \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " LA      %[toaddr],0(%[len8],%[toaddr]) \n\t"                          \
      " SGR     %[tolen],%[len8]           \n\t" /* adjust len           */   \
      " "                                                                     \
      "3: LCGR  %[len256],%[toaddr]        \n\t" /* cache line alignment */   \
      " NILL    %[len256],0xff             \n\t"                              \
      " BRC     8,4f                       \n\t" /* already aligned      */   \
      " NILH    %[len256],0x00             \n\t" /* zero extend          */   \
      " LLGFR   %[len256],%[len256]        \n\t"                              \
      " LAY     %[lenx],-1(,%[len256])     \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"                        \
      " SGR     %[tolen],%[len256]         \n\t" /* adjust len           */   \
      " "                                                                     \
      "4: SRAG  %[lenx],%[tolen],8         \n\t" /* # cache lines        */   \
      " BRC     8,6f                       \n\t" /* no full cache lines  */   \
      "5: XC    0(256,%[toaddr]),0(%[toaddr]) \n\t"                           \
      " LA      %[toaddr],256(,%[toaddr])  \n\t"                              \
      " BRCTG   %[lenx],5b                 \n\t" /* iterate              */   \
      " "                                                                     \
      "6: NILL  %[tolen],0xff              \n\t" /* leftover bytes       */   \
      " BRC     8,2f                       \n\t" /* done if none         */   \
      " LAY     %[lenx],-1(,%[tolen])      \n\t"                              \
      " EX      %[lenx],0(%[target])       \n\t" /* execute XC instr     */   \
      " BRC     15,2f                      \n\t" /* skip template        */   \
      " "                                                                     \
      "1: XC    0(1,%[toaddr]),0(%[toaddr]) \n\t"                             \
      "2: BCR   0,0                        \n\t" /* nop as branch target */   \
      : [to]     "+Q" (_to)     /* outputs   */                               \
      , [lenx]   "=a" (lenx)                                                  \
      , [len8]   "=a" (len8)                                                  \
      , [len256] "=a" (len256)                                                \
      , [tolen]  "=a" (tolen)                                                 \
      , [toaddr] "=a" (toaddr)                                                \
      , [target] "=a" (target)                                                \
      : [len]    "r"  (_len)    /* inputs    */                               \
      : "cc"                    /* clobbered */                               \
  );                                                                          \
}
#endif
#endif // USE_INLINE_ASM
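
// Illustration only (hypothetical helper, inactive): a C model of the
// XC_MEMZERO_ANY sequence above. It zeroes up to the next 256-byte (cache
// line) boundary, then whole cache lines, then the remaining partial line.
// Assumes len > 256, as required for that macro.
#if 0
static void xc_memzero_any_model(void* to, size_t len) {
  unsigned char* t = (unsigned char*)to;
  size_t align = (256 - ((unsigned long)t & 255)) & 255;  // bytes up to the next cache line
  for (size_t i = 0; i < align; i++) { t[i] = 0; }        // EXecuted XC, partial line
  t   += align;
  len -= align;
  while (len >= 256) {                                    // full cache lines
    for (size_t i = 0; i < 256; i++) { t[i] = 0; }        // XC 0(256,...),0(...)
    t   += 256;
    len -= 256;
  }
  for (size_t i = 0; i < len; i++) { t[i] = 0; }          // leftover bytes
}
#endif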

//*************************************//
//   D I S J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_disjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very frequent, some tests frequent.

  // Copy HeapWord (=DW) aligned storage. Use MVCLE in inline-asm code.
  // MVCLE guarantees DW concurrent (i.e. atomic) accesses if both the addresses of the operands
  // are DW aligned and the length is an integer multiple of a DW. Should always be true here.
  //
  // No special exploit needed. H/W discovers suitable situations itself.
  //
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) copy short piece of memory to page-align address(es)
  // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) copy the remaining piece of memory.
  //
#ifdef USE_INLINE_ASM
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count*HeapWordSize;

  // Optimized copying for data less than 4k
  switch (count) {
    case 0: return;
    case 1: MOVE8_ATOMIC_1(to,from)
            return;
    case 2: MOVE8_ATOMIC_2(to,from)
            return;
    // case 3: MOVE8_ATOMIC_3(to,from)
    //         return;
    // case 4: MOVE8_ATOMIC_4(to,from)
    //         return;
    default:
      if (len_bytes <= 4096) {
        MVC_MULTI(to,from,len_bytes)
        return;
      }
      // else
      MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
      return;
  }
#else
  // Fallback code.
  switch (count) {
    case 0:
      return;

    case 1:
      *to = *from;
      return;

    case 2:
      *to++ = *from++;
      *to = *from;
      return;

    case 3:
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    case 4:
      *to++ = *from++;
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    default:
      while (count-- > 0)
        *(to++) = *(from++);
      return;
  }
#endif
}

static void pd_disjoint_words_atomic(const HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: < 4k calls.
  assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}

static void pd_disjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very rare.
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}


//*************************************//
//   C O N J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_conjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: between some and lower end of frequent.

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from)
              return;
      case 3: COPY8_ATOMIC_3(to,from)
              return;
      case 2: COPY8_ATOMIC_2(to,from)
              return;
      case 1: COPY8_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  return;
#else
  // Fallback code.
  if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) {
    HeapWord t1, t2, t3;
    switch (count) {
      case 0:
        return;

      case 1:
        *to = *from;
        return;

      case 2:
        t1 = *(from+1);
        *to = *from;
        *(to+1) = t1;
        return;

      case 3:
        t1 = *(from+1);
        t2 = *(from+2);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        return;

      case 4:
        t1 = *(from+1);
        t2 = *(from+2);
        t3 = *(from+3);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        *(to+3) = t3;
        return;

      default:
        from += count;
        to   += count;
        while (count-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_disjoint_words(from, to, count);
#endif
}
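
// Illustration only (hypothetical helper, inactive): the overlap rule applied
// by the conjoint routines above and below. A destructively overlapping pair
// (from < to < from + byte_count) must be copied highest address first;
// any other layout may be copied forwards or handed to the MVCLE-based
// disjoint path.
#if 0
static void conjoint_copy_model(const HeapWord* from, HeapWord* to, size_t count) {
  if (has_destructive_overlap((char*)from, (char*)to, count*HeapWordSize)) {
    while (count-- > 0) { to[count] = from[count]; }         // copy backwards
  } else {
    for (size_t i = 0; i < count; i++) { to[i] = from[i]; }  // copy forwards
  }
}
#endif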

static void pd_conjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_conjoint_words(from, to, count);
}

static void pd_conjoint_bytes(const void* from, void* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in))
    (void)memmove(to, from, count_in);
  else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  if (has_destructive_overlap((char*)from, (char*)to, count))
    (void)memmove(to, from, count);
  else
    (void)memcpy(to, from, count);
#endif
}

//**************************************************//
//   C O N J O I N T   A T O M I C   C O P Y I N G  //
//**************************************************//

static void pd_conjoint_bytes_atomic(const void* from, void* to, size_t count) {
  // Call arraycopy stubs to do the job.
  pd_conjoint_bytes(from, to, count); // bytes are always accessed atomically.
}

static void pd_conjoint_jshorts_atomic(const jshort* from, jshort* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((const char*)from, (char*)to, count_in*BytesPerShort)) {
    // Use optimizations from shared code where no z-specific optimization exists.
    copy_conjoint_jshorts_atomic(from, to, count);
  } else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerShort;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jshorts_atomic(from, to, count);
#endif
}

static void pd_conjoint_jints_atomic(const jint* from, jint* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((const char*)from, (char*)to, count_in*BytesPerInt)) {
    switch (count_in) {
      case 4: COPY4_ATOMIC_4(to,from)
              return;
      case 3: COPY4_ATOMIC_3(to,from)
              return;
      case 2: COPY4_ATOMIC_2(to,from)
              return;
      case 1: COPY4_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        // Use optimizations from shared code where no z-specific optimization exists.
        copy_conjoint_jints_atomic(from, to, count_in);
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerInt;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jints_atomic(from, to, count);
#endif
}

static void pd_conjoint_jlongs_atomic(const jlong* from, jlong* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    if (count_in < 8) {
      from += count_in;
      to   += count_in;
      while (count_in-- > 0)
        *(--to) = *(--from); // Copy backwards, areas overlap destructively.
      return;
    }
    // else
    from += count_in-1;
    to   += count_in-1;
    if (count_in&0x01) {
      *(to--) = *(from--);
      count_in--;
    }
    for (; count_in>0; count_in-=2) {
      *to     = *from;
      *(to-1) = *(from-1);
      to   -= 2;
      from -= 2;
    }
  }
  else
    pd_aligned_disjoint_words((const HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
#endif
}

static void pd_conjoint_oops_atomic(const oop* from, oop* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerOop;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    from += count_in;
    to   += count_in;
    while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively.
    return;
  }
  // else
  pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
  return;
#endif
}

static void pd_arrayof_conjoint_bytes(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_bytes_atomic(from, to, count);
}

static void pd_arrayof_conjoint_jshorts(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jshorts_atomic((const jshort*)from, (jshort*)to, count);
}

static void pd_arrayof_conjoint_jints(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jints_atomic((const jint*)from, (jint*)to, count);
}

static void pd_arrayof_conjoint_jlongs(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jlongs_atomic((const jlong*)from, (jlong*)to, count);
}

static void pd_arrayof_conjoint_oops(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_oops_atomic((const oop*)from, (oop*)to, count);
}

//**********************************************//
//   M E M O R Y   I N I T I A L I S A T I O N  //
//**********************************************//

static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
  // JVM2008: very rare, only in some tests.
#ifdef USE_INLINE_ASM
  // Initialize storage to a given value. Use memset instead of copy loop.
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) init short piece of memory to page-align address
  // 2) init largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) init the remaining piece of memory.
  // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
  // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.

  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  MVCLE_MEMINIT(to_bytes, value, len_bytes)

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, value, count);
#endif
}

static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
  // Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
  // JVM2008: < 4k calls.
  if (value == 0) {
    pd_zero_to_words(tohw, count);
    return;
  }
  if (value == ~(juint)(0)) {
    pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
    return;
  }
  julong* to = (julong*) tohw;
  // Replicate the 32-bit value into both halves of a 64-bit word,
  // e.g. 0xDEADBEEF -> 0xDEADBEEFDEADBEEF.
  julong  v  = ((julong) value << 32) | value;
  while (count-- > 0) {
    *to++ = v;
  }
}

static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  // JVM2008: very frequent, but virtually all calls are with value == 0.
  pd_fill_to_words(tohw, count, value);
}

//**********************************//
//   M E M O R Y   C L E A R I N G  //
//**********************************//

// Delegate to pd_zero_to_bytes. It is also HeapWord-atomic.
// Distinguish between simple and large zero_to_words.
static void pd_zero_to_words(HeapWord* tohw, size_t count) {
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

static void pd_zero_to_bytes(void* to, size_t count) {
  // JVM2008: some calls (generally), some tests frequent.
#ifdef USE_INLINE_ASM
  // Even zero_to_bytes() requires HeapWord-atomic, or, at least, sequential
  // zeroing of the memory. MVCLE is not fit for that job:
  //   "As observed by other CPUs and by the channel subsystem,
  //    that portion of the first operand which is filled
  //    with the padding byte is not necessarily stored into in
  //    a left-to-right direction and may appear to be stored
  //    into more than once."
  // Therefore, the implementation was changed to use (multiple) XC instructions.

  const long line_size = 256;
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  if (len_bytes <= line_size) {
    XC_MEMZERO_256(to_bytes, len_bytes);
  } else {
    XC_MEMZERO_ANY(to_bytes, len_bytes);
  }

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, 0, count);
#endif
}

#endif // CPU_S390_COPY_S390_HPP