/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2003-2013 Altera Corporation
 * All rights reserved.
 */


#include <linux/linkage.h>
#include <asm/entry.h>

.set noat
.set nobreak

/*
 * Explicitly allow the use of r1 (the assembler temporary register)
 * within this code. This register is normally reserved for the use of
 * the compiler.
 */

/*
 * instruction_trap - software emulation of div, divu, mul, muli,
 * mulxss, mulxsu and mulxuu.
 *
 * On entry sp points at a saved register frame addressed through the
 * PT_* offsets (defined outside this file; presumably a struct pt_regs
 * built by the exception entry code -- confirm against the caller).
 * The routine:
 *   1. reloads r1-r15, ra, fp, gp, estatus and ea from that frame,
 *      keeps the interrupted sp in et, and pops PT_REGS_SIZE bytes;
 *   2. pushes a private 128-byte frame in which register rN lives at
 *      offset 4*N, so operands can be fetched by register number;
 *   3. decodes the instruction at -4(ea), computes the result and
 *      writes it into the destination register's slot in that frame;
 *   4. reloads the registers from the frame and returns with eret.
 *
 * r16-r23 are not reloaded in step 1; they are simply saved in step 2
 * and restored in step 4 (r16 and r17 are used as scratch in between).
 */
ENTRY(instruction_trap)
	ldw	r1, PT_R1(sp)		/* Restore registers */
	ldw	r2, PT_R2(sp)
	ldw	r3, PT_R3(sp)
	ldw	r4, PT_R4(sp)
	ldw	r5, PT_R5(sp)
	ldw	r6, PT_R6(sp)
	ldw	r7, PT_R7(sp)
	ldw	r8, PT_R8(sp)
	ldw	r9, PT_R9(sp)
	ldw	r10, PT_R10(sp)
	ldw	r11, PT_R11(sp)
	ldw	r12, PT_R12(sp)
	ldw	r13, PT_R13(sp)
	ldw	r14, PT_R14(sp)
	ldw	r15, PT_R15(sp)
	ldw	ra, PT_RA(sp)
	ldw	fp, PT_FP(sp)
	ldw	gp, PT_GP(sp)
	ldw	et, PT_ESTATUS(sp)
	wrctl	estatus, et
	ldw	ea, PT_EA(sp)
	ldw	et, PT_SP(sp)		/* backup sp in et */

	addi	sp, sp, PT_REGS_SIZE

	/* INSTRUCTION EMULATION
	 * ---------------------
	 *
	 * Nios II processors generate exceptions for unimplemented instructions.
	 * The routines below emulate these instructions. Depending on the
	 * processor core, the only instructions that might need to be emulated
	 * are div, divu, mul, muli, mulxss, mulxsu, and mulxuu.
	 *
	 * The emulations match the instructions, except for the following
	 * limitations:
	 *
	 * 1) The emulation routines do not emulate the use of the exception
	 *    temporary register (et) as a source operand because the exception
	 *    handler already has modified it.
	 *
	 * 2) The routines do not emulate the use of the stack pointer (sp) or
	 *    the exception return address register (ea) as a destination because
	 *    modifying these registers crashes the exception handler or the
	 *    interrupted routine.
	 *
	 * Detailed Design
	 * ---------------
	 *
	 * The emulation routines expect the contents of integer registers r0-r31
	 * to be on the stack at addresses sp, 4(sp), 8(sp), ... 124(sp). The
	 * routines retrieve source operands from the stack and modify the
	 * destination register's value on the stack prior to the end of the
	 * exception handler. Then all registers except the destination register
	 * are restored to their previous values.
	 *
	 * The instruction that causes the exception is found at address -4(ea).
	 * The instruction's OP and OPX fields identify the operation to be
	 * performed.
	 *
	 * One instruction, muli, is an I-type instruction that is identified by
	 * an OP field of 0x24.
	 *
	 *      muli   AAAAA,BBBBB,IIIIIIIIIIIIIIII,-0x24-
	 *           27    22    6                0    <-- LSB of field
	 *
	 * The remaining emulated instructions are R-type and have an OP field
	 * of 0x3a. Their OPX fields identify them.
	 *
	 *      R-type AAAAA,BBBBB,CCCCC,XXXXXX,NNNNN,-0x3a-
	 *           27    22    17    11     6     0  <-- LSB of field
	 *
	 *
	 * Opcode Encoding. muli is identified by its OP value. Then OPX & 0x02
	 * is used to differentiate between the division opcodes and the
	 * remaining multiplication opcodes.
	 *
	 *      Instruction   OP      OPX    OPX & 0x02
	 *      -----------   ----    ----   ----------
	 *      muli          0x24
	 *      divu          0x3a    0x24         0
	 *      div           0x3a    0x25         0
	 *      mul           0x3a    0x27      != 0
	 *      mulxuu        0x3a    0x07      != 0
	 *      mulxsu        0x3a    0x17      != 0
	 *      mulxss        0x3a    0x1f      != 0
	 */


	/*
	 * Save everything on the stack to make it easy for the emulation
	 * routines to retrieve the source register operands.
	 *
	 * Register rN is kept at offset 4*N so that the 4*A, 4*B and 4*C
	 * values extracted from the instruction can be used directly as
	 * offsets from sp.
	 */

	addi	sp, sp, -128
	stw	zero, 0(sp)	/* Save zero on stack to avoid special case for r0. */
	stw	r1, 4(sp)
	stw	r2, 8(sp)
	stw	r3, 12(sp)
	stw	r4, 16(sp)
	stw	r5, 20(sp)
	stw	r6, 24(sp)
	stw	r7, 28(sp)
	stw	r8, 32(sp)
	stw	r9, 36(sp)
	stw	r10, 40(sp)
	stw	r11, 44(sp)
	stw	r12, 48(sp)
	stw	r13, 52(sp)
	stw	r14, 56(sp)
	stw	r15, 60(sp)
	stw	r16, 64(sp)
	stw	r17, 68(sp)
	stw	r18, 72(sp)
	stw	r19, 76(sp)
	stw	r20, 80(sp)
	stw	r21, 84(sp)
	stw	r22, 88(sp)
	stw	r23, 92(sp)
	/* Don't bother to save et (slot 96). It's already been changed. */
	rdctl	r5, estatus
	stw	r5, 100(sp)	/* estatus is parked in the r25 (bt) slot. */

	stw	gp, 104(sp)
	stw	et, 108(sp)	/* et contains previous sp value. */
	stw	fp, 112(sp)
	stw	ea, 116(sp)
	/* Slot 120 belongs to r30 (ba) and is left unsaved. */
	stw	ra, 124(sp)	/* ra is r31, so its slot is 4*31 = 124. */


	/*
	 * Split the instruction into its fields. We need 4*A, 4*B, and 4*C as
	 * offsets to the stack pointer for access to the stored register values.
	 */
	ldw	r2, -4(ea)	/* r2 = AAAAA,BBBBB,IIIIIIIIIIIIIIII,PPPPPP */
	roli	r3, r2, 7	/* r3 = BBB,IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BB */
	roli	r4, r3, 3	/* r4 = IIIIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB */
	roli	r5, r4, 2	/* r5 = IIIIIIIIIIIIII,PPPPPP,AAAAA,BBBBB,II */
	srai	r4, r4, 16	/* r4 = (sign-extended) IMM16 */
	roli	r6, r5, 5	/* r6 = XXXX,NNNNN,PPPPPP,AAAAA,BBBBB,CCCCC,XX */
	andi	r2, r2, 0x3f	/* r2 = 00000000000000000000000000,PPPPPP */
	andi	r3, r3, 0x7c	/* r3 = 0000000000000000000000000,AAAAA,00 */
	andi	r5, r5, 0x7c	/* r5 = 0000000000000000000000000,BBBBB,00 */
	andi	r6, r6, 0x7c	/* r6 = 0000000000000000000000000,CCCCC,00 */

	/* Now
	 * r2 = OP
	 * r3 = 4*A
	 * r4 = IMM16 (sign extended)
	 * r5 = 4*B
	 * r6 = 4*C
	 */

	/*
	 * Get the operands.
	 *
	 * It is necessary to check for muli because it uses an I-type
	 * instruction format, while the other instructions have an R-type
	 * format.
	 *
	 * Prepare for either multiplication or division loop.
	 * They both loop 32 times.
	 */
	movi	r14, 32

	add	r3, r3, sp	/* r3 = address of A-operand. */
	ldw	r3, 0(r3)	/* r3 = A-operand. */
	movi	r7, 0x24	/* muli opcode (I-type instruction format) */
	beq	r2, r7, mul_immed /* muli doesn't use the B register as a source */

	add	r5, r5, sp	/* r5 = address of B-operand. */
	ldw	r5, 0(r5)	/* r5 = B-operand. */
				/* r4 = SSSSSSSSSSSSSSSS,-----IMM16------ */
				/* IMM16 not needed, align OPX portion */
				/* r4 = SSSSSSSSSSSSSSSS,CCCCC,-OPX--,00000 */
	srli	r4, r4, 5	/* r4 = 00000,SSSSSSSSSSSSSSSS,CCCCC,-OPX-- */
	andi	r4, r4, 0x3f	/* r4 = 00000000000000000000000000,-OPX-- */

	/* Now
	 * r2 = OP
	 * r3 = src1
	 * r5 = src2
	 * r4 = OPX (no longer can be muli)
	 * r6 = 4*C
	 */


	/*
	 * Multiply or Divide?
	 */
	andi	r7, r4, 0x02	/* For R-type multiply instructions,
				   OPX & 0x02 != 0 */
	bne	r7, zero, multiply


	/* DIVISION
	 *
	 * Divide an unsigned dividend by an unsigned divisor using
	 * a shift-and-subtract algorithm. The example below shows
	 * 43 div 7 = 6 for 8-bit integers. This classic algorithm uses a
	 * single register to store both the dividend and the quotient,
	 * allowing both values to be shifted with a single instruction.
	 *
	 *                               remainder dividend:quotient
	 *                               --------- -----------------
	 *   initialize                   00000000     00101011:
	 *   shift                        00000000     0101011:_
	 *   remainder >= divisor? no     00000000     0101011:0
	 *   shift                        00000000     101011:0_
	 *   remainder >= divisor? no     00000000     101011:00
	 *   shift                        00000001     01011:00_
	 *   remainder >= divisor? no     00000001     01011:000
	 *   shift                        00000010     1011:000_
	 *   remainder >= divisor? no     00000010     1011:0000
	 *   shift                        00000101     011:0000_
	 *   remainder >= divisor? no     00000101     011:00000
	 *   shift                        00001010     11:00000_
	 *   remainder >= divisor? yes    00001010     11:000001
	 *       remainder -= divisor   - 00000111
	 *                              ----------
	 *                                00000011     11:000001
	 *   shift                        00000111     1:000001_
	 *   remainder >= divisor? yes    00000111     1:0000011
	 *       remainder -= divisor   - 00000111
	 *                              ----------
	 *                                00000000     1:0000011
	 *   shift                        00000001     :0000011_
	 *   remainder >= divisor? no     00000001     :00000110
	 *
	 * The quotient is 00000110.
	 */

divide:
	/*
	 * Prepare for division by assuming the result
	 * is unsigned, and storing its "sign" as 0.
	 */
	movi	r17, 0


	/* Which division opcode? */
	xori	r7, r4, 0x25		/* OPX of div */
	bne	r7, zero, unsigned_division


	/*
	 * OPX is div. Determine and store the sign of the quotient.
	 * Then take the absolute value of both operands.
	 */
	xor	r17, r3, r5		/* MSB contains sign of quotient */
	bge	r3, zero, dividend_is_nonnegative
	sub	r3, zero, r3		/* -r3 */
dividend_is_nonnegative:
	bge	r5, zero, divisor_is_nonnegative
	sub	r5, zero, r5		/* -r5 */
divisor_is_nonnegative:


unsigned_division:
	/* Initialize the unsigned-division loop. */
	movi	r13, 0	/* remainder = 0 */

	/* Now
	 * r3 = dividend : quotient
	 * r4 = 0x25 for div, 0x24 for divu
	 * r5 = divisor
	 * r13 = remainder
	 * r14 = loop counter (already initialized to 32)
	 * r17 = MSB contains sign of quotient
	 */


	/*
	 * for (count = 32; count > 0; --count)
	 * {
	 */
divide_loop:

	/*
	 * Division:
	 *
	 * (remainder:dividend:quotient) <<= 1;
	 */
	slli	r13, r13, 1
	cmplt	r7, r3, zero		/* r7 = MSB of r3 */
	or	r13, r13, r7
	slli	r3, r3, 1


	/*
	 * if (remainder >= divisor)
	 * {
	 *	set LSB of quotient
	 *	remainder -= divisor;
	 * }
	 */
	bltu	r13, r5, div_skip
	ori	r3, r3, 1
	sub	r13, r13, r5
div_skip:

	/*
	 * }
	 */
	subi	r14, r14, 1
	bne	r14, zero, divide_loop


	/* Now
	 * r3 = quotient
	 * r4 = 0x25 for div, 0x24 for divu
	 * r6 = 4*C
	 * r17 = MSB contains sign of quotient
	 */


	/*
	 * Conditionally negate signed quotient. If quotient is unsigned,
	 * the sign already is initialized to 0.
	 */
	bge	r17, zero, quotient_is_nonnegative
	sub	r3, zero, r3		/* -r3 */
quotient_is_nonnegative:


	/*
	 * Final quotient is in r3.
	 */
	add	r6, r6, sp
	stw	r3, 0(r6)	/* write quotient to stack */
	br	restore_registers




	/* MULTIPLICATION
	 *
	 * A "product" is the number that one gets by summing a "multiplicand"
	 * several times. The "multiplier" specifies the number of copies of the
	 * multiplicand that are summed.
	 *
	 * Actual multiplication algorithms don't use repeated addition, however.
	 * Shift-and-add algorithms get the same answer as repeated addition, and
	 * they are faster. To compute the lower half of a product (pppp below)
	 * one shifts the product left before adding in each of the partial
	 * products (a * mmmm) through (d * mmmm).
	 *
	 * To compute the upper half of a product (PPPP below), one adds in the
	 * partial products (d * mmmm) through (a * mmmm), each time following
	 * the add by a right shift of the product.
	 *
	 *     mmmm
	 *   * abcd
	 *   ------
	 *     ####  = d * mmmm
	 *    ####   = c * mmmm
	 *   ####    = b * mmmm
	 *  ####     = a * mmmm
	 * --------
	 * PPPPpppp
	 *
	 * The example above shows 4 partial products. Computing actual Nios II
	 * products requires 32 partials.
	 *
	 * It is possible to compute the result of mulxsu from the result of
	 * mulxuu because the only difference between the results of these two
	 * opcodes is the value of the partial product associated with the sign
	 * bit of rA.
	 *
	 *   mulxsu = mulxuu - ((rA < 0) ? rB : 0);
	 *
	 * It is possible to compute the result of mulxss from the result of
	 * mulxsu because the only difference between the results of these two
	 * opcodes is the value of the partial product associated with the sign
	 * bit of rB.
	 *
	 *   mulxss = mulxsu - ((rB < 0) ? rA : 0);
	 *
	 */

mul_immed:
	/* Opcode is muli. Change it into mul for remainder of algorithm. */
	mov	r6, r5		/* Field B is dest register, not field C. */
	mov	r5, r4		/* Field IMM16 is src2, not field B. */
	movi	r4, 0x27	/* OPX of mul is 0x27 */

multiply:
	/* Initialize the multiplication loop. */
	movi	r9, 0	/* mul_product = 0 */
	movi	r10, 0	/* mulxuu_product = 0 */
	mov	r11, r5	/* save original multiplier for mulxsu and mulxss */
	mov	r12, r5	/* mulxuu_multiplier (will be shifted) */
	movi	r16, 1	/* used to create "rori B,A,1" from "ror B,A,r16" */

	/* Now
	 * r3 = multiplicand
	 * r5 = mul_multiplier
	 * r6 = 4 * dest_register (used later as offset to sp)
	 * r7 = temp
	 * r9 = mul_product
	 * r10 = mulxuu_product
	 * r11 = original multiplier
	 * r12 = mulxuu_multiplier
	 * r14 = loop counter (already initialized)
	 * r16 = 1
	 */


	/*
	 * for (count = 32; count > 0; --count)
	 * {
	 */
multiply_loop:

	/*
	 * mul_product <<= 1;
	 * lsb = multiplier & 1;
	 */
	slli	r9, r9, 1
	andi	r7, r12, 1

	/*
	 * if (lsb == 1)
	 * {
	 *	mulxuu_product += multiplicand;
	 * }
	 *
	 * When lsb == 0 the branch is taken with r7 == 0, so the OR of the
	 * "carry" further down is a no-op on that path.
	 */
	beq	r7, zero, mulx_skip
	add	r10, r10, r3
	cmpltu	r7, r10, r3	/* Save the carry from the MSB of mulxuu_product. */
	ror	r7, r7, r16	/* r7 = 0x80000000 on carry, or else 0x00000000 */
mulx_skip:

	/*
	 * if (MSB of mul_multiplier == 1)
	 * {
	 *	mul_product += multiplicand;
	 * }
	 */
	bge	r5, zero, mul_skip
	add	r9, r9, r3
mul_skip:

	/*
	 * mulxuu_product >>= 1;	logical shift
	 * mul_multiplier <<= 1;	done with MSB
	 * mulx_multiplier >>= 1;	done with LSB
	 */
	srli	r10, r10, 1
	or	r10, r10, r7		/* OR in the saved carry bit. */
	slli	r5, r5, 1
	srli	r12, r12, 1


	/*
	 * }
	 */
	subi	r14, r14, 1
	bne	r14, zero, multiply_loop


	/*
	 * Multiply emulation loop done.
	 */

	/* Now
	 * r3 = multiplicand
	 * r4 = OPX
	 * r6 = 4 * dest_register (used later as offset to sp)
	 * r7 = temp
	 * r9 = mul_product
	 * r10 = mulxuu_product
	 * r11 = original multiplier
	 */


	/* Calculate address for result from 4 * dest_register */
	add	r6, r6, sp


	/*
	 * Select/compute the result based on OPX.
	 */


	/* OPX == mul? Then store. */
	xori	r7, r4, 0x27
	beq	r7, zero, store_product

	/* It's one of the mulx.. opcodes. Move over the result. */
	mov	r9, r10

	/* OPX == mulxuu? Then store. */
	xori	r7, r4, 0x07
	beq	r7, zero, store_product

	/* Compute mulxsu
	 *
	 * mulxsu = mulxuu - ((rA < 0) ? rB : 0);
	 */
	bge	r3, zero, mulxsu_skip
	sub	r9, r9, r11
mulxsu_skip:

	/* OPX == mulxsu? Then store. */
	xori	r7, r4, 0x17
	beq	r7, zero, store_product

	/* Compute mulxss
	 *
	 * mulxss = mulxsu - ((rB < 0) ? rA : 0);
	 */
	bge	r11, zero, mulxss_skip
	sub	r9, r9, r3
mulxss_skip:
	/* At this point, assume that OPX is mulxss, so store. */


store_product:
	stw	r9, 0(r6)


restore_registers:
	/* No need to restore r0. */
	ldw	r5, 100(sp)
	wrctl	estatus, r5

	ldw	r1, 4(sp)
	ldw	r2, 8(sp)
	ldw	r3, 12(sp)
	ldw	r4, 16(sp)
	ldw	r5, 20(sp)
	ldw	r6, 24(sp)
	ldw	r7, 28(sp)
	ldw	r8, 32(sp)
	ldw	r9, 36(sp)
	ldw	r10, 40(sp)
	ldw	r11, 44(sp)
	ldw	r12, 48(sp)
	ldw	r13, 52(sp)
	ldw	r14, 56(sp)
	ldw	r15, 60(sp)
	ldw	r16, 64(sp)
	ldw	r17, 68(sp)
	ldw	r18, 72(sp)
	ldw	r19, 76(sp)
	ldw	r20, 80(sp)
	ldw	r21, 84(sp)
	ldw	r22, 88(sp)
	ldw	r23, 92(sp)
	/* Does not need to restore et */
	ldw	gp, 104(sp)

	ldw	fp, 112(sp)
	ldw	ea, 116(sp)
	ldw	ra, 124(sp)	/* r31 slot; picks up the result when ra is the destination */
	ldw	sp, 108(sp)	/* last restore sp */
	eret

.set at
.set break