author     Gabe Black <gblack@eecs.umich.edu>  2010-08-25 19:10:42 -0500
committer  Gabe Black <gblack@eecs.umich.edu>  2010-08-25 19:10:42 -0500
commit     6368edb281f162e4fbb0a91744992a25134135f4 (patch)
tree       e84dfa7d10903e6c7a56e01cc6ca23f4b0d41908 /src/arch/arm/isa
parent     f4f6b31df1a8787a12d71108eac24543bdf541e3 (diff)
download   gem5-6368edb281f162e4fbb0a91744992a25134135f4.tar.xz
ARM: Implement all ARM SIMD instructions.
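The decoder changes below replace WarnUnimplemented placeholders with real NEON instruction objects. A pattern that recurs throughout (newNeonMemInst and newNeonMixInst in formats/fp.isa, and the decodeNeon*Reg helpers) is dispatching on the 2-bit element-size field to a class template instantiated once per element type. A minimal standalone sketch of that pattern, using hypothetical names rather than gem5's generated classes:

#include <cstdint>
#include <cstdio>

// Stand-in for a generated NEON instruction class; the real ones are
// produced from the ISA description and take machInst, registers, etc.
template <typename Element>
struct FakeNeonInst {
    static void execute() { std::printf("%zu-byte elements\n", sizeof(Element)); }
};

// Mirrors the shape of newNeonMemInst: the size field picks which
// instantiation of the template handles the operation.
template <template <typename> class Inst>
void dispatchOnSize(unsigned size)
{
    switch (size) {
      case 0: Inst<uint8_t>::execute(); break;   // 8-bit elements
      case 1: Inst<uint16_t>::execute(); break;  // 16-bit elements
      case 2: Inst<uint32_t>::execute(); break;  // 32-bit elements
      case 3: Inst<uint64_t>::execute(); break;  // 64-bit elements
      default: std::printf("unrecognized size %u\n", size); break;
    }
}

int main()
{
    dispatchOnSize<FakeNeonInst>(2);  // prints "4-byte elements"
    return 0;
}

Keeping the element type as a template parameter lets a single execute() definition cover all four element widths, so the decoder only has to translate the size field once.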
Diffstat (limited to 'src/arch/arm/isa')
-rw-r--r--  src/arch/arm/isa/decoder/thumb.isa        |    2
-rw-r--r--  src/arch/arm/isa/formats/fp.isa           | 1643
-rw-r--r--  src/arch/arm/isa/insts/fp.isa             |  176
-rw-r--r--  src/arch/arm/isa/insts/insts.isa          |    5
-rw-r--r--  src/arch/arm/isa/insts/macromem.isa       |  499
-rw-r--r--  src/arch/arm/isa/insts/neon.isa           | 3343
-rw-r--r--  src/arch/arm/isa/operands.isa             |   26
-rw-r--r--  src/arch/arm/isa/templates/macromem.isa   |  192
-rw-r--r--  src/arch/arm/isa/templates/mem.isa        |  200
-rw-r--r--  src/arch/arm/isa/templates/neon.isa       |  227
-rw-r--r--  src/arch/arm/isa/templates/templates.isa  |    3
11 files changed, 5863 insertions(+), 453 deletions(-)
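Most of the churn is the rewritten NEON decoder in src/arch/arm/isa/formats/fp.isa below. As orientation for its first step, here is a standalone restatement of how decodeNeonMem derives the number of elements per structure ("width") from the type field in machInst[11:8]; the function name and the 0-means-undefined convention are illustrative, not gem5's, but the register counts follow the ARM ARM VLDn/VSTn tables:

#include <cassert>
#include <cstdint>

unsigned
neonStructWidth(uint32_t b, bool single)   // b = bits(machInst, 11, 8)
{
    if (single)
        return (b & 0x3) + 1;              // single element/structure forms
    switch ((b >> 1) & 0x7) {              // multiple-structure forms
      case 0x0: return 4;                  // vld4/vst4
      case 0x1: return (b & 0x1) ? 2 : 1;  // vld2 (paired regs) : vld1 (4 regs)
      case 0x2: return 3;                  // vld3/vst3
      case 0x3: return 1;                  // vld1, 1-3 regs
      case 0x4: return 2;                  // vld2/vst2
      case 0x5: return (b & 0x1) ? 0 : 1;  // vld1 (2 regs); odd type undefined
      default:  return 0;                  // undefined encoding
    }
}

int main()
{
    assert(neonStructWidth(0x0, false) == 4);  // vld4, multiple structures
    assert(neonStructWidth(0x8, false) == 2);  // vld2, multiple structures
    assert(neonStructWidth(0x3, true)  == 4);  // vld4, single element/structure
    return 0;
}

Once width is known, the decoder computes the element size, alignment requirement, register stride (inc), and register count (regs), then hands everything to a single VldSingle/VldMult or VstSingle/VstMult macroop instead of one stub per mnemonic.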
diff --git a/src/arch/arm/isa/decoder/thumb.isa b/src/arch/arm/isa/decoder/thumb.isa index 65ea7e30c..d0f5b8646 100644 --- a/src/arch/arm/isa/decoder/thumb.isa +++ b/src/arch/arm/isa/decoder/thumb.isa @@ -88,7 +88,7 @@ decode BIGTHUMB { 0xf: McrMrc15::mcrMrc15(); } } - 0x3: WarnUnimpl::Advanced_SIMD(); + 0x3: ThumbNeonData::ThumbNeonData(); default: decode LTCOPROC { 0xa, 0xb: ExtensionRegLoadStore::extensionRegLoadStre(); 0xf: decode HTOPCODE_9_4 { diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa index 0a5f77e6e..1482c2119 100644 --- a/src/arch/arm/isa/formats/fp.isa +++ b/src/arch/arm/isa/formats/fp.isa @@ -45,6 +45,52 @@ // Floating Point operate instructions // +output header {{ + + template<template <typename T> class Base> + StaticInstPtr + newNeonMemInst(const unsigned size, + const ExtMachInst &machInst, + const RegIndex dest, const RegIndex ra, + const uint32_t imm, const unsigned extraMemFlags) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, ra, imm, extraMemFlags); + case 1: + return new Base<uint16_t>(machInst, dest, ra, imm, extraMemFlags); + case 2: + return new Base<uint32_t>(machInst, dest, ra, imm, extraMemFlags); + case 3: + return new Base<uint64_t>(machInst, dest, ra, imm, extraMemFlags); + default: + panic("Unrecognized width %d for Neon mem inst.\n", (1 << size)); + } + } + + template<template <typename T> class Base> + StaticInstPtr + newNeonMixInst(const unsigned size, + const ExtMachInst &machInst, + const RegIndex dest, const RegIndex op1, + const uint32_t step) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1, step); + case 1: + return new Base<uint16_t>(machInst, dest, op1, step); + case 2: + return new Base<uint32_t>(machInst, dest, op1, step); + case 3: + return new Base<uint64_t>(machInst, dest, op1, step); + default: + panic("Unrecognized width %d for Neon mem inst.\n", (1 << size)); + } + } + +}}; + let {{ header_output = ''' StaticInstPtr @@ -59,116 +105,233 @@ let {{ decodeNeonMem(ExtMachInst machInst) { const uint32_t b = bits(machInst, 11, 8); - const bool a = bits(machInst, 23); - const bool l = bits(machInst, 21); + const bool single = bits(machInst, 23); + const bool singleAll = single && (bits(b, 3, 2) == 3); + const bool load = bits(machInst, 21); - if (l) { - // Load instructions. - if (a) { - if (bits(b, 3, 2) != 3) { - switch (bits(b, 1, 0)) { - case 0x0: - return new WarnUnimplemented("vld1 single", machInst); - case 0x1: - return new WarnUnimplemented("vld2 single", machInst); - case 0x2: - return new WarnUnimplemented("vld3 single", machInst); - case 0x3: - return new WarnUnimplemented("vld4 single", machInst); - } - } else { - switch (bits(b, 1, 0)) { - case 0x0: - return new WarnUnimplemented("vld1 single all", - machInst); - case 0x1: - return new WarnUnimplemented("vld2 single all", - machInst); - case 0x2: - return new WarnUnimplemented("vld3 single all", - machInst); - case 0x3: - return new WarnUnimplemented("vld4 single all", - machInst); + unsigned width = 0; + + if (single) { + width = bits(b, 1, 0) + 1; + } else { + switch (bits(b, 3, 1)) { + case 0x0: width = 4; + break; + case 0x1: width = (b & 0x1) ? 2 : 1; + break; + case 0x2: width = 3; + break; + case 0x3: width = 1; + break; + case 0x4: width = 2; + break; + case 0x5: + if ((b & 0x1) == 0) { + width = 1; + break; + } + // Fall through on purpose. 
+ default: + return new Unknown(machInst); + } + } + assert(width > 0 && width <= 4); + + const RegIndex rm = (RegIndex)(uint32_t)bits(machInst, 3, 0); + const RegIndex rn = (RegIndex)(uint32_t)bits(machInst, 19, 16); + const RegIndex vd = (RegIndex)(uint32_t)(bits(machInst, 15, 12) | + bits(machInst, 22) << 4); + const uint32_t type = bits(machInst, 11, 8); + uint32_t size = 0; + uint32_t align = 0; + unsigned inc = 1; + unsigned regs = 1; + unsigned lane = 0; + if (single) { + if (singleAll) { + size = bits(machInst, 7, 6); + bool t = bits(machInst, 5); + unsigned eBytes = (1 << size); + align = (eBytes - 1) | TLB::AllowUnaligned; + if (width == 1) { + regs = t ? 2 : 1; + inc = 1; + } else { + regs = width; + inc = t ? 2 : 1; + } + switch (width) { + case 1: + case 2: + if (bits(machInst, 4)) + align = width * eBytes - 1; + break; + case 3: + break; + case 4: + if (size == 3) { + if (bits(machInst, 4) == 0) + return new Unknown(machInst); + size = 2; + align = 0xf; + } else if (size == 2) { + if (bits(machInst, 4)) + align = 7; + } else { + if (bits(machInst, 4)) + align = 4 * eBytes - 1; } + break; } } else { - switch (bits(b, 3, 1)) { - case 0x0: - return new WarnUnimplemented("vld4 multiple", machInst); - case 0x2: - return new WarnUnimplemented("vld3 multiple", machInst); - case 0x3: - return new WarnUnimplemented("vld1 multiple", machInst); - case 0x4: - return new WarnUnimplemented("vld2 multiple", machInst); - case 0x1: - if (b & 0x1) { - return new WarnUnimplemented("vld2 multiple", machInst); - } else { - return new WarnUnimplemented("vld1 multiple", machInst); + size = bits(machInst, 11, 10); + unsigned eBytes = (1 << size); + align = (eBytes - 1) | TLB::AllowUnaligned; + regs = width; + unsigned indexAlign = bits(machInst, 7, 4); + // If width is 1, inc is always 1. That's overridden later. + switch (size) { + case 0: + inc = 1; + lane = bits(indexAlign, 3, 1); + break; + case 1: + inc = bits(indexAlign, 1) ? 2 : 1; + lane = bits(indexAlign, 3, 2); + break; + case 2: + inc = bits(indexAlign, 2) ? 2 : 1; + lane = bits(indexAlign, 3); + break; + } + // Override inc for width of 1. + if (width == 1) { + inc = 1; + } + switch (width) { + case 1: + switch (size) { + case 0: + break; + case 1: + if (bits(indexAlign, 0)) + align = 1; + break; + case 2: + if (bits(indexAlign, 1, 0)) + align = 3; + break; } - case 0x5: - if ((b & 0x1) == 0) { - return new WarnUnimplemented("vld1 multiple", machInst); - } else { + break; + case 2: + if (bits(indexAlign, 0)) + align = (2 * eBytes) - 1; + break; + case 3: + break; + case 4: + switch (size) { + case 0: + case 1: + if (bits(indexAlign, 0)) + align = (4 * eBytes) - 1; + break; + case 2: + if (bits(indexAlign, 0)) + align = (4 << bits(indexAlign, 1, 0)) - 1; break; } + break; } } + if (size == 0x3) { + return new Unknown(machInst); + } } else { - // Store instructions. 
- if (a) { - if (bits(b, 3, 2) != 3) { - switch (bits(b, 1, 0)) { - case 0x0: - return new WarnUnimplemented("vst1 single", machInst); - case 0x1: - return new WarnUnimplemented("vst2 single", machInst); - case 0x2: - return new WarnUnimplemented("vst3 single", machInst); - case 0x3: - return new WarnUnimplemented("vst4 single", machInst); - } - } else { - switch (bits(b, 1, 0)) { - case 0x0: - return new WarnUnimplemented("vst1 single all", - machInst); - case 0x1: - return new WarnUnimplemented("vst2 single all", - machInst); - case 0x2: - return new WarnUnimplemented("vst3 single all", - machInst); - case 0x3: - return new WarnUnimplemented("vst4 single all", - machInst); - } + size = bits(machInst, 7, 6); + align = bits(machInst, 5, 4); + if (align == 0) { + // @align wasn't specified, so alignment can be turned off. + align = ((1 << size) - 1) | TLB::AllowUnaligned; + } else { + align = ((4 << align) - 1); + } + switch (width) { + case 1: + switch (type) { + case 0x7: regs = 1; + break; + case 0xa: regs = 2; + break; + case 0x6: regs = 3; + break; + case 0x2: regs = 4; + break; + default: + return new Unknown(machInst); } + break; + case 2: + // Regs doesn't behave exactly as it does in the manual + // because they loop over regs registers twice and we break + // it down in the macroop. + switch (type) { + case 0x8: regs = 2; inc = 1; + break; + case 0x9: regs = 2; inc = 2; + break; + case 0x3: regs = 4; inc = 2; + break; + default: + return new Unknown(machInst); + } + break; + case 3: + regs = 3; + switch (type) { + case 0x4: inc = 1; + break; + case 0x5: inc = 2;; + break; + default: + return new Unknown(machInst); + } + break; + case 4: + regs = 4; + switch (type) { + case 0: inc = 1; + break; + case 1: inc = 2; + break; + default: + return new Unknown(machInst); + } + break; + } + } + + if (load) { + // Load instructions. + if (single) { + return new VldSingle(machInst, singleAll, width, rn, vd, + regs, inc, size, align, rm, lane); } else { - switch (bits(b, 3, 1)) { - case 0x0: - return new WarnUnimplemented("vst4 multiple", machInst); - case 0x2: - return new WarnUnimplemented("vst3 multiple", machInst); - case 0x3: - return new WarnUnimplemented("vst1 multiple", machInst); - case 0x4: - return new WarnUnimplemented("vst2 multiple", machInst); - case 0x1: - if (b & 0x1) { - return new WarnUnimplemented("vst2 multiple", machInst); - } else { - return new WarnUnimplemented("vst1 multiple", machInst); - } - case 0x5: - if ((b & 0x1) == 0) { - return new WarnUnimplemented("vst1 multiple", machInst); - } else { - break; - } + return new VldMult(machInst, width, rn, vd, + regs, inc, size, align, rm); + } + } else { + // Store instructions. 
+ if (single) { + if (singleAll) { + return new Unknown(machInst); + } else { + return new VstSingle(machInst, false, width, rn, vd, + regs, inc, size, align, rm, lane); } + } else { + return new VstMult(machInst, width, rn, vd, + regs, inc, size, align, rm); } } return new Unknown(machInst); @@ -183,153 +346,243 @@ let {{ const uint32_t a = bits(machInst, 11, 8); const bool b = bits(machInst, 4); const uint32_t c = bits(machInst, 21, 20); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vn = + (IntRegIndex)(2 * (bits(machInst, 19, 16) | + (bits(machInst, 7) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + const unsigned size = bits(machInst, 21, 20); + const bool q = bits(machInst, 6); + if (q && ((vd & 0x1) || (vn & 0x1) || (vm & 0x1))) + return new Unknown(machInst); switch (a) { case 0x0: if (b) { - if (bits(machInst, 9) == 0) { - return new WarnUnimplemented("vhadd", machInst); + if (u) { + return decodeNeonUThreeReg<VqaddUD, VqaddUQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vhsub", machInst); + return decodeNeonSThreeReg<VqaddSD, VqaddSQ>( + q, size, machInst, vd, vn, vm); } } else { - return new WarnUnimplemented("vqadd", machInst); + if (size == 3) + return new Unknown(machInst); + return decodeNeonUSThreeReg<VhaddD, VhaddQ>( + q, u, size, machInst, vd, vn, vm); } case 0x1: if (!b) { - return new WarnUnimplemented("vrhadd", machInst); + return decodeNeonUSThreeReg<VrhaddD, VrhaddQ>( + q, u, size, machInst, vd, vn, vm); } else { if (u) { switch (c) { case 0: - return new WarnUnimplemented("veor", machInst); + if (q) { + return new VeorQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VeorD<uint64_t>(machInst, vd, vn, vm); + } case 1: - return new WarnUnimplemented("vbsl", machInst); + if (q) { + return new VbslQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VbslD<uint64_t>(machInst, vd, vn, vm); + } case 2: - return new WarnUnimplemented("vbit", machInst); + if (q) { + return new VbitQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VbitD<uint64_t>(machInst, vd, vn, vm); + } case 3: - return new WarnUnimplemented("vbif", machInst); + if (q) { + return new VbifQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VbifD<uint64_t>(machInst, vd, vn, vm); + } } } else { switch (c) { case 0: - return new WarnUnimplemented("vand (reg)", machInst); + if (q) { + return new VandQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VandD<uint64_t>(machInst, vd, vn, vm); + } case 1: - return new WarnUnimplemented("vbic (reg)", machInst); + if (q) { + return new VbicQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VbicD<uint64_t>(machInst, vd, vn, vm); + } case 2: - { - const IntRegIndex n = (IntRegIndex)( - (uint32_t)bits(machInst, 19, 16) | - (uint32_t)(bits(machInst, 7) << 4)); - const IntRegIndex m = (IntRegIndex)( - (uint32_t)bits(machInst, 3, 0) | - (uint32_t)(bits(machInst, 5) << 4)); - if (n == m) { - return new WarnUnimplemented("vmov (reg)", - machInst); + if (vn == vm) { + if (q) { + return new VmovQ<uint64_t>( + machInst, vd, vn, vm); + } else { + return new VmovD<uint64_t>( + machInst, vd, vn, vm); + } + } else { + if (q) { + return new VorrQ<uint64_t>( + machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vorr (reg)", - machInst); + return new VorrD<uint64_t>( + machInst, vd, vn, vm); } } case 3: - return new WarnUnimplemented("vorn (reg)", machInst); + if 
(q) { + return new VornQ<uint64_t>( + machInst, vd, vn, vm); + } else { + return new VornD<uint64_t>( + machInst, vd, vn, vm); + } } } } case 0x2: if (b) { - return new WarnUnimplemented("vqsub", machInst); - } else { - if (bits(machInst, 9) == 0) { - return new WarnUnimplemented("vhadd", machInst); + if (u) { + return decodeNeonUThreeReg<VqsubUD, VqsubUQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vhsub", machInst); + return decodeNeonSThreeReg<VqsubSD, VqsubSQ>( + q, size, machInst, vd, vn, vm); } + } else { + if (size == 3) + return new Unknown(machInst); + return decodeNeonUSThreeReg<VhsubD, VhsubQ>( + q, u, size, machInst, vd, vn, vm); } case 0x3: if (b) { - return new WarnUnimplemented("vcge (reg)", machInst); + return decodeNeonUSThreeReg<VcgeD, VcgeQ>( + q, u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vcgt (reg)", machInst); + return decodeNeonUSThreeReg<VcgtD, VcgtQ>( + q, u, size, machInst, vd, vn, vm); } case 0x4: if (b) { - return new WarnUnimplemented("vqshl (reg)", machInst); + if (u) { + return decodeNeonUThreeReg<VqshlUD, VqshlUQ>( + q, size, machInst, vd, vm, vn); + } else { + return decodeNeonSThreeReg<VqshlSD, VqshlSQ>( + q, size, machInst, vd, vm, vn); + } } else { - return new WarnUnimplemented("vshl (reg)", machInst); + return decodeNeonUSThreeReg<VshlD, VshlQ>( + q, u, size, machInst, vd, vm, vn); } case 0x5: if (b) { - return new WarnUnimplemented("vqrshl", machInst); + if (u) { + return decodeNeonUThreeReg<VqrshlUD, VqrshlUQ>( + q, size, machInst, vd, vm, vn); + } else { + return decodeNeonSThreeReg<VqrshlSD, VqrshlSQ>( + q, size, machInst, vd, vm, vn); + } } else { - return new WarnUnimplemented("vrshl", machInst); + return decodeNeonUSThreeReg<VrshlD, VrshlQ>( + q, u, size, machInst, vd, vm, vn); } case 0x6: if (b) { - return new WarnUnimplemented("vmin (int)", machInst); + return decodeNeonUSThreeReg<VminD, VminQ>( + q, u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vmax (int)", machInst); + return decodeNeonUSThreeReg<VmaxD, VmaxQ>( + q, u, size, machInst, vd, vn, vm); } case 0x7: if (b) { - return new WarnUnimplemented("vaba", machInst); + return decodeNeonUSThreeReg<VabaD, VabaQ>( + q, u, size, machInst, vd, vn, vm); } else { if (bits(machInst, 23) == 1) { - if (bits(machInst, 6) == 1) { + if (q) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vabdl (int)", machInst); + return decodeNeonUSThreeUSReg<Vabdl>( + u, size, machInst, vd, vn, vm); } } else { - return new WarnUnimplemented("vabd (int)", machInst); + return decodeNeonUSThreeReg<VabdD, VabdQ>( + q, u, size, machInst, vd, vn, vm); } } case 0x8: if (b) { if (u) { - return new WarnUnimplemented("vceq (reg)", machInst); + return decodeNeonUThreeReg<VceqD, VceqQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vtst", machInst); + return decodeNeonUThreeReg<VtstD, VtstQ>( + q, size, machInst, vd, vn, vm); } } else { if (u) { - return new WarnUnimplemented("vsub (int)", machInst); + return decodeNeonUThreeReg<NVsubD, NVsubQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vadd (int)", machInst); + return decodeNeonUThreeReg<NVaddD, NVaddQ>( + q, size, machInst, vd, vn, vm); } } case 0x9: if (b) { if (u) { - return new WarnUnimplemented("vmul (poly)", machInst); + return decodeNeonUThreeReg<NVmulpD, NVmulpQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vmul (int)", machInst); + return decodeNeonSThreeReg<NVmulD, 
NVmulQ>( + q, size, machInst, vd, vn, vm); } } else { if (u) { - return new WarnUnimplemented("vmls (int)", machInst); + return decodeNeonUSThreeReg<NVmlsD, NVmlsQ>( + q, u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vmla (int)", machInst); + return decodeNeonUSThreeReg<NVmlaD, NVmlaQ>( + q, u, size, machInst, vd, vn, vm); } } case 0xa: if (b) { - return new WarnUnimplemented("vpmin (int)", machInst); + return decodeNeonUSThreeReg<VpminD, VpminQ>( + q, u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vpmax (int)", machInst); + return decodeNeonUSThreeReg<VpmaxD, VpmaxQ>( + q, u, size, machInst, vd, vn, vm); } case 0xb: if (b) { if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vpadd (int)", machInst); + return decodeNeonUThreeReg<NVpaddD, NVpaddQ>( + q, size, machInst, vd, vn, vm); } } else { if (u) { - return new WarnUnimplemented("vqrdmulh", machInst); + return decodeNeonSThreeSReg<VqrdmulhD, VqrdmulhQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vqdmulh", machInst); + return decodeNeonSThreeSReg<VqdmulhD, VqdmulhQ>( + q, size, machInst, vd, vn, vm); } } case 0xc: @@ -338,29 +591,57 @@ let {{ if (b) { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vmul (fp)", machInst); + if (q) { + return new NVmulQFp<float>(machInst, vd, vn, vm); + } else { + return new NVmulDFp<float>(machInst, vd, vn, vm); + } } else { return new Unknown(machInst); } } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vmla (fp)", machInst); + if (q) { + return new NVmlaQFp<float>(machInst, vd, vn, vm); + } else { + return new NVmlaDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vmls (fp)", machInst); + if (q) { + return new NVmlsQFp<float>(machInst, vd, vn, vm); + } else { + return new NVmlsDFp<float>(machInst, vd, vn, vm); + } } } } else { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vpadd (fp)", machInst); + if (q) { + return new VpaddQFp<float>(machInst, vd, vn, vm); + } else { + return new VpaddDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vabd (fp)", machInst); + if (q) { + return new VabdQFp<float>(machInst, vd, vn, vm); + } else { + return new VabdDFp<float>(machInst, vd, vn, vm); + } } } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vadd (fp)", machInst); + if (q) { + return new VaddQFp<float>(machInst, vd, vn, vm); + } else { + return new VaddDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vsub (fp)", machInst); + if (q) { + return new VsubQFp<float>(machInst, vd, vn, vm); + } else { + return new VsubDFp<float>(machInst, vd, vn, vm); + } } } } @@ -368,9 +649,17 @@ let {{ if (b) { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vacge", machInst); + if (q) { + return new VacgeQFp<float>(machInst, vd, vn, vm); + } else { + return new VacgeDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vacgt", machInst); + if (q) { + return new VacgtQFp<float>(machInst, vd, vn, vm); + } else { + return new VacgtDFp<float>(machInst, vd, vn, vm); + } } } else { return new Unknown(machInst); @@ -378,13 +667,25 @@ let {{ } else { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vcge (reg)", machInst); + if (q) { + return new VcgeQFp<float>(machInst, vd, vn, vm); + } else { + return new VcgeDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vcgt (reg)", machInst); + if (q) { + 
return new VcgtQFp<float>(machInst, vd, vn, vm); + } else { + return new VcgtDFp<float>(machInst, vd, vn, vm); + } } } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vceq (reg)", machInst); + if (q) { + return new VceqQFp<float>(machInst, vd, vn, vm); + } else { + return new VceqDFp<float>(machInst, vd, vn, vm); + } } else { return new Unknown(machInst); } @@ -396,23 +697,47 @@ let {{ return new Unknown(machInst); } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vrecps", machInst); + if (q) { + return new VrecpsQFp<float>(machInst, vd, vn, vm); + } else { + return new VrecpsDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vrsqrts", machInst); + if (q) { + return new VrsqrtsQFp<float>(machInst, vd, vn, vm); + } else { + return new VrsqrtsDFp<float>(machInst, vd, vn, vm); + } } } } else { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vpmax (fp)", machInst); + if (q) { + return new VpmaxQFp<float>(machInst, vd, vn, vm); + } else { + return new VpmaxDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vpmin (fp)", machInst); + if (q) { + return new VpminQFp<float>(machInst, vd, vn, vm); + } else { + return new VpminDFp<float>(machInst, vd, vn, vm); + } } } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vmax (fp)", machInst); + if (q) { + return new VmaxQFp<float>(machInst, vd, vn, vm); + } else { + return new VmaxDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vmin (fp)", machInst); + if (q) { + return new VminQFp<float>(machInst, vd, vn, vm); + } else { + return new VminDFp<float>(machInst, vd, vn, vm); + } } } } @@ -423,50 +748,94 @@ let {{ static StaticInstPtr decodeNeonOneRegModImm(ExtMachInst machInst) { + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const bool q = bits(machInst, 6); const bool op = bits(machInst, 5); - const uint32_t cmode = bits(machInst, 11, 8); + const uint8_t cmode = bits(machInst, 11, 8); + const uint8_t imm = ((THUMB ? 
bits(machInst, 28) : + bits(machInst, 24)) << 7) | + (bits(machInst, 18, 16) << 4) | + (bits(machInst, 3, 0) << 0); + const uint64_t bigImm = simd_modified_imm(op, cmode, imm); if (op) { if (bits(cmode, 3) == 0) { if (bits(cmode, 0) == 0) { - return new WarnUnimplemented("vmov (imm)", machInst); + if (q) + return new NVmvniQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmvniD<uint64_t>(machInst, vd, bigImm); } else { - return new WarnUnimplemented("vorr (imm)", machInst); + if (q) + return new NVbiciQ<uint64_t>(machInst, vd, bigImm); + else + return new NVbiciD<uint64_t>(machInst, vd, bigImm); } } else { if (bits(cmode, 2) == 1) { - return new WarnUnimplemented("vmov (imm)", machInst); + switch (bits(cmode, 1, 0)) { + case 0: + case 1: + if (q) + return new NVmvniQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmvniD<uint64_t>(machInst, vd, bigImm); + case 2: + if (q) + return new NVmoviQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmoviD<uint64_t>(machInst, vd, bigImm); + case 3: + if (q) + return new Unknown(machInst); + else + return new Unknown(machInst); + } } else { if (bits(cmode, 0) == 0) { - return new WarnUnimplemented("vmov (imm)", machInst); + if (q) + return new NVmvniQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmvniD<uint64_t>(machInst, vd, bigImm); } else { - return new WarnUnimplemented("vorr (imm)", machInst); + if (q) + return new NVbiciQ<uint64_t>(machInst, vd, bigImm); + else + return new NVbiciD<uint64_t>(machInst, vd, bigImm); } } } } else { if (bits(cmode, 3) == 0) { if (bits(cmode, 0) == 0) { - return new WarnUnimplemented("vmvn (imm)", machInst); + if (q) + return new NVmoviQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmoviD<uint64_t>(machInst, vd, bigImm); } else { - return new WarnUnimplemented("vbic (imm)", machInst); + if (q) + return new NVorriQ<uint64_t>(machInst, vd, bigImm); + else + return new NVorriD<uint64_t>(machInst, vd, bigImm); } } else { if (bits(cmode, 2) == 1) { - switch (bits(cmode, 1, 0)) { - case 0: - case 1: - return new WarnUnimplemented("vmvn (imm)", machInst); - case 2: - return new WarnUnimplemented("vmov (imm)", machInst); - case 3: - return new Unknown(machInst); - } - return new WarnUnimplemented("vmov (imm)", machInst); + if (q) + return new NVmoviQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmoviD<uint64_t>(machInst, vd, bigImm); } else { if (bits(cmode, 0) == 0) { - return new WarnUnimplemented("vmvn (imm)", machInst); + if (q) + return new NVmoviQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmoviD<uint64_t>(machInst, vd, bigImm); } else { - return new WarnUnimplemented("vbic (imm)", machInst); + if (q) + return new NVorriQ<uint64_t>(machInst, vd, bigImm); + else + return new NVorriD<uint64_t>(machInst, vd, bigImm); } } } @@ -481,70 +850,149 @@ let {{ const bool u = THUMB ? bits(machInst, 28) : bits(machInst, 24); const bool b = bits(machInst, 6); const bool l = bits(machInst, 7); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + unsigned imm6 = bits(machInst, 21, 16); + unsigned imm = ((l ? 
1 : 0) << 6) | imm6; + unsigned size = 3; + unsigned lShiftAmt = 0; + unsigned bitSel; + for (bitSel = 1 << 6; true; bitSel >>= 1) { + if (bitSel & imm) + break; + else if (!size) + return new Unknown(machInst); + size--; + } + lShiftAmt = imm6 & ~bitSel; + unsigned rShiftAmt = 0; + if (a != 0xe && a != 0xf) { + if (size > 2) + rShiftAmt = 64 - imm6; + else + rShiftAmt = 2 * (8 << size) - imm6; + } switch (a) { case 0x0: - return new WarnUnimplemented("vshr", machInst); + return decodeNeonUSTwoShiftReg<NVshrD, NVshrQ>( + b, u, size, machInst, vd, vm, rShiftAmt); case 0x1: - return new WarnUnimplemented("vsra", machInst); + return decodeNeonUSTwoShiftReg<NVsraD, NVsraQ>( + b, u, size, machInst, vd, vm, rShiftAmt); case 0x2: - return new WarnUnimplemented("vrshr", machInst); + return decodeNeonUSTwoShiftReg<NVrshrD, NVrshrQ>( + b, u, size, machInst, vd, vm, rShiftAmt); case 0x3: - return new WarnUnimplemented("vrsra", machInst); + return decodeNeonUSTwoShiftReg<NVrsraD, NVrsraQ>( + b, u, size, machInst, vd, vm, rShiftAmt); case 0x4: if (u) { - return new WarnUnimplemented("vsri", machInst); + return decodeNeonUTwoShiftReg<NVsriD, NVsriQ>( + b, size, machInst, vd, vm, rShiftAmt); } else { return new Unknown(machInst); } case 0x5: if (u) { - return new WarnUnimplemented("vsli", machInst); + return decodeNeonUTwoShiftReg<NVsliD, NVsliQ>( + b, size, machInst, vd, vm, lShiftAmt); } else { - return new WarnUnimplemented("vshl (imm)", machInst); + return decodeNeonUTwoShiftReg<NVshlD, NVshlQ>( + b, size, machInst, vd, vm, lShiftAmt); } case 0x6: case 0x7: - return new WarnUnimplemented("vqshl, vqshlu (imm)", machInst); + if (u) { + if (a == 0x6) { + return decodeNeonSTwoShiftReg<NVqshlusD, NVqshlusQ>( + b, size, machInst, vd, vm, lShiftAmt); + } else { + return decodeNeonUTwoShiftReg<NVqshluD, NVqshluQ>( + b, size, machInst, vd, vm, lShiftAmt); + } + } else { + return decodeNeonSTwoShiftReg<NVqshlD, NVqshlQ>( + b, size, machInst, vd, vm, lShiftAmt); + } case 0x8: if (l) { return new Unknown(machInst); } else if (u) { - if (b) { - return new WarnUnimplemented("vqrshrn, vqrshrun", machInst); - } else { - return new WarnUnimplemented("vqshrn, vqshrun", machInst); - } + return decodeNeonSTwoShiftSReg<NVqshruns, NVqrshruns>( + b, size, machInst, vd, vm, rShiftAmt); } else { - if (b) { - return new WarnUnimplemented("vrshrn", machInst); - } else { - return new WarnUnimplemented("vshrn", machInst); - } + return decodeNeonUTwoShiftSReg<NVshrn, NVrshrn>( + b, size, machInst, vd, vm, rShiftAmt); } case 0x9: if (l) { return new Unknown(machInst); - } else if (b) { - return new WarnUnimplemented("vqrshrn, vqrshrun", machInst); + } else if (u) { + return decodeNeonUTwoShiftSReg<NVqshrun, NVqrshrun>( + b, size, machInst, vd, vm, rShiftAmt); } else { - return new WarnUnimplemented("vqshrn, vqshrun", machInst); + return decodeNeonSTwoShiftSReg<NVqshrn, NVqrshrn>( + b, size, machInst, vd, vm, rShiftAmt); } case 0xa: if (l || b) { return new Unknown(machInst); } else { - // If the shift amount is zero, it's vmovl. 
- return new WarnUnimplemented("vshll, vmovl", machInst); + return decodeNeonUSTwoShiftSReg<NVmovl, NVshll>( + lShiftAmt, u, size, machInst, vd, vm, lShiftAmt); } case 0xe: + if (l) { + return new Unknown(machInst); + } else { + if (bits(imm6, 5) == 0) + return new Unknown(machInst); + if (u) { + if (b) { + return new NVcvtu2fpQ<float>( + machInst, vd, vm, 64 - imm6); + } else { + return new NVcvtu2fpD<float>( + machInst, vd, vm, 64 - imm6); + } + } else { + if (b) { + return new NVcvts2fpQ<float>( + machInst, vd, vm, 64 - imm6); + } else { + return new NVcvts2fpD<float>( + machInst, vd, vm, 64 - imm6); + } + } + } case 0xf: if (l) { return new Unknown(machInst); - } else if (a == 0xe) { - return new WarnUnimplemented("vcvt (fixed to fp)", machInst); - } else if (a == 0xf) { - return new WarnUnimplemented("vcvt (fp to fixed)", machInst); + } else { + if (bits(imm6, 5) == 0) + return new Unknown(machInst); + if (u) { + if (b) { + return new NVcvt2ufxQ<float>( + machInst, vd, vm, 64 - imm6); + } else { + return new NVcvt2ufxD<float>( + machInst, vd, vm, 64 - imm6); + } + } else { + if (b) { + return new NVcvt2sfxQ<float>( + machInst, vd, vm, 64 - imm6); + } else { + return new NVcvt2sfxD<float>( + machInst, vd, vm, 64 - imm6); + } + } } } return new Unknown(machInst); @@ -555,74 +1003,89 @@ let {{ { const bool u = THUMB ? bits(machInst, 28) : bits(machInst, 24); const uint32_t a = bits(machInst, 11, 8); - + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vn = + (IntRegIndex)(2 * (bits(machInst, 19, 16) | + (bits(machInst, 7) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + const unsigned size = bits(machInst, 21, 20); switch (a) { case 0x0: - return new WarnUnimplemented("vaddl", machInst); + return decodeNeonUSThreeUSReg<Vaddl>( + u, size, machInst, vd, vn, vm); case 0x1: - return new WarnUnimplemented("vaddw", machInst); + return decodeNeonUSThreeUSReg<Vaddw>( + u, size, machInst, vd, vn, vm); case 0x2: - return new WarnUnimplemented("vsubl", machInst); + return decodeNeonUSThreeUSReg<Vsubl>( + u, size, machInst, vd, vn, vm); case 0x3: - return new WarnUnimplemented("vsubw", machInst); + return decodeNeonUSThreeUSReg<Vsubw>( + u, size, machInst, vd, vn, vm); case 0x4: if (u) { - return new WarnUnimplemented("vraddhn", machInst); + return decodeNeonUThreeUSReg<Vraddhn>( + size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vaddhn", machInst); + return decodeNeonUThreeUSReg<Vaddhn>( + size, machInst, vd, vn, vm); } case 0x5: - return new WarnUnimplemented("vabal", machInst); + return decodeNeonUSThreeUSReg<Vabal>( + u, size, machInst, vd, vn, vm); case 0x6: if (u) { - return new WarnUnimplemented("vrsubhn", machInst); + return decodeNeonUThreeUSReg<Vrsubhn>( + size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vsubhn", machInst); + return decodeNeonUThreeUSReg<Vsubhn>( + size, machInst, vd, vn, vm); } case 0x7: if (bits(machInst, 23)) { - return new WarnUnimplemented("vabdl (int)", machInst); + return decodeNeonUSThreeUSReg<Vabdl>( + u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vabd (int)", machInst); + return decodeNeonUSThreeReg<VabdD, VabdQ>( + bits(machInst, 6), u, size, machInst, vd, vn, vm); } case 0x8: - return new WarnUnimplemented("vmlal (int)", machInst); + return decodeNeonUSThreeUSReg<Vmlal>( + u, size, machInst, vd, vn, vm); case 0xa: - return new WarnUnimplemented("vmlsl (int)", 
machInst); + return decodeNeonUSThreeUSReg<Vmlsl>( + u, size, machInst, vd, vn, vm); case 0x9: - if (bits(machInst, 23) == 0) { - if (bits(machInst, 4) == 0) { - if (u) { - return new WarnUnimplemented("vmls (int)", machInst); - } else { - return new WarnUnimplemented("vmla (int)", machInst); - } - } else { - if (u) { - return new WarnUnimplemented("vmul (poly)", machInst); - } else { - return new WarnUnimplemented("vmul (int)", machInst); - } - } + if (u) { + return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmlal", machInst); + return decodeNeonSThreeUSReg<Vqdmlal>( + size, machInst, vd, vn, vm); } case 0xb: - if (!u) { + if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmlsl", machInst); + return decodeNeonSThreeUSReg<Vqdmlsl>( + size, machInst, vd, vn, vm); } case 0xc: - return new WarnUnimplemented("vmull (int)", machInst); + return decodeNeonUSThreeUSReg<Vmull>( + u, size, machInst, vd, vn, vm); case 0xd: - if (!u) { + if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmull", machInst); + return decodeNeonSThreeUSReg<Vqdmull>( + size, machInst, vd, vn, vm); } case 0xe: - return new WarnUnimplemented("vmull (poly)", machInst); + return decodeNeonUThreeUSReg<Vmullp>( + size, machInst, vd, vn, vm); } return new Unknown(machInst); } @@ -632,48 +1095,256 @@ let {{ { const bool u = THUMB ? bits(machInst, 28) : bits(machInst, 24); const uint32_t a = bits(machInst, 11, 8); - + const unsigned size = bits(machInst, 21, 20); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vn = + (IntRegIndex)(2 * (bits(machInst, 19, 16) | + (bits(machInst, 7) << 4))); + const IntRegIndex vm = (size == 2) ? + (IntRegIndex)(2 * bits(machInst, 3, 0)) : + (IntRegIndex)(2 * bits(machInst, 2, 0)); + const unsigned index = (size == 2) ? 
(unsigned)bits(machInst, 5) : + (bits(machInst, 3) | (bits(machInst, 5) << 1)); switch (a) { case 0x0: - return new WarnUnimplemented("vmla (int scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new VmlasQ<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmlasQ<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VmlasD<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmlasD<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x1: - return new WarnUnimplemented("vmla (fp scalar)", machInst); + if (u) + return new VmlasQFp<float>(machInst, vd, vn, vm, index); + else + return new VmlasDFp<float>(machInst, vd, vn, vm, index); case 0x4: - return new WarnUnimplemented("vmls (int scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new VmlssQ<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmlssQ<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VmlssD<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmlssD<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x5: - return new WarnUnimplemented("vmls (fp scalar)", machInst); + if (u) + return new VmlssQFp<float>(machInst, vd, vn, vm, index); + else + return new VmlssDFp<float>(machInst, vd, vn, vm, index); case 0x2: - return new WarnUnimplemented("vmlal (scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new Vmlals<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmlals<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new Vmlals<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmlals<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x6: - return new WarnUnimplemented("vmlsl (scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new Vmlsls<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmlsls<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new Vmlsls<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmlsls<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x3: if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmlal", machInst); + switch (size) { + case 1: + return new Vqdmlals<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vqdmlals<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } } case 0x7: if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmlsl", machInst); + switch (size) { + case 1: + return new Vqdmlsls<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vqdmlsls<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } } case 0x8: - return new WarnUnimplemented("vmul (int scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new VmulsQ<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmulsQ<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VmulsD<uint16_t>(machInst, vd, vn, vm, 
index); + case 2: + return new VmulsD<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x9: - return new WarnUnimplemented("vmul (fp scalar)", machInst); + if (u) + return new VmulsQFp<float>(machInst, vd, vn, vm, index); + else + return new VmulsDFp<float>(machInst, vd, vn, vm, index); case 0xa: - return new WarnUnimplemented("vmull (scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new Vmulls<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmulls<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new Vmulls<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmulls<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0xb: if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmull", machInst); + if (u) { + switch (size) { + case 1: + return new Vqdmulls<uint16_t>( + machInst, vd, vn, vm, index); + case 2: + return new Vqdmulls<uint32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new Vqdmulls<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new Vqdmulls<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } } case 0xc: - return new WarnUnimplemented("vqdmulh", machInst); + if (u) { + switch (size) { + case 1: + return new VqdmulhsQ<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new VqdmulhsQ<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VqdmulhsD<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new VqdmulhsD<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0xd: - return new WarnUnimplemented("vqrdmulh", machInst); + if (u) { + switch (size) { + case 1: + return new VqrdmulhsQ<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new VqrdmulhsQ<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VqrdmulhsD<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new VqrdmulhsD<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } } return new Unknown(machInst); } @@ -683,85 +1354,234 @@ let {{ { const uint32_t a = bits(machInst, 17, 16); const uint32_t b = bits(machInst, 10, 6); + const bool q = bits(machInst, 6); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + const unsigned size = bits(machInst, 19, 18); switch (a) { case 0x0: switch (bits(b, 4, 1)) { case 0x0: - return new WarnUnimplemented("vrev64", machInst); + switch (size) { + case 0: + if (q) { + return new NVrev64Q<uint8_t>(machInst, vd, vm); + } else { + return new NVrev64D<uint8_t>(machInst, vd, vm); + } + case 1: + if (q) { + return new NVrev64Q<uint16_t>(machInst, vd, vm); + } else { + return new NVrev64D<uint16_t>(machInst, vd, vm); + } + case 2: + if (q) { + return new NVrev64Q<uint32_t>(machInst, vd, vm); + } else { + return new NVrev64D<uint32_t>(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } case 0x1: - return new WarnUnimplemented("vrev32", machInst); + switch (size) { + 
case 0: + if (q) { + return new NVrev32Q<uint8_t>(machInst, vd, vm); + } else { + return new NVrev32D<uint8_t>(machInst, vd, vm); + } + case 1: + if (q) { + return new NVrev32Q<uint16_t>(machInst, vd, vm); + } else { + return new NVrev32D<uint16_t>(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } case 0x2: - return new WarnUnimplemented("vrev16", machInst); + if (size != 0) { + return new Unknown(machInst); + } else if (q) { + return new NVrev16Q<uint8_t>(machInst, vd, vm); + } else { + return new NVrev16D<uint8_t>(machInst, vd, vm); + } case 0x4: + return decodeNeonSTwoMiscSReg<NVpaddlD, NVpaddlQ>( + q, size, machInst, vd, vm); case 0x5: - return new WarnUnimplemented("vpaddl", machInst); + return decodeNeonUTwoMiscSReg<NVpaddlD, NVpaddlQ>( + q, size, machInst, vd, vm); case 0x8: - return new WarnUnimplemented("vcls", machInst); + return decodeNeonSTwoMiscReg<NVclsD, NVclsQ>( + q, size, machInst, vd, vm); case 0x9: - return new WarnUnimplemented("vclz", machInst); + return decodeNeonSTwoMiscReg<NVclzD, NVclzQ>( + q, size, machInst, vd, vm); case 0xa: - return new WarnUnimplemented("vcnt", machInst); + return decodeNeonUTwoMiscReg<NVcntD, NVcntQ>( + q, size, machInst, vd, vm); case 0xb: - return new WarnUnimplemented("vmvn (reg)", machInst); + if (q) + return new NVmvnQ<uint64_t>(machInst, vd, vm); + else + return new NVmvnD<uint64_t>(machInst, vd, vm); case 0xc: + return decodeNeonSTwoMiscSReg<NVpadalD, NVpadalQ>( + q, size, machInst, vd, vm); case 0xd: - return new WarnUnimplemented("vpadal", machInst); + return decodeNeonUTwoMiscSReg<NVpadalD, NVpadalQ>( + q, size, machInst, vd, vm); case 0xe: - return new WarnUnimplemented("vqabs", machInst); + return decodeNeonSTwoMiscReg<NVqabsD, NVqabsQ>( + q, size, machInst, vd, vm); case 0xf: - return new WarnUnimplemented("vqneg", machInst); + return decodeNeonSTwoMiscReg<NVqnegD, NVqnegQ>( + q, size, machInst, vd, vm); default: return new Unknown(machInst); } case 0x1: switch (bits(b, 3, 1)) { case 0x0: - return new WarnUnimplemented("vcgt (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVcgtQFp<float>(machInst, vd, vm); + } else { + return new NVcgtDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVcgtD, NVcgtQ>( + q, size, machInst, vd, vm); + } case 0x1: - return new WarnUnimplemented("vcge (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVcgeQFp<float>(machInst, vd, vm); + } else { + return new NVcgeDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVcgeD, NVcgeQ>( + q, size, machInst, vd, vm); + } case 0x2: - return new WarnUnimplemented("vceq (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVceqQFp<float>(machInst, vd, vm); + } else { + return new NVceqDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVceqD, NVceqQ>( + q, size, machInst, vd, vm); + } case 0x3: - return new WarnUnimplemented("vcle (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVcleQFp<float>(machInst, vd, vm); + } else { + return new NVcleDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVcleD, NVcleQ>( + q, size, machInst, vd, vm); + } case 0x4: - return new WarnUnimplemented("vclt (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVcltQFp<float>(machInst, vd, vm); + } else { + return new NVcltDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVcltD, NVcltQ>( + q, size, machInst, vd, vm); + } case 0x6: - return new 
WarnUnimplemented("vabs (imm #0)", machInst); + if (bits(machInst, 10)) { + if (q) + return new NVabsQFp<float>(machInst, vd, vm); + else + return new NVabsDFp<float>(machInst, vd, vm); + } else { + return decodeNeonSTwoMiscReg<NVabsD, NVabsQ>( + q, size, machInst, vd, vm); + } case 0x7: - return new WarnUnimplemented("vneg (imm #0)", machInst); + if (bits(machInst, 10)) { + if (q) + return new NVnegQFp<float>(machInst, vd, vm); + else + return new NVnegDFp<float>(machInst, vd, vm); + } else { + return decodeNeonSTwoMiscReg<NVnegD, NVnegQ>( + q, size, machInst, vd, vm); + } } case 0x2: switch (bits(b, 4, 1)) { case 0x0: - return new WarnUnimplemented("vswp", machInst); + if (q) + return new NVswpQ<uint64_t>(machInst, vd, vm); + else + return new NVswpD<uint64_t>(machInst, vd, vm); case 0x1: - return new WarnUnimplemented("vtrn", machInst); + return decodeNeonUTwoMiscReg<NVtrnD, NVtrnQ>( + q, size, machInst, vd, vm); case 0x2: - return new WarnUnimplemented("vuzp", machInst); + return decodeNeonUTwoMiscReg<NVuzpD, NVuzpQ>( + q, size, machInst, vd, vm); case 0x3: - return new WarnUnimplemented("vzip", machInst); + return decodeNeonUTwoMiscReg<NVzipD, NVzipQ>( + q, size, machInst, vd, vm); case 0x4: if (b == 0x8) { - return new WarnUnimplemented("vmovn", machInst); + return decodeNeonUTwoMiscUSReg<NVmovn>( + size, machInst, vd, vm); } else { - return new WarnUnimplemented("vqmovun", machInst); + return decodeNeonSTwoMiscUSReg<NVqmovuns>( + size, machInst, vd, vm); } case 0x5: - return new WarnUnimplemented("vqmovn", machInst); + if (q) { + return decodeNeonUTwoMiscUSReg<NVqmovun>( + size, machInst, vd, vm); + } else { + return decodeNeonSTwoMiscUSReg<NVqmovn>( + size, machInst, vd, vm); + } case 0x6: if (b == 0xc) { - return new WarnUnimplemented("vshll", machInst); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + unsigned size = bits(machInst, 19, 18); + return decodeNeonSTwoShiftUSReg<NVshll>( + size, machInst, vd, vm, 8 << size); } else { return new Unknown(machInst); } case 0xc: case 0xe: if (b == 0x18) { - return new WarnUnimplemented("vcvt (single to half)", - machInst); + if (size != 1 || (vm % 2)) + return new Unknown(machInst); + return new NVcvts2h<uint16_t>(machInst, vd, vm); } else if (b == 0x1c) { - return new WarnUnimplemented("vcvt (half to single)", - machInst); + if (size != 1 || (vd % 2)) + return new Unknown(machInst); + return new NVcvth2s<uint16_t>(machInst, vd, vm); } else { return new Unknown(machInst); } @@ -770,11 +1590,75 @@ let {{ } case 0x3: if (bits(b, 4, 3) == 0x3) { - return new WarnUnimplemented("vcvt (fp and int)", machInst); + if ((q && (vd % 2 || vm % 2)) || size != 2) { + return new Unknown(machInst); + } else { + if (bits(b, 2)) { + if (bits(b, 1)) { + if (q) { + return new NVcvt2ufxQ<float>( + machInst, vd, vm, 0); + } else { + return new NVcvt2ufxD<float>( + machInst, vd, vm, 0); + } + } else { + if (q) { + return new NVcvt2sfxQ<float>( + machInst, vd, vm, 0); + } else { + return new NVcvt2sfxD<float>( + machInst, vd, vm, 0); + } + } + } else { + if (bits(b, 1)) { + if (q) { + return new NVcvtu2fpQ<float>( + machInst, vd, vm, 0); + } else { + return new NVcvtu2fpD<float>( + machInst, vd, vm, 0); + } + } else { + if (q) { + return new NVcvts2fpQ<float>( + machInst, vd, vm, 0); + } else { + return new NVcvts2fpD<float>( + machInst, vd, vm, 0); + } + } + } + } } else if ((b & 0x1a) == 0x10) { - return new 
WarnUnimplemented("vrecpe", machInst); + if (bits(b, 2)) { + if (q) { + return new NVrecpeQFp<float>(machInst, vd, vm); + } else { + return new NVrecpeDFp<float>(machInst, vd, vm); + } + } else { + if (q) { + return new NVrecpeQ<uint32_t>(machInst, vd, vm); + } else { + return new NVrecpeD<uint32_t>(machInst, vd, vm); + } + } } else if ((b & 0x1a) == 0x12) { - return new WarnUnimplemented("vrsqrte", machInst); + if (bits(b, 2)) { + if (q) { + return new NVrsqrteQFp<float>(machInst, vd, vm); + } else { + return new NVrsqrteDFp<float>(machInst, vd, vm); + } + } else { + if (q) { + return new NVrsqrteQ<uint32_t>(machInst, vd, vm); + } else { + return new NVrsqrteD<uint32_t>(machInst, vd, vm); + } + } } else { return new Unknown(machInst); } @@ -799,29 +1683,76 @@ let {{ } } else if ((c & 0x9) == 9) { return decodeNeonTwoRegAndShift(machInst); - } else if ((c & 0x5) == 0) { - if (bits(a, 3, 2) != 0x3) { + } else if (bits(a, 2, 1) != 0x3) { + if ((c & 0x5) == 0) { return decodeNeonThreeRegDiffLengths(machInst); - } - } else if ((c & 0x5) == 4) { - if (bits(a, 3, 2) != 0x3) { + } else if ((c & 0x5) == 4) { return decodeNeonTwoRegScalar(machInst); } } else if ((a & 0x16) == 0x16) { + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vn = + (IntRegIndex)(2 * (bits(machInst, 19, 16) | + (bits(machInst, 7) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); if (!u) { if (bits(c, 0) == 0) { - return new WarnUnimplemented("vext", machInst); + unsigned imm4 = bits(machInst, 11, 8); + bool q = bits(machInst, 6); + if (imm4 >= 16 && !q) + return new Unknown(machInst); + if (q) { + return new NVextQ<uint8_t>(machInst, vd, vn, vm, imm4); + } else { + return new NVextD<uint8_t>(machInst, vd, vn, vm, imm4); + } } } else if (bits(b, 3) == 0 && bits(c, 0) == 0) { return decodeNeonTwoRegMisc(machInst); } else if (bits(b, 3, 2) == 0x2 && bits(c, 0) == 0) { + unsigned length = bits(machInst, 9, 8) + 1; + if ((uint32_t)vn / 2 + length > 32) + return new Unknown(machInst); if (bits(machInst, 6) == 0) { - return new WarnUnimplemented("vtbl", machInst); + switch (length) { + case 1: + return new NVtbl1(machInst, vd, vn, vm); + case 2: + return new NVtbl2(machInst, vd, vn, vm); + case 3: + return new NVtbl3(machInst, vd, vn, vm); + case 4: + return new NVtbl4(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vtbx", machInst); + switch (length) { + case 1: + return new NVtbx1(machInst, vd, vn, vm); + case 2: + return new NVtbx2(machInst, vd, vn, vm); + case 3: + return new NVtbx3(machInst, vd, vn, vm); + case 4: + return new NVtbx4(machInst, vd, vn, vm); + } } } else if (b == 0xc && (c & 0x9) == 0) { - return new WarnUnimplemented("vdup (scalar)", machInst); + unsigned imm4 = bits(machInst, 19, 16); + if (bits(imm4, 2, 0) == 0) + return new Unknown(machInst); + unsigned size = 0; + while ((imm4 & 0x1) == 0) { + size++; + imm4 >>= 1; + } + unsigned index = imm4 >> 1; + const bool q = bits(machInst, 6); + return decodeNeonUTwoShiftSReg<NVdupD, NVdupQ>( + q, size, machInst, vd, vm, index); } } return new Unknown(machInst); @@ -837,7 +1768,7 @@ def format ThumbNeonMem() {{ def format ThumbNeonData() {{ decode_block = ''' - return decodeNeonMem(machInst); + return decodeNeonData(machInst); ''' }}; @@ -893,7 +1824,7 @@ let {{ break; case 0x1: { - if (offset == 0 || vd + offset > NumFloatArchRegs) { + if (offset == 0 || vd + offset/2 > NumFloatArchRegs) { break; } switch (bits(opcode, 
1, 0)) { @@ -1044,40 +1975,51 @@ let {{ if (bits(a, 2) == 0) { uint32_t vd = (bits(machInst, 7) << 5) | (bits(machInst, 19, 16) << 1); - uint32_t index, size; + // Handle accessing each single precision half of the vector. + vd += bits(machInst, 21); const IntRegIndex rt = (IntRegIndex)(uint32_t)bits(machInst, 15, 12); if (bits(machInst, 22) == 1) { - size = 8; - index = (bits(machInst, 21) << 2) | - bits(machInst, 6, 5); + return new VmovCoreRegB(machInst, (IntRegIndex)vd, + rt, bits(machInst, 6, 5)); } else if (bits(machInst, 5) == 1) { - size = 16; - index = (bits(machInst, 21) << 1) | - bits(machInst, 6); + return new VmovCoreRegH(machInst, (IntRegIndex)vd, + rt, bits(machInst, 6)); } else if (bits(machInst, 6) == 0) { - size = 32; - index = bits(machInst, 21); + return new VmovCoreRegW(machInst, (IntRegIndex)vd, rt); } else { return new Unknown(machInst); } - if (index >= (32 / size)) { - index -= (32 / size); - vd++; - } - switch (size) { - case 8: - return new VmovCoreRegB(machInst, (IntRegIndex)vd, - rt, index); - case 16: - return new VmovCoreRegH(machInst, (IntRegIndex)vd, - rt, index); - case 32: - return new VmovCoreRegW(machInst, (IntRegIndex)vd, rt); - } } else if (bits(b, 1) == 0) { - // A8-594 - return new WarnUnimplemented("vdup", machInst); + bool q = bits(machInst, 21); + unsigned be = (bits(machInst, 22) << 1) | (bits(machInst, 5)); + IntRegIndex vd = (IntRegIndex)(2 * (uint32_t) + (bits(machInst, 19, 16) | (bits(machInst, 7) << 4))); + IntRegIndex rt = (IntRegIndex)(uint32_t) + bits(machInst, 15, 12); + if (q) { + switch (be) { + case 0: + return new NVdupQGpr<uint32_t>(machInst, vd, rt); + case 1: + return new NVdupQGpr<uint16_t>(machInst, vd, rt); + case 2: + return new NVdupQGpr<uint8_t>(machInst, vd, rt); + case 3: + return new Unknown(machInst); + } + } else { + switch (be) { + case 0: + return new NVdupDGpr<uint32_t>(machInst, vd, rt); + case 1: + return new NVdupDGpr<uint16_t>(machInst, vd, rt); + case 2: + return new NVdupDGpr<uint8_t>(machInst, vd, rt); + case 3: + return new Unknown(machInst); + } + } } } else if (l == 1 && c == 0) { if (a == 0) { @@ -1128,30 +2070,14 @@ let {{ } else { uint32_t vd = (bits(machInst, 7) << 5) | (bits(machInst, 19, 16) << 1); - uint32_t index, size; + // Handle indexing into each single precision half of the vector. 
+ vd += bits(machInst, 21); + uint32_t index; const IntRegIndex rt = (IntRegIndex)(uint32_t)bits(machInst, 15, 12); const bool u = (bits(machInst, 23) == 1); if (bits(machInst, 22) == 1) { - size = 8; - index = (bits(machInst, 21) << 2) | - bits(machInst, 6, 5); - } else if (bits(machInst, 5) == 1) { - size = 16; - index = (bits(machInst, 21) << 1) | - bits(machInst, 6); - } else if (bits(machInst, 6) == 0 && !u) { - size = 32; - index = bits(machInst, 21); - } else { - return new Unknown(machInst); - } - if (index >= (32 / size)) { - index -= (32 / size); - vd++; - } - switch (size) { - case 8: + index = bits(machInst, 6, 5); if (u) { return new VmovRegCoreUB(machInst, rt, (IntRegIndex)vd, index); @@ -1159,7 +2085,8 @@ let {{ return new VmovRegCoreSB(machInst, rt, (IntRegIndex)vd, index); } - case 16: + } else if (bits(machInst, 5) == 1) { + index = bits(machInst, 6); if (u) { return new VmovRegCoreUH(machInst, rt, (IntRegIndex)vd, index); @@ -1167,8 +2094,10 @@ let {{ return new VmovRegCoreSH(machInst, rt, (IntRegIndex)vd, index); } - case 32: + } else if (bits(machInst, 6) == 0 && !u) { return new VmovRegCoreW(machInst, rt, (IntRegIndex)vd); + } else { + return new Unknown(machInst); } } return new Unknown(machInst); diff --git a/src/arch/arm/isa/insts/fp.isa b/src/arch/arm/isa/insts/fp.isa index c4682b66c..9748c8a49 100644 --- a/src/arch/arm/isa/insts/fp.isa +++ b/src/arch/arm/isa/insts/fp.isa @@ -282,7 +282,7 @@ let {{ exec_output += PredOpExecute.subst(vmovRegQIop); vmovCoreRegBCode = ''' - FpDest.uw = insertBits(FpDest.uw, imm * 8, imm * 8 + 7, Op1.ub); + FpDest.uw = insertBits(FpDest.uw, imm * 8 + 7, imm * 8, Op1.ub); ''' vmovCoreRegBIop = InstObjParams("vmov", "VmovCoreRegB", "FpRegRegImmOp", { "code": vmovCoreRegBCode, @@ -292,7 +292,7 @@ let {{ exec_output += PredOpExecute.subst(vmovCoreRegBIop); vmovCoreRegHCode = ''' - FpDest.uw = insertBits(FpDest.uw, imm * 16, imm * 16 + 15, Op1.uh); + FpDest.uw = insertBits(FpDest.uw, imm * 16 + 15, imm * 16, Op1.uh); ''' vmovCoreRegHIop = InstObjParams("vmov", "VmovCoreRegH", "FpRegRegImmOp", { "code": vmovCoreRegHCode, @@ -312,7 +312,8 @@ let {{ exec_output += PredOpExecute.subst(vmovCoreRegWIop); vmovRegCoreUBCode = ''' - Dest = bits(FpOp1.uw, imm * 8, imm * 8 + 7); + assert(imm < 4); + Dest = bits(FpOp1.uw, imm * 8 + 7, imm * 8); ''' vmovRegCoreUBIop = InstObjParams("vmov", "VmovRegCoreUB", "FpRegRegImmOp", { "code": vmovRegCoreUBCode, @@ -322,7 +323,8 @@ let {{ exec_output += PredOpExecute.subst(vmovRegCoreUBIop); vmovRegCoreUHCode = ''' - Dest = bits(FpOp1.uw, imm * 16, imm * 16 + 15); + assert(imm < 2); + Dest = bits(FpOp1.uw, imm * 16 + 15, imm * 16); ''' vmovRegCoreUHIop = InstObjParams("vmov", "VmovRegCoreUH", "FpRegRegImmOp", { "code": vmovRegCoreUHCode, @@ -332,7 +334,8 @@ let {{ exec_output += PredOpExecute.subst(vmovRegCoreUHIop); vmovRegCoreSBCode = ''' - Dest = sext<8>(bits(FpOp1.uw, imm * 8, imm * 8 + 7)); + assert(imm < 4); + Dest = sext<8>(bits(FpOp1.uw, imm * 8 + 7, imm * 8)); ''' vmovRegCoreSBIop = InstObjParams("vmov", "VmovRegCoreSB", "FpRegRegImmOp", { "code": vmovRegCoreSBCode, @@ -342,7 +345,8 @@ let {{ exec_output += PredOpExecute.subst(vmovRegCoreSBIop); vmovRegCoreSHCode = ''' - Dest = sext<16>(bits(FpOp1.uw, imm * 16, imm * 16 + 15)); + assert(imm < 2); + Dest = sext<16>(bits(FpOp1.uw, imm * 16 + 15, imm * 16)); ''' vmovRegCoreSHIop = InstObjParams("vmov", "VmovRegCoreSH", "FpRegRegImmOp", { "code": vmovRegCoreSHCode, @@ -396,7 +400,7 @@ let {{ Fpscr = fpscr; ''' singleBinOp = "binaryOp(fpscr, FpOp1, FpOp2," + 
\ - "%(func)s, fpscr.fz, fpscr.rMode)" + "%(func)s, fpscr.fz, fpscr.dn, fpscr.rMode)" singleUnaryOp = "unaryOp(fpscr, FpOp1, %(func)s, fpscr.fz, fpscr.rMode)" doubleCode = ''' FPSCR fpscr = Fpscr; @@ -408,7 +412,7 @@ let {{ doubleBinOp = ''' binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - %(func)s, fpscr.fz, fpscr.rMode); + %(func)s, fpscr.fz, fpscr.dn, fpscr.rMode); ''' doubleUnaryOp = ''' unaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), %(func)s, @@ -499,8 +503,9 @@ let {{ vmlaSCode = ''' FPSCR fpscr = Fpscr; float mid = binaryOp(fpscr, FpOp1, FpOp2, - fpMulS, fpscr.fz, fpscr.rMode); - FpDest = binaryOp(fpscr, FpDest, mid, fpAddS, fpscr.fz, fpscr.rMode); + fpMulS, fpscr.fz, fpscr.dn, fpscr.rMode); + FpDest = binaryOp(fpscr, FpDest, mid, fpAddS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vmlaSIop = InstObjParams("vmlas", "VmlaS", "FpRegRegRegOp", @@ -514,9 +519,10 @@ let {{ FPSCR fpscr = Fpscr; double mid = binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, fpscr.rMode); double dest = binaryOp(fpscr, dbl(FpDestP0.uw, FpDestP1.uw), - mid, fpAddD, fpscr.fz, fpscr.rMode); + mid, fpAddD, fpscr.fz, + fpscr.dn, fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -531,8 +537,9 @@ let {{ vmlsSCode = ''' FPSCR fpscr = Fpscr; float mid = binaryOp(fpscr, FpOp1, FpOp2, - fpMulS, fpscr.fz, fpscr.rMode); - FpDest = binaryOp(fpscr, FpDest, -mid, fpAddS, fpscr.fz, fpscr.rMode); + fpMulS, fpscr.fz, fpscr.dn, fpscr.rMode); + FpDest = binaryOp(fpscr, FpDest, -mid, fpAddS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vmlsSIop = InstObjParams("vmlss", "VmlsS", "FpRegRegRegOp", @@ -546,9 +553,10 @@ let {{ FPSCR fpscr = Fpscr; double mid = binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, fpscr.rMode); double dest = binaryOp(fpscr, dbl(FpDestP0.uw, FpDestP1.uw), - -mid, fpAddD, fpscr.fz, fpscr.rMode); + -mid, fpAddD, fpscr.fz, + fpscr.dn, fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -563,8 +571,9 @@ let {{ vnmlaSCode = ''' FPSCR fpscr = Fpscr; float mid = binaryOp(fpscr, FpOp1, FpOp2, - fpMulS, fpscr.fz, fpscr.rMode); - FpDest = binaryOp(fpscr, -FpDest, -mid, fpAddS, fpscr.fz, fpscr.rMode); + fpMulS, fpscr.fz, fpscr.dn, fpscr.rMode); + FpDest = binaryOp(fpscr, -FpDest, -mid, fpAddS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vnmlaSIop = InstObjParams("vnmlas", "VnmlaS", "FpRegRegRegOp", @@ -578,9 +587,10 @@ let {{ FPSCR fpscr = Fpscr; double mid = binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, fpscr.rMode); double dest = binaryOp(fpscr, -dbl(FpDestP0.uw, FpDestP1.uw), - -mid, fpAddD, fpscr.fz, fpscr.rMode); + -mid, fpAddD, fpscr.fz, + fpscr.dn, fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -595,8 +605,9 @@ let {{ vnmlsSCode = ''' FPSCR fpscr = Fpscr; float mid = binaryOp(fpscr, FpOp1, FpOp2, - fpMulS, fpscr.fz, fpscr.rMode); - FpDest = binaryOp(fpscr, -FpDest, mid, fpAddS, fpscr.fz, fpscr.rMode); + fpMulS, fpscr.fz, fpscr.dn, fpscr.rMode); + FpDest = binaryOp(fpscr, -FpDest, mid, fpAddS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vnmlsSIop = InstObjParams("vnmlss", "VnmlsS", "FpRegRegRegOp", @@ -610,9 +621,10 @@ let {{ FPSCR fpscr = Fpscr; double mid = 
binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, fpscr.rMode); double dest = binaryOp(fpscr, -dbl(FpDestP0.uw, FpDestP1.uw), - mid, fpAddD, fpscr.fz, fpscr.rMode); + mid, fpAddD, fpscr.fz, + fpscr.dn, fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -626,7 +638,8 @@ let {{ vnmulSCode = ''' FPSCR fpscr = Fpscr; - FpDest = -binaryOp(fpscr, FpOp1, FpOp2, fpMulS, fpscr.fz, fpscr.rMode); + FpDest = -binaryOp(fpscr, FpOp1, FpOp2, fpMulS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vnmulSIop = InstObjParams("vnmuls", "VnmulS", "FpRegRegRegOp", @@ -640,7 +653,8 @@ let {{ FPSCR fpscr = Fpscr; double dest = -binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, + fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -665,7 +679,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1.uw) : "m" (FpOp1.uw)); FpDest = FpOp1.uw; __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtUIntFpSIop = InstObjParams("vcvt", "VcvtUIntFpS", "FpRegRegOp", @@ -681,7 +695,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1P0.uw) : "m" (FpOp1P0.uw)); double cDest = (uint64_t)FpOp1P0.uw; __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -699,7 +713,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1.sw) : "m" (FpOp1.sw)); FpDest = FpOp1.sw; __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtSIntFpSIop = InstObjParams("vcvt", "VcvtSIntFpS", "FpRegRegOp", @@ -715,7 +729,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1P0.sw) : "m" (FpOp1P0.sw)); double cDest = FpOp1P0.sw; __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -734,7 +748,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uw = vfpFpSToFixed(FpOp1, false, false, 0, false); __asm__ __volatile__("" :: "m" (FpDest.uw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpUIntSRIop = InstObjParams("vcvt", "VcvtFpUIntSR", "FpRegRegOp", @@ -752,7 +766,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t result = vfpFpDToFixed(cOp1, false, false, 0, false); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; ''' @@ -770,7 +784,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sw = vfpFpSToFixed(FpOp1, true, false, 0, false); __asm__ __volatile__("" :: "m" (FpDest.sw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSIntSRIop = InstObjParams("vcvtr", "VcvtFpSIntSR", "FpRegRegOp", @@ -788,7 +802,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); int64_t result = vfpFpDToFixed(cOp1, true, false, 0, false); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; ''' @@ -807,7 +821,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uw = vfpFpSToFixed(FpOp1, false, false, 
0); __asm__ __volatile__("" :: "m" (FpDest.uw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpUIntSIop = InstObjParams("vcvt", "VcvtFpUIntS", "FpRegRegOp", @@ -826,7 +840,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t result = vfpFpDToFixed(cOp1, false, false, 0); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; ''' @@ -845,7 +859,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sw = vfpFpSToFixed(FpOp1, true, false, 0); __asm__ __volatile__("" :: "m" (FpDest.sw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSIntSIop = InstObjParams("vcvt", "VcvtFpSIntS", "FpRegRegOp", @@ -864,7 +878,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); int64_t result = vfpFpDToFixed(cOp1, true, false, 0); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; ''' @@ -882,7 +896,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); double cDest = fixFpSFpDDest(Fpscr, FpOp1); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -902,7 +916,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); FpDest = fixFpDFpSDest(Fpscr, cOp1); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpDFpSIop = InstObjParams("vcvt", "VcvtFpDFpS", "FpRegRegOp", @@ -917,9 +931,10 @@ let {{ vfpFlushToZero(fpscr, FpOp1); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); - FpDest = vcvtFpHFpS(fpscr, FpOp1, true); + FpDest = vcvtFpHFpS(fpscr, fpscr.dn, fpscr.ahp, + bits(fpToBits(FpOp1), 31, 16)); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpHTFpSIop = InstObjParams("vcvtt", "VcvtFpHTFpS", "FpRegRegOp", @@ -933,9 +948,10 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); - FpDest = vcvtFpHFpS(fpscr, FpOp1, false); + FpDest = vcvtFpHFpS(fpscr, fpscr.dn, fpscr.ahp, + bits(fpToBits(FpOp1), 15, 0)); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpHBFpSIop = InstObjParams("vcvtb", "VcvtFpHBFpS", "FpRegRegOp", @@ -949,11 +965,13 @@ let {{ FPSCR fpscr = Fpscr; vfpFlushToZero(fpscr, FpOp1); VfpSavedState state = prepFpState(fpscr.rMode); - __asm__ __volatile__("" : "=m" (FpOp1), "=m" (FpDest) - : "m" (FpOp1), "m" (FpDest)); - FpDest = vcvtFpSFpH(fpscr, FpOp1, FpDest, true); - __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + __asm__ __volatile__("" : "=m" (FpOp1), "=m" (FpDest.uw) + : "m" (FpOp1), "m" (FpDest.uw)); + FpDest.uw = insertBits(FpDest.uw, 31, 16, + vcvtFpSFpH(fpscr, fpscr.fz, fpscr.dn, + fpscr.rMode, fpscr.ahp, FpOp1)); + __asm__ __volatile__("" :: "m" (FpDest.uw)); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSFpHTIop = InstObjParams("vcvtt", "VcvtFpSFpHT", "FpRegRegOp", @@ -967,11 +985,13 @@ let {{ FPSCR fpscr = Fpscr; vfpFlushToZero(fpscr, FpOp1); VfpSavedState state = prepFpState(fpscr.rMode); - __asm__ __volatile__("" : "=m" (FpOp1), 
"=m" (FpDest) - : "m" (FpOp1), "m" (FpDest)); - FpDest = vcvtFpSFpH(fpscr, FpOp1, FpDest, false); - __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + __asm__ __volatile__("" : "=m" (FpOp1), "=m" (FpDest.uw) + : "m" (FpOp1), "m" (FpDest.uw)); + FpDest.uw = insertBits(FpDest.uw, 15, 0, + vcvtFpSFpH(fpscr, fpscr.fz, fpscr.dn, + fpscr.rMode, fpscr.ahp, FpOp1)); + __asm__ __volatile__("" :: "m" (FpDest.uw)); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSFpHBIop = InstObjParams("vcvtb", "VcvtFpSFpHB", "FpRegRegOp", @@ -1201,7 +1221,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sw = vfpFpSToFixed(FpOp1, true, false, imm); __asm__ __volatile__("" :: "m" (FpDest.sw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSFixedSIop = InstObjParams("vcvt", "VcvtFpSFixedS", "FpRegRegImmOp", @@ -1219,7 +1239,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t mid = vfpFpDToFixed(cOp1, true, false, imm); __asm__ __volatile__("" :: "m" (mid)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = mid; FpDestP1.uw = mid >> 32; @@ -1238,7 +1258,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uw = vfpFpSToFixed(FpOp1, false, false, imm); __asm__ __volatile__("" :: "m" (FpDest.uw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpUFixedSIop = InstObjParams("vcvt", "VcvtFpUFixedS", "FpRegRegImmOp", @@ -1256,7 +1276,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t mid = vfpFpDToFixed(cOp1, false, false, imm); __asm__ __volatile__("" :: "m" (mid)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = mid; FpDestP1.uw = mid >> 32; @@ -1272,9 +1292,9 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1.sw) : "m" (FpOp1.sw)); - FpDest = vfpSFixedToFpS(Fpscr, FpOp1.sw, false, imm); + FpDest = vfpSFixedToFpS(fpscr.fz, fpscr.dn, FpOp1.sw, false, imm); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtSFixedFpSIop = InstObjParams("vcvt", "VcvtSFixedFpS", "FpRegRegImmOp", @@ -1289,9 +1309,9 @@ let {{ uint64_t mid = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (mid) : "m" (mid)); - double cDest = vfpSFixedToFpD(Fpscr, mid, false, imm); + double cDest = vfpSFixedToFpD(fpscr.fz, fpscr.dn, mid, false, imm); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -1307,9 +1327,9 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1.uw) : "m" (FpOp1.uw)); - FpDest = vfpUFixedToFpS(Fpscr, FpOp1.uw, false, imm); + FpDest = vfpUFixedToFpS(fpscr.fz, fpscr.dn, FpOp1.uw, false, imm); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtUFixedFpSIop = InstObjParams("vcvt", "VcvtUFixedFpS", "FpRegRegImmOp", @@ -1324,9 +1344,9 @@ let {{ uint64_t mid = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (mid) : "m" (mid)); - double cDest = vfpUFixedToFpD(Fpscr, mid, false, imm); 
+ double cDest = vfpUFixedToFpD(fpscr.fz, fpscr.dn, mid, false, imm); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -1345,7 +1365,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sh = vfpFpSToFixed(FpOp1, true, true, imm); __asm__ __volatile__("" :: "m" (FpDest.sh)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSHFixedSIop = InstObjParams("vcvt", "VcvtFpSHFixedS", @@ -1364,7 +1384,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t result = vfpFpDToFixed(cOp1, true, true, imm); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; FpDestP1.uw = result >> 32; @@ -1384,7 +1404,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uh = vfpFpSToFixed(FpOp1, false, true, imm); __asm__ __volatile__("" :: "m" (FpDest.uh)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpUHFixedSIop = InstObjParams("vcvt", "VcvtFpUHFixedS", @@ -1403,7 +1423,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t mid = vfpFpDToFixed(cOp1, false, true, imm); __asm__ __volatile__("" :: "m" (mid)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = mid; FpDestP1.uw = mid >> 32; @@ -1420,9 +1440,9 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1.sh) : "m" (FpOp1.sh)); - FpDest = vfpSFixedToFpS(Fpscr, FpOp1.sh, true, imm); + FpDest = vfpSFixedToFpS(fpscr.fz, fpscr.dn, FpOp1.sh, true, imm); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtSHFixedFpSIop = InstObjParams("vcvt", "VcvtSHFixedFpS", @@ -1438,9 +1458,9 @@ let {{ uint64_t mid = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (mid) : "m" (mid)); - double cDest = vfpSFixedToFpD(Fpscr, mid, true, imm); + double cDest = vfpSFixedToFpD(fpscr.fz, fpscr.dn, mid, true, imm); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -1457,9 +1477,9 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1.uh) : "m" (FpOp1.uh)); - FpDest = vfpUFixedToFpS(Fpscr, FpOp1.uh, true, imm); + FpDest = vfpUFixedToFpS(fpscr.fz, fpscr.dn, FpOp1.uh, true, imm); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtUHFixedFpSIop = InstObjParams("vcvt", "VcvtUHFixedFpS", @@ -1475,9 +1495,9 @@ let {{ uint64_t mid = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (mid) : "m" (mid)); - double cDest = vfpUFixedToFpD(Fpscr, mid, true, imm); + double cDest = vfpUFixedToFpD(fpscr.fz, fpscr.dn, mid, true, imm); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); diff --git a/src/arch/arm/isa/insts/insts.isa b/src/arch/arm/isa/insts/insts.isa index a79557f3d..9c51f3cf0 100644 --- 
a/src/arch/arm/isa/insts/insts.isa +++ b/src/arch/arm/isa/insts/insts.isa @@ -70,5 +70,8 @@ //Divide ##include "div.isa" -//FP (VFP and Neon) +//VFP ##include "fp.isa" + +//Neon +##include "neon.isa" diff --git a/src/arch/arm/isa/insts/macromem.isa b/src/arch/arm/isa/insts/macromem.isa index ca2c7c6ab..652a929f1 100644 --- a/src/arch/arm/isa/insts/macromem.isa +++ b/src/arch/arm/isa/insts/macromem.isa @@ -57,11 +57,34 @@ let {{ microLdrFpUopCode = "Fa.uw = cSwap(Mem.uw, ((CPSR)Cpsr).e);" microLdrFpUopIop = InstObjParams('ldrfp_uop', 'MicroLdrFpUop', - 'MicroMemOp', - {'memacc_code': microLdrFpUopCode, - 'ea_code': 'EA = Rb + (up ? imm : -imm);', - 'predicate_test': predicateTest}, - ['IsMicroop']) + 'MicroMemOp', + {'memacc_code': microLdrFpUopCode, + 'ea_code': + 'EA = Rb + (up ? imm : -imm);', + 'predicate_test': predicateTest}, + ['IsMicroop']) + + microLdrDBFpUopCode = "Fa.uw = cSwap(Mem.uw, ((CPSR)Cpsr).e);" + microLdrDBFpUopIop = InstObjParams('ldrfp_uop', 'MicroLdrDBFpUop', + 'MicroMemOp', + {'memacc_code': microLdrFpUopCode, + 'ea_code': ''' + EA = Rb + (up ? imm : -imm) + + (((CPSR)Cpsr).e ? 4 : 0); + ''', + 'predicate_test': predicateTest}, + ['IsMicroop']) + + microLdrDTFpUopCode = "Fa.uw = cSwap(Mem.uw, ((CPSR)Cpsr).e);" + microLdrDTFpUopIop = InstObjParams('ldrfp_uop', 'MicroLdrDTFpUop', + 'MicroMemOp', + {'memacc_code': microLdrFpUopCode, + 'ea_code': ''' + EA = Rb + (up ? imm : -imm) - + (((CPSR)Cpsr).e ? 4 : 0); + ''', + 'predicate_test': predicateTest}, + ['IsMicroop']) microLdrRetUopCode = ''' CPSR cpsr = Cpsr; @@ -98,10 +121,36 @@ let {{ 'predicate_test': predicateTest}, ['IsMicroop']) + microStrDBFpUopCode = "Mem = cSwap(Fa.uw, ((CPSR)Cpsr).e);" + microStrDBFpUopIop = InstObjParams('strfp_uop', 'MicroStrDBFpUop', + 'MicroMemOp', + {'memacc_code': microStrFpUopCode, + 'postacc_code': "", + 'ea_code': ''' + EA = Rb + (up ? imm : -imm) + + (((CPSR)Cpsr).e ? 4 : 0); + ''', + 'predicate_test': predicateTest}, + ['IsMicroop']) + + microStrDTFpUopCode = "Mem = cSwap(Fa.uw, ((CPSR)Cpsr).e);" + microStrDTFpUopIop = InstObjParams('strfp_uop', 'MicroStrDTFpUop', + 'MicroMemOp', + {'memacc_code': microStrFpUopCode, + 'postacc_code': "", + 'ea_code': ''' + EA = Rb + (up ? imm : -imm) - + (((CPSR)Cpsr).e ? 4 : 0); + ''', + 'predicate_test': predicateTest}, + ['IsMicroop']) + header_output = decoder_output = exec_output = '' - loadIops = (microLdrUopIop, microLdrFpUopIop, microLdrRetUopIop) - storeIops = (microStrUopIop, microStrFpUopIop) + loadIops = (microLdrUopIop, microLdrRetUopIop, + microLdrFpUopIop, microLdrDBFpUopIop, microLdrDTFpUopIop) + storeIops = (microStrUopIop, microStrFpUopIop, + microStrDBFpUopIop, microStrDTFpUopIop) for iop in loadIops + storeIops: header_output += MicroMemDeclare.subst(iop) decoder_output += MicroMemConstructor.subst(iop) @@ -115,6 +164,403 @@ let {{ StoreCompleteAcc.subst(iop) }}; +let {{ + exec_output = header_output = '' + + eaCode = 'EA = Ra + imm;' + + for size in (1, 2, 3, 4, 6, 8, 12, 16): + # Set up the memory access. + regs = (size + 3) // 4 + subst = { "size" : size, "regs" : regs } + memDecl = ''' + union MemUnion { + uint8_t bytes[%(size)d]; + Element elements[%(size)d / sizeof(Element)]; + uint32_t floatRegBits[%(regs)d]; + }; + ''' % subst + + # Do endian conversion for all the elements. 
+ convCode = ''' + const unsigned eCount = sizeof(memUnion.elements) / + sizeof(memUnion.elements[0]); + if (((CPSR)Cpsr).e) { + for (unsigned i = 0; i < eCount; i++) { + memUnion.elements[i] = gtobe(memUnion.elements[i]); + } + } else { + for (unsigned i = 0; i < eCount; i++) { + memUnion.elements[i] = gtole(memUnion.elements[i]); + } + } + ''' + + # Offload everything into registers + regSetCode = '' + for reg in range(regs): + mask = '' + if reg == regs - 1: + mask = ' & mask(%d)' % (32 - 8 * (regs * 4 - size)) + regSetCode += ''' + FpDestP%(reg)d.uw = gtoh(memUnion.floatRegBits[%(reg)d])%(mask)s; + ''' % { "reg" : reg, "mask" : mask } + + # Pull everything in from registers + regGetCode = '' + for reg in range(regs): + regGetCode += ''' + memUnion.floatRegBits[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + + loadMemAccCode = convCode + regSetCode + storeMemAccCode = regGetCode + convCode + + loadIop = InstObjParams('ldrneon%(size)d_uop' % subst, + 'MicroLdrNeon%(size)dUop' % subst, + 'MicroNeonMemOp', + { 'mem_decl' : memDecl, + 'size' : size, + 'memacc_code' : loadMemAccCode, + 'ea_code' : eaCode, + 'predicate_test' : predicateTest }, + [ 'IsMicroop', 'IsMemRef', 'IsLoad' ]) + storeIop = InstObjParams('strneon%(size)d_uop' % subst, + 'MicroStrNeon%(size)dUop' % subst, + 'MicroNeonMemOp', + { 'mem_decl' : memDecl, + 'size' : size, + 'memacc_code' : storeMemAccCode, + 'ea_code' : eaCode, + 'predicate_test' : predicateTest }, + [ 'IsMicroop', 'IsMemRef', 'IsStore' ]) + + exec_output += NeonLoadExecute.subst(loadIop) + \ + NeonLoadInitiateAcc.subst(loadIop) + \ + NeonLoadCompleteAcc.subst(loadIop) + \ + NeonStoreExecute.subst(storeIop) + \ + NeonStoreInitiateAcc.subst(storeIop) + \ + NeonStoreCompleteAcc.subst(storeIop) + header_output += MicroNeonMemDeclare.subst(loadIop) + \ + MicroNeonMemDeclare.subst(storeIop) +}}; + +let {{ + exec_output = '' + for eSize, type in (1, 'uint8_t'), \ + (2, 'uint16_t'), \ + (4, 'uint32_t'), \ + (8, 'uint64_t'): + size = eSize + # An instruction handles no more than 16 bytes and no more than + # 4 elements, or the number of elements needed to fill 8 or 16 bytes. 
+ sizes = set((16, 8)) + for count in 1, 2, 3, 4: + size = count * eSize + if size <= 16: + sizes.add(size) + for size in sizes: + substDict = { + "class_name" : "MicroLdrNeon%dUop" % size, + "targs" : type + } + exec_output += MicroNeonMemExecDeclare.subst(substDict) + substDict["class_name"] = "MicroStrNeon%dUop" % size + exec_output += MicroNeonMemExecDeclare.subst(substDict) + size += eSize +}}; + +//////////////////////////////////////////////////////////////////// +// +// Neon (de)interlacing microops +// + +let {{ + header_output = exec_output = '' + for dRegs in (2, 3, 4): + loadConv = '' + unloadConv = '' + for dReg in range(dRegs): + loadConv += ''' + conv1.cRegs[%(sReg0)d] = htog(FpOp1P%(sReg0)d.uw); + conv1.cRegs[%(sReg1)d] = htog(FpOp1P%(sReg1)d.uw); + ''' % { "sReg0" : (dReg * 2), "sReg1" : (dReg * 2 + 1) } + unloadConv += ''' + FpDestS%(dReg)dP0.uw = gtoh(conv2.cRegs[2 * %(dReg)d + 0]); + FpDestS%(dReg)dP1.uw = gtoh(conv2.cRegs[2 * %(dReg)d + 1]); + ''' % { "dReg" : dReg } + microDeintNeonCode = ''' + const unsigned dRegs = %(dRegs)d; + const unsigned regs = 2 * dRegs; + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + union convStruct { + FloatRegBits cRegs[regs]; + Element elements[dRegs * perDReg]; + } conv1, conv2; + + %(loadConv)s + + unsigned srcElem = 0; + for (unsigned destOffset = 0; + destOffset < perDReg; destOffset++) { + for (unsigned dReg = 0; dReg < dRegs; dReg++) { + conv2.elements[dReg * perDReg + destOffset] = + conv1.elements[srcElem++]; + } + } + + %(unloadConv)s + ''' % { "dRegs" : dRegs, + "loadConv" : loadConv, + "unloadConv" : unloadConv } + microDeintNeonIop = \ + InstObjParams('deintneon%duop' % (dRegs * 2), + 'MicroDeintNeon%dUop' % (dRegs * 2), + 'MicroNeonMixOp', + { 'predicate_test': predicateTest, + 'code' : microDeintNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixDeclare.subst(microDeintNeonIop) + exec_output += MicroNeonMixExecute.subst(microDeintNeonIop) + + loadConv = '' + unloadConv = '' + for dReg in range(dRegs): + loadConv += ''' + conv1.cRegs[2 * %(dReg)d + 0] = htog(FpOp1S%(dReg)dP0.uw); + conv1.cRegs[2 * %(dReg)d + 1] = htog(FpOp1S%(dReg)dP1.uw); + ''' % { "dReg" : dReg } + unloadConv += ''' + FpDestP%(sReg0)d.uw = gtoh(conv2.cRegs[%(sReg0)d]); + FpDestP%(sReg1)d.uw = gtoh(conv2.cRegs[%(sReg1)d]); + ''' % { "sReg0" : (dReg * 2), "sReg1" : (dReg * 2 + 1) } + microInterNeonCode = ''' + const unsigned dRegs = %(dRegs)d; + const unsigned regs = 2 * dRegs; + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + union convStruct { + FloatRegBits cRegs[regs]; + Element elements[dRegs * perDReg]; + } conv1, conv2; + + %(loadConv)s + + unsigned destElem = 0; + for (unsigned srcOffset = 0; + srcOffset < perDReg; srcOffset++) { + for (unsigned dReg = 0; dReg < dRegs; dReg++) { + conv2.elements[destElem++] = + conv1.elements[dReg * perDReg + srcOffset]; + } + } + + %(unloadConv)s + ''' % { "dRegs" : dRegs, + "loadConv" : loadConv, + "unloadConv" : unloadConv } + microInterNeonIop = \ + InstObjParams('interneon%duop' % (dRegs * 2), + 'MicroInterNeon%dUop' % (dRegs * 2), + 'MicroNeonMixOp', + { 'predicate_test': predicateTest, + 'code' : microInterNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixDeclare.subst(microInterNeonIop) + exec_output += MicroNeonMixExecute.subst(microInterNeonIop) +}}; + +let {{ + exec_output = '' + for type in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'): + for dRegs in (2, 3, 4): + Name = "MicroDeintNeon%dUop" % (dRegs * 2) + substDict = { 
"class_name" : Name, "targs" : type } + exec_output += MicroNeonExecDeclare.subst(substDict) + Name = "MicroInterNeon%dUop" % (dRegs * 2) + substDict = { "class_name" : Name, "targs" : type } + exec_output += MicroNeonExecDeclare.subst(substDict) +}}; + +//////////////////////////////////////////////////////////////////// +// +// Neon microops to pack/unpack a single lane +// + +let {{ + header_output = exec_output = '' + for sRegs in 1, 2: + baseLoadRegs = '' + for reg in range(sRegs): + baseLoadRegs += ''' + sourceRegs.fRegs[%(reg0)d] = htog(FpOp1P%(reg0)d.uw); + sourceRegs.fRegs[%(reg1)d] = htog(FpOp1P%(reg1)d.uw); + ''' % { "reg0" : (2 * reg + 0), + "reg1" : (2 * reg + 1) } + for dRegs in range(sRegs, 5): + unloadRegs = '' + loadRegs = baseLoadRegs + for reg in range(dRegs): + loadRegs += ''' + destRegs[%(reg)d].fRegs[0] = htog(FpDestS%(reg)dP0.uw); + destRegs[%(reg)d].fRegs[1] = htog(FpDestS%(reg)dP1.uw); + ''' % { "reg" : reg } + unloadRegs += ''' + FpDestS%(reg)dP0.uw = gtoh(destRegs[%(reg)d].fRegs[0]); + FpDestS%(reg)dP1.uw = gtoh(destRegs[%(reg)d].fRegs[1]); + ''' % { "reg" : reg } + microUnpackNeonCode = ''' + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + + union SourceRegs { + FloatRegBits fRegs[2 * %(sRegs)d]; + Element elements[%(sRegs)d * perDReg]; + } sourceRegs; + + union DestReg { + FloatRegBits fRegs[2]; + Element elements[perDReg]; + } destRegs[%(dRegs)d]; + + %(loadRegs)s + + for (unsigned i = 0; i < %(dRegs)d; i++) { + destRegs[i].elements[lane] = sourceRegs.elements[i]; + } + + %(unloadRegs)s + ''' % { "sRegs" : sRegs, "dRegs" : dRegs, + "loadRegs" : loadRegs, "unloadRegs" : unloadRegs } + + microUnpackNeonIop = \ + InstObjParams('unpackneon%dto%duop' % (sRegs * 2, dRegs * 2), + 'MicroUnpackNeon%dto%dUop' % + (sRegs * 2, dRegs * 2), + 'MicroNeonMixLaneOp', + { 'predicate_test': predicateTest, + 'code' : microUnpackNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixLaneDeclare.subst(microUnpackNeonIop) + exec_output += MicroNeonMixExecute.subst(microUnpackNeonIop) + + for sRegs in 1, 2: + loadRegs = '' + for reg in range(sRegs): + loadRegs += ''' + sourceRegs.fRegs[%(reg0)d] = htog(FpOp1P%(reg0)d.uw); + sourceRegs.fRegs[%(reg1)d] = htog(FpOp1P%(reg1)d.uw); + ''' % { "reg0" : (2 * reg + 0), + "reg1" : (2 * reg + 1) } + for dRegs in range(sRegs, 5): + unloadRegs = '' + for reg in range(dRegs): + unloadRegs += ''' + FpDestS%(reg)dP0.uw = gtoh(destRegs[%(reg)d].fRegs[0]); + FpDestS%(reg)dP1.uw = gtoh(destRegs[%(reg)d].fRegs[1]); + ''' % { "reg" : reg } + microUnpackAllNeonCode = ''' + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + + union SourceRegs { + FloatRegBits fRegs[2 * %(sRegs)d]; + Element elements[%(sRegs)d * perDReg]; + } sourceRegs; + + union DestReg { + FloatRegBits fRegs[2]; + Element elements[perDReg]; + } destRegs[%(dRegs)d]; + + %(loadRegs)s + + for (unsigned i = 0; i < %(dRegs)d; i++) { + for (unsigned j = 0; j < perDReg; j++) + destRegs[i].elements[j] = sourceRegs.elements[i]; + } + + %(unloadRegs)s + ''' % { "sRegs" : sRegs, "dRegs" : dRegs, + "loadRegs" : loadRegs, "unloadRegs" : unloadRegs } + + microUnpackAllNeonIop = \ + InstObjParams('unpackallneon%dto%duop' % (sRegs * 2, dRegs * 2), + 'MicroUnpackAllNeon%dto%dUop' % + (sRegs * 2, dRegs * 2), + 'MicroNeonMixOp', + { 'predicate_test': predicateTest, + 'code' : microUnpackAllNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixDeclare.subst(microUnpackAllNeonIop) + exec_output += MicroNeonMixExecute.subst(microUnpackAllNeonIop) + 
+ for dRegs in 1, 2: + unloadRegs = '' + for reg in range(dRegs): + unloadRegs += ''' + FpDestP%(reg0)d.uw = gtoh(destRegs.fRegs[%(reg0)d]); + FpDestP%(reg1)d.uw = gtoh(destRegs.fRegs[%(reg1)d]); + ''' % { "reg0" : (2 * reg + 0), + "reg1" : (2 * reg + 1) } + for sRegs in range(dRegs, 5): + loadRegs = '' + for reg in range(sRegs): + loadRegs += ''' + sourceRegs[%(reg)d].fRegs[0] = htog(FpOp1S%(reg)dP0.uw); + sourceRegs[%(reg)d].fRegs[1] = htog(FpOp1S%(reg)dP1.uw); + ''' % { "reg" : reg } + microPackNeonCode = ''' + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + + union SourceReg { + FloatRegBits fRegs[2]; + Element elements[perDReg]; + } sourceRegs[%(sRegs)d]; + + union DestRegs { + FloatRegBits fRegs[2 * %(dRegs)d]; + Element elements[%(dRegs)d * perDReg]; + } destRegs; + + %(loadRegs)s + + for (unsigned i = 0; i < %(sRegs)d; i++) { + destRegs.elements[i] = sourceRegs[i].elements[lane]; + } + + %(unloadRegs)s + ''' % { "sRegs" : sRegs, "dRegs" : dRegs, + "loadRegs" : loadRegs, "unloadRegs" : unloadRegs } + + microPackNeonIop = \ + InstObjParams('packneon%dto%duop' % (sRegs * 2, dRegs * 2), + 'MicroPackNeon%dto%dUop' % + (sRegs * 2, dRegs * 2), + 'MicroNeonMixLaneOp', + { 'predicate_test': predicateTest, + 'code' : microPackNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixLaneDeclare.subst(microPackNeonIop) + exec_output += MicroNeonMixExecute.subst(microPackNeonIop) +}}; + +let {{ + exec_output = '' + for type in ('uint8_t', 'uint16_t', 'uint32_t'): + for sRegs in 1, 2: + for dRegs in range(sRegs, 5): + for format in ("MicroUnpackNeon%(sRegs)dto%(dRegs)dUop", + "MicroUnpackAllNeon%(sRegs)dto%(dRegs)dUop", + "MicroPackNeon%(dRegs)dto%(sRegs)dUop"): + Name = format % { "sRegs" : sRegs * 2, + "dRegs" : dRegs * 2 } + substDict = { "class_name" : Name, "targs" : type } + exec_output += MicroNeonExecDeclare.subst(substDict) +}}; + //////////////////////////////////////////////////////////////////// // // Integer = Integer op Immediate microops @@ -122,23 +568,32 @@ let {{ let {{ microAddiUopIop = InstObjParams('addi_uop', 'MicroAddiUop', - 'MicroIntOp', + 'MicroIntImmOp', {'code': 'Ra = Rb + imm;', 'predicate_test': predicateTest}, ['IsMicroop']) + microAddUopIop = InstObjParams('add_uop', 'MicroAddUop', + 'MicroIntOp', + {'code': 'Ra = Rb + Rc;', + 'predicate_test': predicateTest}, + ['IsMicroop']) + microSubiUopIop = InstObjParams('subi_uop', 'MicroSubiUop', - 'MicroIntOp', + 'MicroIntImmOp', {'code': 'Ra = Rb - imm;', 'predicate_test': predicateTest}, ['IsMicroop']) - header_output = MicroIntDeclare.subst(microAddiUopIop) + \ - MicroIntDeclare.subst(microSubiUopIop) - decoder_output = MicroIntConstructor.subst(microAddiUopIop) + \ - MicroIntConstructor.subst(microSubiUopIop) + header_output = MicroIntImmDeclare.subst(microAddiUopIop) + \ + MicroIntImmDeclare.subst(microSubiUopIop) + \ + MicroIntDeclare.subst(microAddUopIop) + decoder_output = MicroIntImmConstructor.subst(microAddiUopIop) + \ + MicroIntImmConstructor.subst(microSubiUopIop) + \ + MicroIntConstructor.subst(microAddUopIop) exec_output = PredOpExecute.subst(microAddiUopIop) + \ - PredOpExecute.subst(microSubiUopIop) + PredOpExecute.subst(microSubiUopIop) + \ + PredOpExecute.subst(microAddUopIop) }}; let {{ @@ -146,6 +601,22 @@ let {{ header_output = MacroMemDeclare.subst(iop) decoder_output = MacroMemConstructor.subst(iop) + iop = InstObjParams("vldmult", "VldMult", 'VldMultOp', "", []) + header_output += VMemMultDeclare.subst(iop) + decoder_output += VMemMultConstructor.subst(iop) + + iop = 
InstObjParams("vldsingle", "VldSingle", 'VldSingleOp', "", []) + header_output += VMemSingleDeclare.subst(iop) + decoder_output += VMemSingleConstructor.subst(iop) + + iop = InstObjParams("vstmult", "VstMult", 'VstMultOp', "", []) + header_output += VMemMultDeclare.subst(iop) + decoder_output += VMemMultConstructor.subst(iop) + + iop = InstObjParams("vstsingle", "VstSingle", 'VstSingleOp', "", []) + header_output += VMemSingleDeclare.subst(iop) + decoder_output += VMemSingleConstructor.subst(iop) + vfpIop = InstObjParams("vldmstm", "VLdmStm", 'MacroVFPMemOp', "", []) header_output += MacroVFPMemDeclare.subst(vfpIop) decoder_output += MacroVFPMemConstructor.subst(vfpIop) diff --git a/src/arch/arm/isa/insts/neon.isa b/src/arch/arm/isa/insts/neon.isa new file mode 100644 index 000000000..b629c6fe8 --- /dev/null +++ b/src/arch/arm/isa/insts/neon.isa @@ -0,0 +1,3343 @@ +// -*- mode:c++ -*- + +// Copyright (c) 2010 ARM Limited +// All rights reserved +// +// The license below extends only to copyright in the software and shall +// not be construed as granting a license to any other intellectual +// property including but not limited to intellectual property relating +// to a hardware implementation of the functionality of the software +// licensed hereunder. You may use the software subject to the license +// terms below provided that you ensure that this notice is replicated +// unmodified and in its entirety in all distributions of the software, +// modified or unmodified, in source code or in binary form. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer; +// redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution; +// neither the name of the copyright holders nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Authors: Gabe Black + +output header {{ + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUThreeUReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1, op2); + case 1: + return new Base<uint16_t>(machInst, dest, op1, op2); + case 2: + return new Base<uint32_t>(machInst, dest, op1, op2); + case 3: + return new Base<uint64_t>(machInst, dest, op1, op2); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonSThreeUReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + switch (size) { + case 0: + return new Base<int8_t>(machInst, dest, op1, op2); + case 1: + return new Base<int16_t>(machInst, dest, op1, op2); + case 2: + return new Base<int32_t>(machInst, dest, op1, op2); + case 3: + return new Base<int64_t>(machInst, dest, op1, op2); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUSThreeUReg(bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (notSigned) { + return decodeNeonUThreeUReg<Base>(size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeUReg<Base>(size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUThreeUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1, op2); + case 1: + return new Base<uint16_t>(machInst, dest, op1, op2); + case 2: + return new Base<uint32_t>(machInst, dest, op1, op2); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonSThreeUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + switch (size) { + case 0: + return new Base<int8_t>(machInst, dest, op1, op2); + case 1: + return new Base<int16_t>(machInst, dest, op1, op2); + case 2: + return new Base<int32_t>(machInst, dest, op1, op2); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUSThreeUSReg(bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (notSigned) { + return decodeNeonUThreeUSReg<Base>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeUSReg<Base>( + size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUThreeSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (q) { + return decodeNeonUThreeUSReg<BaseQ>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonUThreeUSReg<BaseD>( + size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSThreeSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (q) { + return decodeNeonSThreeUSReg<BaseQ>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeUSReg<BaseD>( + size, machInst, dest, op1, op2); + } + } + + template 
<template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUSThreeSReg(bool q, bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (notSigned) { + return decodeNeonUThreeSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUThreeReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (q) { + return decodeNeonUThreeUReg<BaseQ>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonUThreeUReg<BaseD>( + size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSThreeReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (q) { + return decodeNeonSThreeUReg<BaseQ>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeUReg<BaseD>( + size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUSThreeReg(bool q, bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (notSigned) { + return decodeNeonUThreeReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUTwoShiftReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (q) { + switch (size) { + case 0: + return new BaseQ<uint8_t>(machInst, dest, op1, imm); + case 1: + return new BaseQ<uint16_t>(machInst, dest, op1, imm); + case 2: + return new BaseQ<uint32_t>(machInst, dest, op1, imm); + case 3: + return new BaseQ<uint64_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 0: + return new BaseD<uint8_t>(machInst, dest, op1, imm); + case 1: + return new BaseD<uint16_t>(machInst, dest, op1, imm); + case 2: + return new BaseD<uint32_t>(machInst, dest, op1, imm); + case 3: + return new BaseD<uint64_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSTwoShiftReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (q) { + switch (size) { + case 0: + return new BaseQ<int8_t>(machInst, dest, op1, imm); + case 1: + return new BaseQ<int16_t>(machInst, dest, op1, imm); + case 2: + return new BaseQ<int32_t>(machInst, dest, op1, imm); + case 3: + return new BaseQ<int64_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 0: + return new BaseD<int8_t>(machInst, dest, op1, imm); + case 1: + return new BaseD<int16_t>(machInst, dest, op1, imm); + case 2: + return new BaseD<int32_t>(machInst, dest, op1, imm); + case 3: + return new BaseD<int64_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } + } + + + template <template <typename 
T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUSTwoShiftReg(bool q, bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (notSigned) { + return decodeNeonUTwoShiftReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, imm); + } else { + return decodeNeonSTwoShiftReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, imm); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUTwoShiftUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1, imm); + case 1: + return new Base<uint16_t>(machInst, dest, op1, imm); + case 2: + return new Base<uint32_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUTwoShiftSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (q) { + return decodeNeonUTwoShiftUSReg<BaseQ>( + size, machInst, dest, op1, imm); + } else { + return decodeNeonUTwoShiftUSReg<BaseD>( + size, machInst, dest, op1, imm); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonSTwoShiftUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + switch (size) { + case 0: + return new Base<int8_t>(machInst, dest, op1, imm); + case 1: + return new Base<int16_t>(machInst, dest, op1, imm); + case 2: + return new Base<int32_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSTwoShiftSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (q) { + return decodeNeonSTwoShiftUSReg<BaseQ>( + size, machInst, dest, op1, imm); + } else { + return decodeNeonSTwoShiftUSReg<BaseD>( + size, machInst, dest, op1, imm); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUSTwoShiftSReg(bool q, bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (notSigned) { + return decodeNeonUTwoShiftSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, imm); + } else { + return decodeNeonSTwoShiftSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, imm); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUTwoMiscUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1); + case 1: + return new Base<uint16_t>(machInst, dest, op1); + case 2: + return new Base<uint32_t>(machInst, dest, op1); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonSTwoMiscUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + switch (size) { + case 0: + return new Base<int8_t>(machInst, dest, op1); + case 1: + return new Base<int16_t>(machInst, dest, op1); + case 2: + return new Base<int32_t>(machInst, dest, op1); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + 
decodeNeonUTwoMiscSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + if (q) { + return decodeNeonUTwoMiscUSReg<BaseQ>(size, machInst, dest, op1); + } else { + return decodeNeonUTwoMiscUSReg<BaseD>(size, machInst, dest, op1); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSTwoMiscSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + if (q) { + return decodeNeonSTwoMiscUSReg<BaseQ>(size, machInst, dest, op1); + } else { + return decodeNeonSTwoMiscUSReg<BaseD>(size, machInst, dest, op1); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUTwoMiscUReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1); + case 1: + return new Base<uint16_t>(machInst, dest, op1); + case 2: + return new Base<uint32_t>(machInst, dest, op1); + case 3: + return new Base<uint64_t>(machInst, dest, op1); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonSTwoMiscUReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + switch (size) { + case 0: + return new Base<int8_t>(machInst, dest, op1); + case 1: + return new Base<int16_t>(machInst, dest, op1); + case 2: + return new Base<int32_t>(machInst, dest, op1); + case 3: + return new Base<int64_t>(machInst, dest, op1); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSTwoMiscReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + if (q) { + return decodeNeonSTwoMiscUReg<BaseQ>(size, machInst, dest, op1); + } else { + return decodeNeonSTwoMiscUReg<BaseD>(size, machInst, dest, op1); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUTwoMiscReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + if (q) { + return decodeNeonUTwoMiscUReg<BaseQ>(size, machInst, dest, op1); + } else { + return decodeNeonUTwoMiscUReg<BaseD>(size, machInst, dest, op1); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUSTwoMiscSReg(bool q, bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + if (notSigned) { + return decodeNeonUTwoMiscSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1); + } else { + return decodeNeonSTwoMiscSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1); + } + } + +}}; + +output exec {{ + static float + vcgtFunc(float op1, float op2) + { + if (isSnan(op1) || isSnan(op2)) + return 2.0; + return (op1 > op2) ? 0.0 : 1.0; + } + + static float + vcgeFunc(float op1, float op2) + { + if (isSnan(op1) || isSnan(op2)) + return 2.0; + return (op1 >= op2) ? 0.0 : 1.0; + } + + static float + vceqFunc(float op1, float op2) + { + if (isSnan(op1) || isSnan(op2)) + return 2.0; + return (op1 == op2) ? 0.0 : 1.0; + } + + static float + vcleFunc(float op1, float op2) + { + if (isSnan(op1) || isSnan(op2)) + return 2.0; + return (op1 <= op2) ? 0.0 : 1.0; + } + + static float + vcltFunc(float op1, float op2) + { + if (isSnan(op1) || isSnan(op2)) + return 2.0; + return (op1 < op2) ? 
0.0 : 1.0; + } + + static float + vacgtFunc(float op1, float op2) + { + if (isSnan(op1) || isSnan(op2)) + return 2.0; + return (fabsf(op1) > fabsf(op2)) ? 0.0 : 1.0; + } + + static float + vacgeFunc(float op1, float op2) + { + if (isSnan(op1) || isSnan(op2)) + return 2.0; + return (fabsf(op1) >= fabsf(op2)) ? 0.0 : 1.0; + } +}}; + +let {{ + + header_output = "" + exec_output = "" + + smallUnsignedTypes = ("uint8_t", "uint16_t", "uint32_t") + unsignedTypes = smallUnsignedTypes + ("uint64_t",) + smallSignedTypes = ("int8_t", "int16_t", "int32_t") + signedTypes = smallSignedTypes + ("int64_t",) + smallTypes = smallUnsignedTypes + smallSignedTypes + allTypes = unsignedTypes + signedTypes + + def threeEqualRegInst(name, Name, types, rCount, op, + readDest=False, pairwise=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1, srcReg2, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + if pairwise: + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(2 * i < eCount ? + srcReg1.elements[2 * i] : + srcReg2.elements[2 * i - eCount]); + Element srcElem2 = gtoh(2 * i < eCount ? + srcReg1.elements[2 * i + 1] : + srcReg2.elements[2 * i + 1 - eCount]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + else: + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[i]); + Element srcElem2 = gtoh(srcReg2.elements[i]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def threeEqualRegInstFp(name, Name, types, rCount, op, + readDest=False, pairwise=False, toInt=False): + global header_output, exec_output + eWalkCode = ''' + typedef FloatReg FloatVect[rCount]; + FloatVect srcRegs1, srcRegs2; + ''' + if toInt: + eWalkCode += 'RegVect destRegs;\n' + else: + eWalkCode += 'FloatVect destRegs;\n' + for reg in range(rCount): + eWalkCode += ''' + srcRegs1[%(reg)d] = FpOp1P%(reg)d; + srcRegs2[%(reg)d] = FpOp2P%(reg)d; + ''' % { "reg" : reg } + if readDest: + if toInt: + eWalkCode += ''' + destRegs.regs[%(reg)d] = FpDestP%(reg)d.bits; + ''' % { "reg" : reg } + else: + eWalkCode += ''' + destRegs[%(reg)d] = FpDestP%(reg)d; + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destReg = destRegs[r];' + destType = 'FloatReg' + writeDest = 'destRegs[r] = destReg;' + if toInt: + destType = 'FloatRegBits' + writeDest = 'destRegs.regs[r] = destReg;' + if pairwise: + eWalkCode += ''' + for (unsigned r = 0; r < rCount; r++) { + FloatReg srcReg1 = (2 * r < rCount) ? 
+ srcRegs1[2 * r] : srcRegs2[2 * r - rCount]; + FloatReg srcReg2 = (2 * r < rCount) ? + srcRegs1[2 * r + 1] : srcRegs2[2 * r + 1 - rCount]; + %(destType)s destReg; + %(readDest)s + %(op)s + %(writeDest)s + } + ''' % { "op" : op, + "readDest" : readDestCode, + "destType" : destType, + "writeDest" : writeDest } + else: + eWalkCode += ''' + for (unsigned r = 0; r < rCount; r++) { + FloatReg srcReg1 = srcRegs1[r]; + FloatReg srcReg2 = srcRegs2[r]; + %(destType)s destReg; + %(readDest)s + %(op)s + %(writeDest)s + } + ''' % { "op" : op, + "readDest" : readDestCode, + "destType" : destType, + "writeDest" : writeDest } + for reg in range(rCount): + if toInt: + eWalkCode += ''' + FpDestP%(reg)d.uw = destRegs.regs[%(reg)d]; + ''' % { "reg" : reg } + else: + eWalkCode += ''' + FpDestP%(reg)d = destRegs[%(reg)d]; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "FpRegRegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def threeUnequalRegInst(name, Name, types, op, + bigSrc1, bigSrc2, bigDest, readDest): + global header_output, exec_output + src1Cnt = src2Cnt = destCnt = 2 + src1Prefix = src2Prefix = destPrefix = '' + if bigSrc1: + src1Cnt = 4 + src1Prefix = 'Big' + if bigSrc2: + src2Cnt = 4 + src2Prefix = 'Big' + if bigDest: + destCnt = 4 + destPrefix = 'Big' + eWalkCode = ''' + %sRegVect srcReg1; + %sRegVect srcReg2; + %sRegVect destReg; + ''' % (src1Prefix, src2Prefix, destPrefix) + for reg in range(src1Cnt): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + for reg in range(src2Cnt): + eWalkCode += ''' + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(destCnt): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]); + %(src2Prefix)sElement srcElem2 = gtoh(srcReg2.elements[i]); + %(destPrefix)sElement destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode, + "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix, + "destPrefix" : destPrefix } + for reg in range(destCnt): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegOp", + { "code": eWalkCode, + "r_count": 2, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def threeRegNarrowInst(name, Name, types, op, readDest=False): + threeUnequalRegInst(name, Name, types, op, + True, True, False, readDest) + + def threeRegLongInst(name, Name, types, op, readDest=False): + threeUnequalRegInst(name, Name, types, op, + False, False, True, readDest) + + def threeRegWideInst(name, Name, types, op, readDest=False): + threeUnequalRegInst(name, Name, types, op, + True, False, True, readDest) + + def twoEqualRegInst(name, Name, types, rCount, op, 
readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1, srcReg2, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + assert(imm >= 0 && imm < eCount); + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[i]); + Element srcElem2 = gtoh(srcReg2.elements[imm]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegLongInst(name, Name, types, op, readDest=False): + global header_output, exec_output + rCount = 2 + eWalkCode = ''' + RegVect srcReg1, srcReg2; + BigRegVect destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(2 * rCount): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + assert(imm >= 0 && imm < eCount); + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[i]); + Element srcElem2 = gtoh(srcReg2.elements[imm]); + BigElement destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(2 * rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegImmOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoEqualRegInstFp(name, Name, types, rCount, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + typedef FloatReg FloatVect[rCount]; + FloatVect srcRegs1, srcRegs2, destRegs; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcRegs1[%(reg)d] = FpOp1P%(reg)d; + srcRegs2[%(reg)d] = FpOp2P%(reg)d; + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destRegs[%(reg)d] = FpDestP%(reg)d; + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destReg = destRegs[i];' + eWalkCode += ''' + assert(imm >= 0 && imm < rCount); + for (unsigned i = 0; i < rCount; i++) { + FloatReg srcReg1 = srcRegs1[i]; + FloatReg srcReg2 = srcRegs2[imm]; + FloatReg destReg; + %(readDest)s + %(op)s + destRegs[i] = destReg; + } + ''' % { "op" : op, "readDest" : readDestCode } 
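+ # Write the per-lane results back out to the FP destination registers.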
+ for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d = destRegs[%(reg)d]; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "FpRegRegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegShiftInst(name, Name, types, rCount, op, + readDest=False, toInt=False, fromInt=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcRegs1, destRegs; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcRegs1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destRegs.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destRegs.elements[i]);' + if toInt: + readDestCode = 'destReg = gtoh(destRegs.regs[i]);' + readOpCode = 'Element srcElem1 = gtoh(srcRegs1.elements[i]);' + if fromInt: + readOpCode = 'FloatRegBits srcReg1 = gtoh(srcRegs1.regs[i]);' + declDest = 'Element destElem;' + writeDestCode = 'destRegs.elements[i] = htog(destElem);' + if toInt: + declDest = 'FloatRegBits destReg;' + writeDestCode = 'destRegs.regs[i] = htog(destReg);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + %(readOp)s + %(declDest)s + %(readDest)s + %(op)s + %(writeDest)s + } + ''' % { "readOp" : readOpCode, + "declDest" : declDest, + "readDest" : readDestCode, + "op" : op, + "writeDest" : writeDestCode } + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destRegs.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegNarrowShiftInst(name, Name, types, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + BigRegVect srcReg1; + RegVect destReg; + ''' + for reg in range(4): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(2): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + BigElement srcElem1 = gtoh(srcReg1.elements[i]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(2): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegImmOp", + { "code": eWalkCode, + "r_count": 2, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegImmOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegLongShiftInst(name, Name, types, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1; + BigRegVect destReg; + 
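+ // Long form: srcReg1 holds Element-sized lanes, while destReg holds + // BigElement (double-width) lanes and so spans four registers.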
''' + for reg in range(2): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(4): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[i]); + BigElement destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(4): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegImmOp", + { "code": eWalkCode, + "r_count": 2, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegImmOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegMiscInst(name, Name, types, rCount, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + unsigned j = i; + Element srcElem1 = gtoh(srcReg1.elements[i]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[j] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegMiscScInst(name, Name, types, rCount, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[imm]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegMiscScramble(name, Name, 
types, rCount, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += op + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + FpOp1P%(reg)d.uw = gtoh(srcReg1.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegMiscInstFp(name, Name, types, rCount, op, + readDest=False, toInt=False): + global header_output, exec_output + eWalkCode = ''' + typedef FloatReg FloatVect[rCount]; + FloatVect srcRegs1; + ''' + if toInt: + eWalkCode += 'RegVect destRegs;\n' + else: + eWalkCode += 'FloatVect destRegs;\n' + for reg in range(rCount): + eWalkCode += ''' + srcRegs1[%(reg)d] = FpOp1P%(reg)d; + ''' % { "reg" : reg } + if readDest: + if toInt: + eWalkCode += ''' + destRegs.regs[%(reg)d] = FpDestP%(reg)d.bits; + ''' % { "reg" : reg } + else: + eWalkCode += ''' + destRegs[%(reg)d] = FpDestP%(reg)d; + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destReg = destRegs[r];' + destType = 'FloatReg' + writeDest = 'destRegs[r] = destReg;' + if toInt: + destType = 'FloatRegBits' + writeDest = 'destRegs.regs[r] = destReg;' + eWalkCode += ''' + for (unsigned r = 0; r < rCount; r++) { + FloatReg srcReg1 = srcRegs1[r]; + %(destType)s destReg; + %(readDest)s + %(op)s + %(writeDest)s + } + ''' % { "op" : op, + "readDest" : readDestCode, + "destType" : destType, + "writeDest" : writeDest } + for reg in range(rCount): + if toInt: + eWalkCode += ''' + FpDestP%(reg)d.uw = destRegs.regs[%(reg)d]; + ''' % { "reg" : reg } + else: + eWalkCode += ''' + FpDestP%(reg)d = destRegs[%(reg)d]; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "FpRegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegCondenseInst(name, Name, types, rCount, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcRegs; + BigRegVect destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcRegs.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount / 2; i++) { + Element srcElem1 = gtoh(srcRegs.elements[2 * i]); + Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]); + BigElement destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + 
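+ // Write back one register of pairwise, double-width results.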
FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegNarrowMiscInst(name, Name, types, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + BigRegVect srcReg1; + RegVect destReg; + ''' + for reg in range(4): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(2): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + BigElement srcElem1 = gtoh(srcReg1.elements[i]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(2): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegOp", + { "code": eWalkCode, + "r_count": 2, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def oneRegImmInst(name, Name, types, rCount, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect destReg; + ''' + if readDest: + for reg in range(rCount): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegLongMiscInst(name, Name, types, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1; + BigRegVect destReg; + ''' + for reg in range(2): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(4): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[i]); + BigElement destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(4): + eWalkCode += ''' + FpDestP%(reg)d.uw = 
gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegOp", + { "code": eWalkCode, + "r_count": 2, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + vhaddCode = ''' + Element carryBit = + (((unsigned)srcElem1 & 0x1) + + ((unsigned)srcElem2 & 0x1)) >> 1; + // Use division instead of a shift to ensure the sign extension works + // right. The compiler will figure out if it can be a shift. Mask the + // inputs so they get truncated correctly. + destElem = (((srcElem1 & ~(Element)1) / 2) + + ((srcElem2 & ~(Element)1) / 2)) + carryBit; + ''' + threeEqualRegInst("vhadd", "VhaddD", allTypes, 2, vhaddCode) + threeEqualRegInst("vhadd", "VhaddQ", allTypes, 4, vhaddCode) + + vrhaddCode = ''' + Element carryBit = + (((unsigned)srcElem1 & 0x1) + + ((unsigned)srcElem2 & 0x1) + 1) >> 1; + // Use division instead of a shift to ensure the sign extension works + // right. The compiler will figure out if it can be a shift. Mask the + // inputs so they get truncated correctly. + destElem = (((srcElem1 & ~(Element)1) / 2) + + ((srcElem2 & ~(Element)1) / 2)) + carryBit; + ''' + threeEqualRegInst("vrhadd", "VrhaddD", allTypes, 2, vrhaddCode) + threeEqualRegInst("vrhadd", "VrhaddQ", allTypes, 4, vrhaddCode) + + vhsubCode = ''' + Element borrowBit = + (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1; + // Use division instead of a shift to ensure the sign extension works + // right. The compiler will figure out if it can be a shift. Mask the + // inputs so they get truncated correctly. + destElem = (((srcElem1 & ~(Element)1) / 2) - + ((srcElem2 & ~(Element)1) / 2)) - borrowBit; + ''' + threeEqualRegInst("vhsub", "VhsubD", allTypes, 2, vhsubCode) + threeEqualRegInst("vhsub", "VhsubQ", allTypes, 4, vhsubCode) + + vandCode = ''' + destElem = srcElem1 & srcElem2; + ''' + threeEqualRegInst("vand", "VandD", unsignedTypes, 2, vandCode) + threeEqualRegInst("vand", "VandQ", unsignedTypes, 4, vandCode) + + vbicCode = ''' + destElem = srcElem1 & ~srcElem2; + ''' + threeEqualRegInst("vbic", "VbicD", unsignedTypes, 2, vbicCode) + threeEqualRegInst("vbic", "VbicQ", unsignedTypes, 4, vbicCode) + + vorrCode = ''' + destElem = srcElem1 | srcElem2; + ''' + threeEqualRegInst("vorr", "VorrD", unsignedTypes, 2, vorrCode) + threeEqualRegInst("vorr", "VorrQ", unsignedTypes, 4, vorrCode) + + threeEqualRegInst("vmov", "VmovD", unsignedTypes, 2, vorrCode) + threeEqualRegInst("vmov", "VmovQ", unsignedTypes, 4, vorrCode) + + vornCode = ''' + destElem = srcElem1 | ~srcElem2; + ''' + threeEqualRegInst("vorn", "VornD", unsignedTypes, 2, vornCode) + threeEqualRegInst("vorn", "VornQ", unsignedTypes, 4, vornCode) + + veorCode = ''' + destElem = srcElem1 ^ srcElem2; + ''' + threeEqualRegInst("veor", "VeorD", unsignedTypes, 2, veorCode) + threeEqualRegInst("veor", "VeorQ", unsignedTypes, 4, veorCode) + + vbifCode = ''' + destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2); + ''' + threeEqualRegInst("vbif", "VbifD", unsignedTypes, 2, vbifCode, True) + threeEqualRegInst("vbif", "VbifQ", unsignedTypes, 4, vbifCode, True) + vbitCode = ''' + destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2); + ''' + threeEqualRegInst("vbit", "VbitD", unsignedTypes, 2, vbitCode, True) + threeEqualRegInst("vbit", "VbitQ", unsignedTypes, 4, vbitCode, True) + vbslCode = ''' + destElem = (srcElem1 & 
destElem) | (srcElem2 & ~destElem); + ''' + threeEqualRegInst("vbsl", "VbslD", unsignedTypes, 2, vbslCode, True) + threeEqualRegInst("vbsl", "VbslQ", unsignedTypes, 4, vbslCode, True) + + vmaxCode = ''' + destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2; + ''' + threeEqualRegInst("vmax", "VmaxD", allTypes, 2, vmaxCode) + threeEqualRegInst("vmax", "VmaxQ", allTypes, 4, vmaxCode) + + vminCode = ''' + destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2; + ''' + threeEqualRegInst("vmin", "VminD", allTypes, 2, vminCode) + threeEqualRegInst("vmin", "VminQ", allTypes, 4, vminCode) + + vaddCode = ''' + destElem = srcElem1 + srcElem2; + ''' + threeEqualRegInst("vadd", "NVaddD", unsignedTypes, 2, vaddCode) + threeEqualRegInst("vadd", "NVaddQ", unsignedTypes, 4, vaddCode) + + threeEqualRegInst("vpadd", "NVpaddD", unsignedTypes, + 2, vaddCode, pairwise=True) + threeEqualRegInst("vpadd", "NVpaddQ", unsignedTypes, + 4, vaddCode, pairwise=True) + vaddlwCode = ''' + destElem = (BigElement)srcElem1 + (BigElement)srcElem2; + ''' + threeRegLongInst("vaddl", "Vaddl", smallTypes, vaddlwCode) + threeRegWideInst("vaddw", "Vaddw", smallTypes, vaddlwCode) + vaddhnCode = ''' + destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInst("vaddhn", "Vaddhn", smallTypes, vaddhnCode) + vraddhnCode = ''' + destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 + + ((BigElement)1 << (sizeof(Element) * 8 - 1))) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInst("vraddhn", "Vraddhn", smallTypes, vraddhnCode) + + vsubCode = ''' + destElem = srcElem1 - srcElem2; + ''' + threeEqualRegInst("vsub", "NVsubD", unsignedTypes, 2, vsubCode) + threeEqualRegInst("vsub", "NVsubQ", unsignedTypes, 4, vsubCode) + vsublwCode = ''' + destElem = (BigElement)srcElem1 - (BigElement)srcElem2; + ''' + threeRegLongInst("vsubl", "Vsubl", smallTypes, vsublwCode) + threeRegWideInst("vsubw", "Vsubw", smallTypes, vsublwCode) + + vqaddUCode = ''' + destElem = srcElem1 + srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (destElem < srcElem1 || destElem < srcElem2) { + destElem = (Element)(-1); + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqadd", "VqaddUD", unsignedTypes, 2, vqaddUCode) + threeEqualRegInst("vqadd", "VqaddUQ", unsignedTypes, 4, vqaddUCode) + vsubhnCode = ''' + destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInst("vsubhn", "Vsubhn", smallTypes, vsubhnCode) + vrsubhnCode = ''' + destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 + + ((BigElement)1 << (sizeof(Element) * 8 - 1))) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInst("vrsubhn", "Vrsubhn", smallTypes, vrsubhnCode) + + vqaddSCode = ''' + destElem = srcElem1 + srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + bool negDest = (destElem < 0); + bool negSrc1 = (srcElem1 < 0); + bool negSrc2 = (srcElem2 < 0); + if ((negDest != negSrc1) && (negSrc1 == negSrc2)) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (negDest) + destElem -= 1; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqadd", "VqaddSD", signedTypes, 2, vqaddSCode) + threeEqualRegInst("vqadd", "VqaddSQ", signedTypes, 4, vqaddSCode) + + vqsubUCode = ''' + destElem = srcElem1 - srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (destElem > srcElem1) { + destElem = 0; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqsub", "VqsubUD", unsignedTypes, 2, vqsubUCode) + threeEqualRegInst("vqsub", "VqsubUQ", unsignedTypes, 4, vqsubUCode) + + vqsubSCode = ''' + 
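+ // Signed saturating subtract. Overflow is only possible when the operand + // signs differ; e.g. for int8_t, 100 - (-100) wraps to -56, and the sign + // flip relative to srcElem1 detected below saturates the result to 127.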
destElem = srcElem1 - srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + bool negDest = (destElem < 0); + bool negSrc1 = (srcElem1 < 0); + bool posSrc2 = (srcElem2 >= 0); + if ((negDest != negSrc1) && (negSrc1 == posSrc2)) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (negDest) + destElem -= 1; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqsub", "VqsubSD", signedTypes, 2, vqsubSCode) + threeEqualRegInst("vqsub", "VqsubSQ", signedTypes, 4, vqsubSCode) + + vcgtCode = ''' + destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0; + ''' + threeEqualRegInst("vcgt", "VcgtD", allTypes, 2, vcgtCode) + threeEqualRegInst("vcgt", "VcgtQ", allTypes, 4, vcgtCode) + + vcgeCode = ''' + destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0; + ''' + threeEqualRegInst("vcge", "VcgeD", allTypes, 2, vcgeCode) + threeEqualRegInst("vcge", "VcgeQ", allTypes, 4, vcgeCode) + + vceqCode = ''' + destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0; + ''' + threeEqualRegInst("vceq", "VceqD", unsignedTypes, 2, vceqCode) + threeEqualRegInst("vceq", "VceqQ", unsignedTypes, 4, vceqCode) + + vshlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + } else { + if (shiftAmt >= sizeof(Element) * 8) { + destElem = 0; + } else { + destElem = srcElem1 << shiftAmt; + } + } + ''' + threeEqualRegInst("vshl", "VshlD", allTypes, 2, vshlCode) + threeEqualRegInst("vshl", "VshlQ", allTypes, 4, vshlCode) + + vrshlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + Element rBit = 0; + if (shiftAmt <= sizeof(Element) * 8) + rBit = bits(srcElem1, shiftAmt - 1); + if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0) + rBit = 1; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + destElem += rBit; + } else if (shiftAmt > 0) { + if (shiftAmt >= sizeof(Element) * 8) { + destElem = 0; + } else { + destElem = srcElem1 << shiftAmt; + } + } else { + destElem = srcElem1; + } + ''' + threeEqualRegInst("vrshl", "VrshlD", allTypes, 2, vrshlCode) + threeEqualRegInst("vrshl", "VrshlQ", allTypes, 4, vrshlCode) + + vqshlUCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. 
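+ // (A plain >> may shift in zeros here; the OR below rebuilds the sign + // bits by hand.)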
+ if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + } else if (shiftAmt > 0) { + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - shiftAmt)) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = srcElem1 << shiftAmt; + } + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqshl", "VqshlUD", unsignedTypes, 2, vqshlUCode) + threeEqualRegInst("vqshl", "VqshlUQ", unsignedTypes, 4, vqshlUCode) + + vqshlSCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + } else if (shiftAmt > 0) { + bool sat = false; + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) + sat = true; + else + destElem = 0; + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - 1 - shiftAmt) != + ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) { + sat = true; + } else { + destElem = srcElem1 << shiftAmt; + } + } + if (sat) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqshl", "VqshlSD", signedTypes, 2, vqshlSCode) + threeEqualRegInst("vqshl", "VqshlSQ", signedTypes, 4, vqshlSCode) + + vqrshlUCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + Element rBit = 0; + if (shiftAmt <= sizeof(Element) * 8) + rBit = bits(srcElem1, shiftAmt - 1); + if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0) + rBit = 1; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + destElem += rBit; + } else { + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - shiftAmt)) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = srcElem1 << shiftAmt; + } + } + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqrshl", "VqrshlUD", unsignedTypes, 2, vqrshlUCode) + threeEqualRegInst("vqrshl", "VqrshlUQ", unsignedTypes, 4, vqrshlUCode) + + vqrshlSCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + Element rBit = 0; + if (shiftAmt <= sizeof(Element) * 8) + rBit = bits(srcElem1, shiftAmt - 1); + if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0) + rBit = 1; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. 
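+ // (Right shifts of negative values are implementation-defined in C++, + // so the sign bits are restored explicitly rather than trusting >>.)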
+ if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + destElem += rBit; + } else if (shiftAmt > 0) { + bool sat = false; + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) + sat = true; + else + destElem = 0; + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - 1 - shiftAmt) != + ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) { + sat = true; + } else { + destElem = srcElem1 << shiftAmt; + } + } + if (sat) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqrshl", "VqrshlSD", signedTypes, 2, vqrshlSCode) + threeEqualRegInst("vqrshl", "VqrshlSQ", signedTypes, 4, vqrshlSCode) + + vabaCode = ''' + destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) : + (srcElem2 - srcElem1); + ''' + threeEqualRegInst("vaba", "VabaD", allTypes, 2, vabaCode, True) + threeEqualRegInst("vaba", "VabaQ", allTypes, 4, vabaCode, True) + vabalCode = ''' + destElem += (srcElem1 > srcElem2) ? + ((BigElement)srcElem1 - (BigElement)srcElem2) : + ((BigElement)srcElem2 - (BigElement)srcElem1); + ''' + threeRegLongInst("vabal", "Vabal", smallTypes, vabalCode, True) + + vabdCode = ''' + destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) : + (srcElem2 - srcElem1); + ''' + threeEqualRegInst("vabd", "VabdD", allTypes, 2, vabdCode) + threeEqualRegInst("vabd", "VabdQ", allTypes, 4, vabdCode) + vabdlCode = ''' + destElem = (srcElem1 > srcElem2) ? + ((BigElement)srcElem1 - (BigElement)srcElem2) : + ((BigElement)srcElem2 - (BigElement)srcElem1); + ''' + threeRegLongInst("vabdl", "Vabdl", smallTypes, vabdlCode) + + vtstCode = ''' + destElem = (srcElem1 & srcElem2) ? 
(Element)(-1) : 0; + ''' + threeEqualRegInst("vtst", "VtstD", unsignedTypes, 2, vtstCode) + threeEqualRegInst("vtst", "VtstQ", unsignedTypes, 4, vtstCode) + + vmulCode = ''' + destElem = srcElem1 * srcElem2; + ''' + threeEqualRegInst("vmul", "NVmulD", allTypes, 2, vmulCode) + threeEqualRegInst("vmul", "NVmulQ", allTypes, 4, vmulCode) + vmullCode = ''' + destElem = (BigElement)srcElem1 * (BigElement)srcElem2; + ''' + threeRegLongInst("vmull", "Vmull", smallTypes, vmullCode) + + vmlaCode = ''' + destElem = destElem + srcElem1 * srcElem2; + ''' + threeEqualRegInst("vmla", "NVmlaD", allTypes, 2, vmlaCode, True) + threeEqualRegInst("vmla", "NVmlaQ", allTypes, 4, vmlaCode, True) + vmlalCode = ''' + destElem = destElem + (BigElement)srcElem1 * (BigElement)srcElem2; + ''' + threeRegLongInst("vmlal", "Vmlal", smallTypes, vmlalCode, True) + + vqdmlalCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2); + Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1); + Element halfNeg = maxNeg / 2; + if ((srcElem1 == maxNeg && srcElem2 == maxNeg) || + (srcElem1 == halfNeg && srcElem2 == maxNeg) || + (srcElem1 == maxNeg && srcElem2 == halfNeg)) { + midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8)); + fpscr.qc = 1; + } + bool negPreDest = (destElem < 0); + destElem += midElem; + bool negDest = (destElem < 0); + bool negMid = (midElem < 0); + if (negPreDest == negMid && negMid != negDest) { + destElem = mask(sizeof(BigElement) * 8 - 1); + if (negPreDest) + destElem = ~destElem; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeRegLongInst("vqdmlal", "Vqdmlal", smallTypes, vqdmlalCode, True) + + vqdmlslCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2); + Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1); + Element halfNeg = maxNeg / 2; + if ((srcElem1 == maxNeg && srcElem2 == maxNeg) || + (srcElem1 == halfNeg && srcElem2 == maxNeg) || + (srcElem1 == maxNeg && srcElem2 == halfNeg)) { + midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8)); + fpscr.qc = 1; + } + bool negPreDest = (destElem < 0); + destElem -= midElem; + bool negDest = (destElem < 0); + bool posMid = (midElem > 0); + if (negPreDest == posMid && posMid != negDest) { + destElem = mask(sizeof(BigElement) * 8 - 1); + if (negPreDest) + destElem = ~destElem; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeRegLongInst("vqdmlsl", "Vqdmlsl", smallTypes, vqdmlslCode, True) + + vqdmullCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2); + if (srcElem1 == srcElem2 && + srcElem1 == (Element)((Element)1 << + (Element)(sizeof(Element) * 8 - 1))) { + destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8)); + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeRegLongInst("vqdmull", "Vqdmull", smallTypes, vqdmullCode) + + vmlsCode = ''' + destElem = destElem - srcElem1 * srcElem2; + ''' + threeEqualRegInst("vmls", "NVmlsD", allTypes, 2, vmlsCode, True) + threeEqualRegInst("vmls", "NVmlsQ", allTypes, 4, vmlsCode, True) + vmlslCode = ''' + destElem = destElem - (BigElement)srcElem1 * (BigElement)srcElem2; + ''' + threeRegLongInst("vmlsl", "Vmlsl", smallTypes, vmlslCode, True) + + vmulpCode = ''' + destElem = 0; + for (unsigned j = 0; j < sizeof(Element) * 8; j++) { + if (bits(srcElem2, j)) + destElem ^= srcElem1 << j; + } + ''' + threeEqualRegInst("vmul", "NVmulpD", unsignedTypes, 2, vmulpCode) + threeEqualRegInst("vmul", "NVmulpQ", unsignedTypes, 4, vmulpCode) + vmullpCode = ''' + 
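+ // Widening polynomial (carry-less) multiply: XOR together the shifted + // partial products instead of adding them.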
destElem = 0; + for (unsigned j = 0; j < sizeof(Element) * 8; j++) { + if (bits(srcElem2, j)) + destElem ^= (BigElement)srcElem1 << j; + } + ''' + threeRegLongInst("vmull", "Vmullp", smallUnsignedTypes, vmullpCode) + + threeEqualRegInst("vpmax", "VpmaxD", allTypes, 2, vmaxCode, pairwise=True) + threeEqualRegInst("vpmax", "VpmaxQ", allTypes, 4, vmaxCode, pairwise=True) + + threeEqualRegInst("vpmin", "VpminD", allTypes, 2, vminCode, pairwise=True) + threeEqualRegInst("vpmin", "VpminQ", allTypes, 4, vminCode, pairwise=True) + + vqdmulhCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >> + (sizeof(Element) * 8); + if (srcElem1 == srcElem2 && + srcElem1 == (Element)((Element)1 << + (sizeof(Element) * 8 - 1))) { + destElem = ~srcElem1; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqdmulh", "VqdmulhD", smallSignedTypes, 2, vqdmulhCode) + threeEqualRegInst("vqdmulh", "VqdmulhQ", smallSignedTypes, 4, vqdmulhCode) + + vqrdmulhCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 + + ((int64_t)1 << (sizeof(Element) * 8 - 1))) >> + (sizeof(Element) * 8); + Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1); + Element halfNeg = maxNeg / 2; + if ((srcElem1 == maxNeg && srcElem2 == maxNeg) || + (srcElem1 == halfNeg && srcElem2 == maxNeg) || + (srcElem1 == maxNeg && srcElem2 == halfNeg)) { + if (destElem < 0) { + destElem = mask(sizeof(Element) * 8 - 1); + } else { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + } + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqrdmulh", "VqrdmulhD", + smallSignedTypes, 2, vqrdmulhCode) + threeEqualRegInst("vqrdmulh", "VqrdmulhQ", + smallSignedTypes, 4, vqrdmulhCode) + + vmaxfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + bool done; + destReg = processNans(fpscr, done, true, srcReg1, srcReg2); + if (!done) { + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMaxS, + true, true, VfpRoundNearest); + } else if (flushToZero(srcReg1, srcReg2)) { + fpscr.idc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmax", "VmaxDFp", ("float",), 2, vmaxfpCode) + threeEqualRegInstFp("vmax", "VmaxQFp", ("float",), 4, vmaxfpCode) + + vminfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + bool done; + destReg = processNans(fpscr, done, true, srcReg1, srcReg2); + if (!done) { + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMinS, + true, true, VfpRoundNearest); + } else if (flushToZero(srcReg1, srcReg2)) { + fpscr.idc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmin", "VminDFp", ("float",), 2, vminfpCode) + threeEqualRegInstFp("vmin", "VminQFp", ("float",), 4, vminfpCode) + + threeEqualRegInstFp("vpmax", "VpmaxDFp", ("float",), + 2, vmaxfpCode, pairwise=True) + threeEqualRegInstFp("vpmax", "VpmaxQFp", ("float",), + 4, vmaxfpCode, pairwise=True) + + threeEqualRegInstFp("vpmin", "VpminDFp", ("float",), + 2, vminfpCode, pairwise=True) + threeEqualRegInstFp("vpmin", "VpminQFp", ("float",), + 4, vminfpCode, pairwise=True) + + vaddfpCode = ''' + FPSCR fpscr = Fpscr; + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpAddS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vadd", "VaddDFp", ("float",), 2, vaddfpCode) + threeEqualRegInstFp("vadd", "VaddQFp", ("float",), 4, vaddfpCode) + + threeEqualRegInstFp("vpadd", "VpaddDFp", ("float",), + 2, vaddfpCode, pairwise=True) + threeEqualRegInstFp("vpadd", "VpaddQFp", ("float",), + 4, vaddfpCode, pairwise=True) + + vsubfpCode = ''' + FPSCR fpscr = Fpscr; + destReg = binaryOp(fpscr, 
srcReg1, srcReg2, fpSubS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vsub", "VsubDFp", ("float",), 2, vsubfpCode) + threeEqualRegInstFp("vsub", "VsubQFp", ("float",), 4, vsubfpCode) + + vmulfpCode = ''' + FPSCR fpscr = Fpscr; + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMulS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmul", "NVmulDFp", ("float",), 2, vmulfpCode) + threeEqualRegInstFp("vmul", "NVmulQFp", ("float",), 4, vmulfpCode) + + vmlafpCode = ''' + FPSCR fpscr = Fpscr; + float mid = binaryOp(fpscr, srcReg1, srcReg2, fpMulS, + true, true, VfpRoundNearest); + destReg = binaryOp(fpscr, mid, destReg, fpAddS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmla", "NVmlaDFp", ("float",), 2, vmlafpCode, True) + threeEqualRegInstFp("vmla", "NVmlaQFp", ("float",), 4, vmlafpCode, True) + + vmlsfpCode = ''' + FPSCR fpscr = Fpscr; + float mid = binaryOp(fpscr, srcReg1, srcReg2, fpMulS, + true, true, VfpRoundNearest); + destReg = binaryOp(fpscr, destReg, mid, fpSubS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmls", "NVmlsDFp", ("float",), 2, vmlsfpCode, True) + threeEqualRegInstFp("vmls", "NVmlsQFp", ("float",), 4, vmlsfpCode, True) + + vcgtfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vcgtFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vcgt", "VcgtDFp", ("float",), + 2, vcgtfpCode, toInt = True) + threeEqualRegInstFp("vcgt", "VcgtQFp", ("float",), + 4, vcgtfpCode, toInt = True) + + vcgefpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vcgeFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vcge", "VcgeDFp", ("float",), + 2, vcgefpCode, toInt = True) + threeEqualRegInstFp("vcge", "VcgeQFp", ("float",), + 4, vcgefpCode, toInt = True) + + vacgtfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vacgtFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vacgt", "VacgtDFp", ("float",), + 2, vacgtfpCode, toInt = True) + threeEqualRegInstFp("vacgt", "VacgtQFp", ("float",), + 4, vacgtfpCode, toInt = True) + + vacgefpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vacgeFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vacge", "VacgeDFp", ("float",), + 2, vacgefpCode, toInt = True) + threeEqualRegInstFp("vacge", "VacgeQFp", ("float",), + 4, vacgefpCode, toInt = True) + + vceqfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vceqFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? 
-1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vceq", "VceqDFp", ("float",), + 2, vceqfpCode, toInt = True) + threeEqualRegInstFp("vceq", "VceqQFp", ("float",), + 4, vceqfpCode, toInt = True) + + vrecpsCode = ''' + FPSCR fpscr = Fpscr; + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpRecpsS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vrecps", "VrecpsDFp", ("float",), 2, vrecpsCode) + threeEqualRegInstFp("vrecps", "VrecpsQFp", ("float",), 4, vrecpsCode) + + vrsqrtsCode = ''' + FPSCR fpscr = Fpscr; + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpRSqrtsS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vrsqrts", "VrsqrtsDFp", ("float",), 2, vrsqrtsCode) + threeEqualRegInstFp("vrsqrts", "VrsqrtsQFp", ("float",), 4, vrsqrtsCode) + + vabdfpCode = ''' + FPSCR fpscr = Fpscr; + float mid = binaryOp(fpscr, srcReg1, srcReg2, fpSubS, + true, true, VfpRoundNearest); + destReg = fabs(mid); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vabd", "VabdDFp", ("float",), 2, vabdfpCode) + threeEqualRegInstFp("vabd", "VabdQFp", ("float",), 4, vabdfpCode) + + twoEqualRegInst("vmla", "VmlasD", unsignedTypes, 2, vmlaCode, True) + twoEqualRegInst("vmla", "VmlasQ", unsignedTypes, 4, vmlaCode, True) + twoEqualRegInstFp("vmla", "VmlasDFp", ("float",), 2, vmlafpCode, True) + twoEqualRegInstFp("vmla", "VmlasQFp", ("float",), 4, vmlafpCode, True) + twoRegLongInst("vmlal", "Vmlals", smallTypes, vmlalCode, True) + + twoEqualRegInst("vmls", "VmlssD", allTypes, 2, vmlsCode, True) + twoEqualRegInst("vmls", "VmlssQ", allTypes, 4, vmlsCode, True) + twoEqualRegInstFp("vmls", "VmlssDFp", ("float",), 2, vmlsfpCode, True) + twoEqualRegInstFp("vmls", "VmlssQFp", ("float",), 4, vmlsfpCode, True) + twoRegLongInst("vmlsl", "Vmlsls", smallTypes, vmlslCode, True) + + twoEqualRegInst("vmul", "VmulsD", allTypes, 2, vmulCode) + twoEqualRegInst("vmul", "VmulsQ", allTypes, 4, vmulCode) + twoEqualRegInstFp("vmul", "VmulsDFp", ("float",), 2, vmulfpCode) + twoEqualRegInstFp("vmul", "VmulsQFp", ("float",), 4, vmulfpCode) + twoRegLongInst("vmull", "Vmulls", smallTypes, vmullCode) + + twoRegLongInst("vqdmull", "Vqdmulls", smallTypes, vqdmullCode) + twoRegLongInst("vqdmlal", "Vqdmlals", smallTypes, vqdmlalCode, True) + twoRegLongInst("vqdmlsl", "Vqdmlsls", smallTypes, vqdmlslCode, True) + twoEqualRegInst("vqdmulh", "VqdmulhsD", smallSignedTypes, 2, vqdmulhCode) + twoEqualRegInst("vqdmulh", "VqdmulhsQ", smallSignedTypes, 4, vqdmulhCode) + twoEqualRegInst("vqrdmulh", "VqrdmulhsD", + smallSignedTypes, 2, vqrdmulhCode) + twoEqualRegInst("vqrdmulh", "VqrdmulhsQ", + smallSignedTypes, 4, vqrdmulhCode) + + vshrCode = ''' + if (imm >= sizeof(srcElem1) * 8) { + if (srcElem1 < 0) + destElem = -1; + else + destElem = 0; + } else { + destElem = srcElem1 >> imm; + } + ''' + twoRegShiftInst("vshr", "NVshrD", allTypes, 2, vshrCode) + twoRegShiftInst("vshr", "NVshrQ", allTypes, 4, vshrCode) + + vsraCode = ''' + Element mid; + if (imm >= sizeof(srcElem1) * 8) { + mid = (srcElem1 < 0) ? 
-1 : 0; + } else { + mid = srcElem1 >> imm; + if (srcElem1 < 0 && mid >= 0) { + mid |= -(mid & ((Element)1 << + (sizeof(Element) * 8 - 1 - imm))); + } + } + destElem += mid; + ''' + twoRegShiftInst("vsra", "NVsraD", allTypes, 2, vsraCode, True) + twoRegShiftInst("vsra", "NVsraQ", allTypes, 4, vsraCode, True) + + vrshrCode = ''' + if (imm > sizeof(srcElem1) * 8) { + destElem = 0; + } else if (imm) { + Element rBit = bits(srcElem1, imm - 1); + destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit; + } else { + destElem = srcElem1; + } + ''' + twoRegShiftInst("vrshr", "NVrshrD", allTypes, 2, vrshrCode) + twoRegShiftInst("vrshr", "NVrshrQ", allTypes, 4, vrshrCode) + + vrsraCode = ''' + if (imm > sizeof(srcElem1) * 8) { + destElem += 0; + } else if (imm) { + Element rBit = bits(srcElem1, imm - 1); + destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit; + } else { + destElem += srcElem1; + } + ''' + twoRegShiftInst("vrsra", "NVrsraD", allTypes, 2, vrsraCode, True) + twoRegShiftInst("vrsra", "NVrsraQ", allTypes, 4, vrsraCode, True) + + vsriCode = ''' + if (imm >= sizeof(Element) * 8) + destElem = destElem; + else + destElem = (srcElem1 >> imm) | + (destElem & ~mask(sizeof(Element) * 8 - imm)); + ''' + twoRegShiftInst("vsri", "NVsriD", unsignedTypes, 2, vsriCode, True) + twoRegShiftInst("vsri", "NVsriQ", unsignedTypes, 4, vsriCode, True) + + vshlCode = ''' + if (imm >= sizeof(Element) * 8) + destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1; + else + destElem = srcElem1 << imm; + ''' + twoRegShiftInst("vshl", "NVshlD", unsignedTypes, 2, vshlCode) + twoRegShiftInst("vshl", "NVshlQ", unsignedTypes, 4, vshlCode) + + vsliCode = ''' + if (imm >= sizeof(Element) * 8) + destElem = destElem; + else + destElem = (srcElem1 << imm) | (destElem & mask(imm)); + ''' + twoRegShiftInst("vsli", "NVsliD", unsignedTypes, 2, vsliCode, True) + twoRegShiftInst("vsli", "NVsliQ", unsignedTypes, 4, vsliCode, True) + + vqshlCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (srcElem1 > 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = 0; + } + } else if (imm) { + destElem = (srcElem1 << imm); + uint64_t topBits = bits((uint64_t)srcElem1, + sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - 1 - imm); + if (topBits != 0 && topBits != mask(imm + 1)) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (srcElem1 > 0) + destElem = ~destElem; + fpscr.qc = 1; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegShiftInst("vqshl", "NVqshlD", signedTypes, 2, vqshlCode) + twoRegShiftInst("vqshl", "NVqshlQ", signedTypes, 4, vqshlCode) + + vqshluCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else if (imm) { + destElem = (srcElem1 << imm); + uint64_t topBits = bits((uint64_t)srcElem1, + sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - imm); + if (topBits != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegShiftInst("vqshlu", "NVqshluD", unsignedTypes, 2, vqshluCode) + twoRegShiftInst("vqshlu", "NVqshluQ", unsignedTypes, 4, vqshluCode) + + vqshlusCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm >= sizeof(Element) * 8) { + if (srcElem1 < 0) { + destElem = 0; + fpscr.qc = 1; + } else if (srcElem1 > 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } 
else { + destElem = 0; + } + } else if (imm) { + destElem = (srcElem1 << imm); + uint64_t topBits = bits((uint64_t)srcElem1, + sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - imm); + if (srcElem1 < 0) { + destElem = 0; + fpscr.qc = 1; + } else if (topBits != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } + } else { + if (srcElem1 < 0) { + fpscr.qc = 1; + destElem = 0; + } else { + destElem = srcElem1; + } + } + Fpscr = fpscr; + ''' + twoRegShiftInst("vqshlus", "NVqshlusD", signedTypes, 2, vqshlusCode) + twoRegShiftInst("vqshlus", "NVqshlusQ", signedTypes, 4, vqshlusCode) + + vshrnCode = ''' + if (imm >= sizeof(srcElem1) * 8) { + destElem = 0; + } else { + destElem = srcElem1 >> imm; + } + ''' + twoRegNarrowShiftInst("vshrn", "NVshrn", smallUnsignedTypes, vshrnCode) + + vrshrnCode = ''' + if (imm > sizeof(srcElem1) * 8) { + destElem = 0; + } else if (imm) { + Element rBit = bits(srcElem1, imm - 1); + destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit; + } else { + destElem = srcElem1; + } + ''' + twoRegNarrowShiftInst("vrshrn", "NVrshrn", smallUnsignedTypes, vrshrnCode) + + vqshrnCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0 && srcElem1 != -1) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = ((srcElem1 >> (imm - 1)) >> 1); + mid |= -(mid & ((BigElement)1 << + (sizeof(BigElement) * 8 - 1 - imm))); + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqshrn", "NVqshrn", smallSignedTypes, vqshrnCode) + + vqshrunCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = ((srcElem1 >> (imm - 1)) >> 1); + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqshrun", "NVqshrun", + smallUnsignedTypes, vqshrunCode) + + vqshrunsCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = ((srcElem1 >> (imm - 1)) >> 1); + if (bits(mid, sizeof(BigElement) * 8 - 1, + sizeof(Element) * 8) != 0) { + if (srcElem1 < 0) { + destElem = 0; + } else { + destElem = mask(sizeof(Element) * 8); + } + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqshrun", "NVqshruns", + smallSignedTypes, vqshrunsCode) + + vqrshrnCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0 && srcElem1 != -1) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = (srcElem1 >> (imm - 1)); + uint64_t rBit = mid & 0x1; + mid >>= 1; + mid |= -(mid & ((BigElement)1 << + (sizeof(BigElement) * 8 - 1 - imm))); + mid += rBit; + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + if (srcElem1 != (Element)srcElem1) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = srcElem1; + } + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqrshrn", "NVqrshrn", + 
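+ # signed saturating rounded narrowing shift right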
smallSignedTypes, vqrshrnCode) + + vqrshrunCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = (srcElem1 >> (imm - 1)); + uint64_t rBit = mid & 0x1; + mid >>= 1; + mid += rBit; + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + if (srcElem1 != (Element)srcElem1) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = srcElem1; + } + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqrshrun", "NVqrshrun", + smallUnsignedTypes, vqrshrunCode) + + vqrshrunsCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = (srcElem1 >> (imm - 1)); + uint64_t rBit = mid & 0x1; + mid >>= 1; + mid |= -(mid & ((BigElement)1 << + (sizeof(BigElement) * 8 - 1 - imm))); + mid += rBit; + if (bits(mid, sizeof(BigElement) * 8 - 1, + sizeof(Element) * 8) != 0) { + if (srcElem1 < 0) { + destElem = 0; + } else { + destElem = mask(sizeof(Element) * 8); + } + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + if (srcElem1 < 0) { + fpscr.qc = 1; + destElem = 0; + } else { + destElem = srcElem1; + } + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqrshrun", "NVqrshruns", + smallSignedTypes, vqrshrunsCode) + + vshllCode = ''' + if (imm >= sizeof(destElem) * 8) { + destElem = 0; + } else { + destElem = (BigElement)srcElem1 << imm; + } + ''' + twoRegLongShiftInst("vshll", "NVshll", smallTypes, vshllCode) + + vmovlCode = ''' + destElem = srcElem1; + ''' + twoRegLongShiftInst("vmovl", "NVmovl", smallTypes, vmovlCode) + + vcvt2ufxCode = ''' + FPSCR fpscr = Fpscr; + if (flushToZero(srcElem1)) + fpscr.idc = 1; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1)); + destReg = vfpFpSToFixed(srcElem1, false, false, imm); + __asm__ __volatile__("" :: "m" (destReg)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegShiftInst("vcvt", "NVcvt2ufxD", ("float",), + 2, vcvt2ufxCode, toInt = True) + twoRegShiftInst("vcvt", "NVcvt2ufxQ", ("float",), + 4, vcvt2ufxCode, toInt = True) + + vcvt2sfxCode = ''' + FPSCR fpscr = Fpscr; + if (flushToZero(srcElem1)) + fpscr.idc = 1; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1)); + destReg = vfpFpSToFixed(srcElem1, true, false, imm); + __asm__ __volatile__("" :: "m" (destReg)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegShiftInst("vcvt", "NVcvt2sfxD", ("float",), + 2, vcvt2sfxCode, toInt = True) + twoRegShiftInst("vcvt", "NVcvt2sfxQ", ("float",), + 4, vcvt2sfxCode, toInt = True) + + vcvtu2fpCode = ''' + FPSCR fpscr = Fpscr; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcReg1) : "m" (srcReg1)); + destElem = vfpUFixedToFpS(true, true, srcReg1, false, imm); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegShiftInst("vcvt", "NVcvtu2fpD", ("float",), + 2, vcvtu2fpCode, fromInt = True) + twoRegShiftInst("vcvt", "NVcvtu2fpQ", ("float",), + 4, vcvtu2fpCode, fromInt = True) + + vcvts2fpCode = ''' + FPSCR fpscr = Fpscr; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcReg1) : "m" (srcReg1)); + destElem = 
vfpSFixedToFpS(true, true, srcReg1, false, imm); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegShiftInst("vcvt", "NVcvts2fpD", ("float",), + 2, vcvts2fpCode, fromInt = True) + twoRegShiftInst("vcvt", "NVcvts2fpQ", ("float",), + 4, vcvts2fpCode, fromInt = True) + + vcvts2hCode = ''' + FPSCR fpscr = Fpscr; + float srcFp1 = bitsToFp(srcElem1, (float)0.0); + if (flushToZero(srcFp1)) + fpscr.idc = 1; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcFp1), "=m" (destElem) + : "m" (srcFp1), "m" (destElem)); + destElem = vcvtFpSFpH(fpscr, true, true, VfpRoundNearest, + fpscr.ahp, srcFp1); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegNarrowMiscInst("vcvt", "NVcvts2h", ("uint16_t",), vcvts2hCode) + + vcvth2sCode = ''' + FPSCR fpscr = Fpscr; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcElem1), "=m" (destElem) + : "m" (srcElem1), "m" (destElem)); + destElem = fpToBits(vcvtFpHFpS(fpscr, true, fpscr.ahp, srcElem1)); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegLongMiscInst("vcvt", "NVcvth2s", ("uint16_t",), vcvth2sCode) + + vrsqrteCode = ''' + destElem = unsignedRSqrtEstimate(srcElem1); + ''' + twoRegMiscInst("vrsqrte", "NVrsqrteD", ("uint32_t",), 2, vrsqrteCode) + twoRegMiscInst("vrsqrte", "NVrsqrteQ", ("uint32_t",), 4, vrsqrteCode) + + vrsqrtefpCode = ''' + FPSCR fpscr = Fpscr; + if (flushToZero(srcReg1)) + fpscr.idc = 1; + destReg = fprSqrtEstimate(fpscr, srcReg1); + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vrsqrte", "NVrsqrteDFp", ("float",), 2, vrsqrtefpCode) + twoRegMiscInstFp("vrsqrte", "NVrsqrteQFp", ("float",), 4, vrsqrtefpCode) + + vrecpeCode = ''' + destElem = unsignedRecipEstimate(srcElem1); + ''' + twoRegMiscInst("vrecpe", "NVrecpeD", ("uint32_t",), 2, vrecpeCode) + twoRegMiscInst("vrecpe", "NVrecpeQ", ("uint32_t",), 4, vrecpeCode) + + vrecpefpCode = ''' + FPSCR fpscr = Fpscr; + if (flushToZero(srcReg1)) + fpscr.idc = 1; + destReg = fpRecipEstimate(fpscr, srcReg1); + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vrecpe", "NVrecpeDFp", ("float",), 2, vrecpefpCode) + twoRegMiscInstFp("vrecpe", "NVrecpeQFp", ("float",), 4, vrecpefpCode) + + vrev16Code = ''' + destElem = srcElem1; + unsigned groupSize = ((1 << 1) / sizeof(Element)); + unsigned reverseMask = (groupSize - 1); + j = i ^ reverseMask; + ''' + twoRegMiscInst("vrev16", "NVrev16D", ("uint8_t",), 2, vrev16Code) + twoRegMiscInst("vrev16", "NVrev16Q", ("uint8_t",), 4, vrev16Code) + vrev32Code = ''' + destElem = srcElem1; + unsigned groupSize = ((1 << 2) / sizeof(Element)); + unsigned reverseMask = (groupSize - 1); + j = i ^ reverseMask; + ''' + twoRegMiscInst("vrev32", "NVrev32D", + ("uint8_t", "uint16_t"), 2, vrev32Code) + twoRegMiscInst("vrev32", "NVrev32Q", + ("uint8_t", "uint16_t"), 4, vrev32Code) + vrev64Code = ''' + destElem = srcElem1; + unsigned groupSize = ((1 << 3) / sizeof(Element)); + unsigned reverseMask = (groupSize - 1); + j = i ^ reverseMask; + ''' + twoRegMiscInst("vrev64", "NVrev64D", smallUnsignedTypes, 2, vrev64Code) + twoRegMiscInst("vrev64", "NVrev64Q", smallUnsignedTypes, 4, vrev64Code) + + vpaddlCode = ''' + destElem = (BigElement)srcElem1 + (BigElement)srcElem2; + ''' + twoRegCondenseInst("vpaddl", "NVpaddlD", smallTypes, 2, vpaddlCode) + twoRegCondenseInst("vpaddl", "NVpaddlQ", smallTypes, 4, vpaddlCode) + + vpadalCode = ''' + destElem += 
(BigElement)srcElem1 + (BigElement)srcElem2; + ''' + twoRegCondenseInst("vpadal", "NVpadalD", smallTypes, 2, vpadalCode, True) + twoRegCondenseInst("vpadal", "NVpadalQ", smallTypes, 4, vpadalCode, True) + + vclsCode = ''' + unsigned count = 0; + if (srcElem1 < 0) { + srcElem1 <<= 1; + while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) { + count++; + srcElem1 <<= 1; + } + } else { + srcElem1 <<= 1; + while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) { + count++; + srcElem1 <<= 1; + } + } + destElem = count; + ''' + twoRegMiscInst("vcls", "NVclsD", signedTypes, 2, vclsCode) + twoRegMiscInst("vcls", "NVclsQ", signedTypes, 4, vclsCode) + + vclzCode = ''' + unsigned count = 0; + while (srcElem1 >= 0 && count < sizeof(Element) * 8) { + count++; + srcElem1 <<= 1; + } + destElem = count; + ''' + twoRegMiscInst("vclz", "NVclzD", signedTypes, 2, vclzCode) + twoRegMiscInst("vclz", "NVclzQ", signedTypes, 4, vclzCode) + + vcntCode = ''' + unsigned count = 0; + while (srcElem1 && count < sizeof(Element) * 8) { + count += srcElem1 & 0x1; + srcElem1 >>= 1; + } + destElem = count; + ''' + twoRegMiscInst("vcnt", "NVcntD", unsignedTypes, 2, vcntCode) + twoRegMiscInst("vcnt", "NVcntQ", unsignedTypes, 4, vcntCode) + + vmvnCode = ''' + destElem = ~srcElem1; + ''' + twoRegMiscInst("vmvn", "NVmvnD", ("uint64_t",), 2, vmvnCode) + twoRegMiscInst("vmvn", "NVmvnQ", ("uint64_t",), 4, vmvnCode) + + vqabsCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) { + fpscr.qc = 1; + destElem = ~srcElem1; + } else if (srcElem1 < 0) { + destElem = -srcElem1; + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegMiscInst("vqabs", "NVqabsD", signedTypes, 2, vqabsCode) + twoRegMiscInst("vqabs", "NVqabsQ", signedTypes, 4, vqabsCode) + + vqnegCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) { + fpscr.qc = 1; + destElem = ~srcElem1; + } else { + destElem = -srcElem1; + } + Fpscr = fpscr; + ''' + twoRegMiscInst("vqneg", "NVqnegD", signedTypes, 2, vqnegCode) + twoRegMiscInst("vqneg", "NVqnegQ", signedTypes, 4, vqnegCode) + + vabsCode = ''' + if (srcElem1 < 0) { + destElem = -srcElem1; + } else { + destElem = srcElem1; + } + ''' + twoRegMiscInst("vabs", "NVabsD", signedTypes, 2, vabsCode) + twoRegMiscInst("vabs", "NVabsQ", signedTypes, 4, vabsCode) + vabsfpCode = ''' + union + { + uint32_t i; + float f; + } cStruct; + cStruct.f = srcReg1; + cStruct.i &= mask(sizeof(Element) * 8 - 1); + destReg = cStruct.f; + ''' + twoRegMiscInstFp("vabs", "NVabsDFp", ("float",), 2, vabsfpCode) + twoRegMiscInstFp("vabs", "NVabsQFp", ("float",), 4, vabsfpCode) + + vnegCode = ''' + destElem = -srcElem1; + ''' + twoRegMiscInst("vneg", "NVnegD", signedTypes, 2, vnegCode) + twoRegMiscInst("vneg", "NVnegQ", signedTypes, 4, vnegCode) + vnegfpCode = ''' + destReg = -srcReg1; + ''' + twoRegMiscInstFp("vneg", "NVnegDFp", ("float",), 2, vnegfpCode) + twoRegMiscInstFp("vneg", "NVnegQFp", ("float",), 4, vnegfpCode) + + vcgtCode = 'destElem = (srcElem1 > 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vcgt", "NVcgtD", signedTypes, 2, vcgtCode) + twoRegMiscInst("vcgt", "NVcgtQ", signedTypes, 4, vcgtCode) + vcgtfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vcgtFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? 
-1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vcgt", "NVcgtDFp", ("float",), + 2, vcgtfpCode, toInt = True) + twoRegMiscInstFp("vcgt", "NVcgtQFp", ("float",), + 4, vcgtfpCode, toInt = True) + + vcgeCode = 'destElem = (srcElem1 >= 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vcge", "NVcgeD", signedTypes, 2, vcgeCode) + twoRegMiscInst("vcge", "NVcgeQ", signedTypes, 4, vcgeCode) + vcgefpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vcgeFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vcge", "NVcgeDFp", ("float",), + 2, vcgefpCode, toInt = True) + twoRegMiscInstFp("vcge", "NVcgeQFp", ("float",), + 4, vcgefpCode, toInt = True) + + vceqCode = 'destElem = (srcElem1 == 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vceq", "NVceqD", signedTypes, 2, vceqCode) + twoRegMiscInst("vceq", "NVceqQ", signedTypes, 4, vceqCode) + vceqfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vceqFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vceq", "NVceqDFp", ("float",), + 2, vceqfpCode, toInt = True) + twoRegMiscInstFp("vceq", "NVceqQFp", ("float",), + 4, vceqfpCode, toInt = True) + + vcleCode = 'destElem = (srcElem1 <= 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vcle", "NVcleD", signedTypes, 2, vcleCode) + twoRegMiscInst("vcle", "NVcleQ", signedTypes, 4, vcleCode) + vclefpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vcleFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vcle", "NVcleDFp", ("float",), + 2, vclefpCode, toInt = True) + twoRegMiscInstFp("vcle", "NVcleQFp", ("float",), + 4, vclefpCode, toInt = True) + + vcltCode = 'destElem = (srcElem1 < 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vclt", "NVcltD", signedTypes, 2, vcltCode) + twoRegMiscInst("vclt", "NVcltQ", signedTypes, 4, vcltCode) + vcltfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vcltFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? 
-1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vclt", "NVcltDFp", ("float",), + 2, vcltfpCode, toInt = True) + twoRegMiscInstFp("vclt", "NVcltQFp", ("float",), + 4, vcltfpCode, toInt = True) + + vswpCode = ''' + FloatRegBits mid; + for (unsigned r = 0; r < rCount; r++) { + mid = srcReg1.regs[r]; + srcReg1.regs[r] = destReg.regs[r]; + destReg.regs[r] = mid; + } + ''' + twoRegMiscScramble("vswp", "NVswpD", ("uint64_t",), 2, vswpCode) + twoRegMiscScramble("vswp", "NVswpQ", ("uint64_t",), 4, vswpCode) + + vtrnCode = ''' + Element mid; + for (unsigned i = 0; i < eCount; i += 2) { + mid = srcReg1.elements[i]; + srcReg1.elements[i] = destReg.elements[i + 1]; + destReg.elements[i + 1] = mid; + } + ''' + twoRegMiscScramble("vtrn", "NVtrnD", unsignedTypes, 2, vtrnCode) + twoRegMiscScramble("vtrn", "NVtrnQ", unsignedTypes, 4, vtrnCode) + + vuzpCode = ''' + Element mid[eCount]; + memcpy(&mid, &srcReg1, sizeof(srcReg1)); + for (unsigned i = 0; i < eCount / 2; i++) { + srcReg1.elements[i] = destReg.elements[2 * i + 1]; + srcReg1.elements[eCount / 2 + i] = mid[2 * i + 1]; + destReg.elements[i] = destReg.elements[2 * i]; + } + for (unsigned i = 0; i < eCount / 2; i++) { + destReg.elements[eCount / 2 + i] = mid[2 * i]; + } + ''' + twoRegMiscScramble("vuzp", "NVuzpD", unsignedTypes, 2, vuzpCode) + twoRegMiscScramble("vuzp", "NVuzpQ", unsignedTypes, 4, vuzpCode) + + vzipCode = ''' + Element mid[eCount]; + memcpy(&mid, &destReg, sizeof(destReg)); + for (unsigned i = 0; i < eCount / 2; i++) { + destReg.elements[2 * i] = mid[i]; + destReg.elements[2 * i + 1] = srcReg1.elements[i]; + } + for (int i = 0; i < eCount / 2; i++) { + srcReg1.elements[2 * i] = mid[eCount / 2 + i]; + srcReg1.elements[2 * i + 1] = srcReg1.elements[eCount / 2 + i]; + } + ''' + twoRegMiscScramble("vzip", "NVzipD", unsignedTypes, 2, vzipCode) + twoRegMiscScramble("vzip", "NVzipQ", unsignedTypes, 4, vzipCode) + + vmovnCode = 'destElem = srcElem1;' + twoRegNarrowMiscInst("vmovn", "NVmovn", smallUnsignedTypes, vmovnCode) + + vdupCode = 'destElem = srcElem1;' + twoRegMiscScInst("vdup", "NVdupD", smallUnsignedTypes, 2, vdupCode) + twoRegMiscScInst("vdup", "NVdupQ", smallUnsignedTypes, 4, vdupCode) + + def vdupGprInst(name, Name, types, rCount): + global header_output, exec_output + eWalkCode = ''' + RegVect destReg; + for (unsigned i = 0; i < eCount; i++) { + destReg.elements[i] = htog((Element)Op1); + } + ''' + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + vdupGprInst("vdup", "NVdupDGpr", smallUnsignedTypes, 2) + vdupGprInst("vdup", "NVdupQGpr", smallUnsignedTypes, 4) + + vmovCode = 'destElem = imm;' + oneRegImmInst("vmov", "NVmoviD", ("uint64_t",), 2, vmovCode) + oneRegImmInst("vmov", "NVmoviQ", ("uint64_t",), 4, vmovCode) + + vorrCode = 'destElem |= imm;' + oneRegImmInst("vorr", "NVorriD", ("uint64_t",), 2, vorrCode, True) + oneRegImmInst("vorr", "NVorriQ", ("uint64_t",), 4, vorrCode, True) + + vmvnCode = 'destElem = ~imm;' + oneRegImmInst("vmvn", "NVmvniD", ("uint64_t",), 2, vmvnCode) + oneRegImmInst("vmvn", "NVmvniQ", ("uint64_t",), 4, vmvnCode) + + vbicCode = 'destElem &= 
~imm;' + oneRegImmInst("vbic", "NVbiciD", ("uint64_t",), 2, vbicCode, True) + oneRegImmInst("vbic", "NVbiciQ", ("uint64_t",), 4, vbicCode, True) + + vqmovnCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = srcElem1; + if ((BigElement)destElem != srcElem1) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + } + Fpscr = fpscr; + ''' + twoRegNarrowMiscInst("vqmovn", "NVqmovn", smallSignedTypes, vqmovnCode) + + vqmovunCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = srcElem1; + if ((BigElement)destElem != srcElem1) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8); + } + Fpscr = fpscr; + ''' + twoRegNarrowMiscInst("vqmovun", "NVqmovun", + smallUnsignedTypes, vqmovunCode) + + vqmovunsCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = srcElem1; + if (srcElem1 < 0 || + ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8); + if (srcElem1 < 0) + destElem = ~destElem; + } + Fpscr = fpscr; + ''' + twoRegNarrowMiscInst("vqmovun", "NVqmovuns", + smallSignedTypes, vqmovunsCode) + + def buildVext(name, Name, types, rCount, op): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1, srcReg2, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw); + ''' % { "reg" : reg } + eWalkCode += op + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + vextCode = ''' + for (unsigned i = 0; i < eCount; i++) { + unsigned index = i + imm; + if (index < eCount) { + destReg.elements[i] = srcReg1.elements[index]; + } else { + index -= eCount; + assert(index < eCount); + destReg.elements[i] = srcReg2.elements[index]; + } + } + ''' + buildVext("vext", "NVextD", ("uint8_t",), 2, vextCode) + buildVext("vext", "NVextQ", ("uint8_t",), 4, vextCode) + + def buildVtbxl(name, Name, length, isVtbl): + global header_output, decoder_output, exec_output + code = ''' + union + { + uint8_t bytes[32]; + FloatRegBits regs[8]; + } table; + + union + { + uint8_t bytes[8]; + FloatRegBits regs[2]; + } destReg, srcReg2; + + const unsigned length = %(length)d; + const bool isVtbl = %(isVtbl)s; + + srcReg2.regs[0] = htog(FpOp2P0.uw); + srcReg2.regs[1] = htog(FpOp2P1.uw); + + destReg.regs[0] = htog(FpDestP0.uw); + destReg.regs[1] = htog(FpDestP1.uw); + ''' % { "length" : length, "isVtbl" : isVtbl } + for reg in range(8): + if reg < length * 2: + code += 'table.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);\n' % \ + { "reg" : reg } + else: + code += 'table.regs[%(reg)d] = 0;\n' % { "reg" : reg } + code += ''' + for (unsigned i = 0; i < sizeof(destReg); i++) { + uint8_t index = srcReg2.bytes[i]; + if (index < 8 * length) { + destReg.bytes[i] = table.bytes[index]; + } else { + if (isVtbl) + destReg.bytes[i] = 0; + // else destReg.bytes[i] unchanged + } + } + + FpDestP0.uw = gtoh(destReg.regs[0]); + FpDestP1.uw = gtoh(destReg.regs[1]); + ''' + iop = InstObjParams(name, Name, + "RegRegRegOp", + { "code": code, + "predicate_test": predicateTest }, []) + header_output 
+= RegRegRegOpDeclare.subst(iop) + decoder_output += RegRegRegOpConstructor.subst(iop) + exec_output += PredOpExecute.subst(iop) + + buildVtbxl("vtbl", "NVtbl1", 1, "true") + buildVtbxl("vtbl", "NVtbl2", 2, "true") + buildVtbxl("vtbl", "NVtbl3", 3, "true") + buildVtbxl("vtbl", "NVtbl4", 4, "true") + + buildVtbxl("vtbx", "NVtbx1", 1, "false") + buildVtbxl("vtbx", "NVtbx2", 2, "false") + buildVtbxl("vtbx", "NVtbx3", 3, "false") + buildVtbxl("vtbx", "NVtbx4", 4, "false") +}}; diff --git a/src/arch/arm/isa/operands.isa b/src/arch/arm/isa/operands.isa index a086bb03c..5490a28e0 100644 --- a/src/arch/arm/isa/operands.isa +++ b/src/arch/arm/isa/operands.isa @@ -47,6 +47,7 @@ def operand_types {{ 'sw' : ('signed int', 32), 'uw' : ('unsigned int', 32), 'ud' : ('unsigned int', 64), + 'tud' : ('twin64 int', 64), 'sf' : ('float', 32), 'df' : ('float', 64) }}; @@ -96,6 +97,18 @@ def operands {{ 'FpDestP1': ('FloatReg', 'sf', '(dest + 1)', 'IsFloating', 2), 'FpDestP2': ('FloatReg', 'sf', '(dest + 2)', 'IsFloating', 2), 'FpDestP3': ('FloatReg', 'sf', '(dest + 3)', 'IsFloating', 2), + 'FpDestP4': ('FloatReg', 'sf', '(dest + 4)', 'IsFloating', 2), + 'FpDestP5': ('FloatReg', 'sf', '(dest + 5)', 'IsFloating', 2), + 'FpDestP6': ('FloatReg', 'sf', '(dest + 6)', 'IsFloating', 2), + 'FpDestP7': ('FloatReg', 'sf', '(dest + 7)', 'IsFloating', 2), + 'FpDestS0P0': ('FloatReg', 'sf', '(dest + step * 0 + 0)', 'IsFloating', 2), + 'FpDestS0P1': ('FloatReg', 'sf', '(dest + step * 0 + 1)', 'IsFloating', 2), + 'FpDestS1P0': ('FloatReg', 'sf', '(dest + step * 1 + 0)', 'IsFloating', 2), + 'FpDestS1P1': ('FloatReg', 'sf', '(dest + step * 1 + 1)', 'IsFloating', 2), + 'FpDestS2P0': ('FloatReg', 'sf', '(dest + step * 2 + 0)', 'IsFloating', 2), + 'FpDestS2P1': ('FloatReg', 'sf', '(dest + step * 2 + 1)', 'IsFloating', 2), + 'FpDestS3P0': ('FloatReg', 'sf', '(dest + step * 3 + 0)', 'IsFloating', 2), + 'FpDestS3P1': ('FloatReg', 'sf', '(dest + step * 3 + 1)', 'IsFloating', 2), 'Result': ('IntReg', 'uw', 'result', 'IsInteger', 2, maybePCRead, maybePCWrite), 'Dest2': ('IntReg', 'uw', 'dest2', 'IsInteger', 2, @@ -124,6 +137,18 @@ def operands {{ 'FpOp1P1': ('FloatReg', 'sf', '(op1 + 1)', 'IsFloating', 2), 'FpOp1P2': ('FloatReg', 'sf', '(op1 + 2)', 'IsFloating', 2), 'FpOp1P3': ('FloatReg', 'sf', '(op1 + 3)', 'IsFloating', 2), + 'FpOp1P4': ('FloatReg', 'sf', '(op1 + 4)', 'IsFloating', 2), + 'FpOp1P5': ('FloatReg', 'sf', '(op1 + 5)', 'IsFloating', 2), + 'FpOp1P6': ('FloatReg', 'sf', '(op1 + 6)', 'IsFloating', 2), + 'FpOp1P7': ('FloatReg', 'sf', '(op1 + 7)', 'IsFloating', 2), + 'FpOp1S0P0': ('FloatReg', 'sf', '(op1 + step * 0 + 0)', 'IsFloating', 2), + 'FpOp1S0P1': ('FloatReg', 'sf', '(op1 + step * 0 + 1)', 'IsFloating', 2), + 'FpOp1S1P0': ('FloatReg', 'sf', '(op1 + step * 1 + 0)', 'IsFloating', 2), + 'FpOp1S1P1': ('FloatReg', 'sf', '(op1 + step * 1 + 1)', 'IsFloating', 2), + 'FpOp1S2P0': ('FloatReg', 'sf', '(op1 + step * 2 + 0)', 'IsFloating', 2), + 'FpOp1S2P1': ('FloatReg', 'sf', '(op1 + step * 2 + 1)', 'IsFloating', 2), + 'FpOp1S3P0': ('FloatReg', 'sf', '(op1 + step * 3 + 0)', 'IsFloating', 2), + 'FpOp1S3P1': ('FloatReg', 'sf', '(op1 + step * 3 + 1)', 'IsFloating', 2), 'MiscOp1': ('ControlReg', 'uw', 'op1', (None, None, 'IsControl'), 2), 'Op2': ('IntReg', 'uw', 'op2', 'IsInteger', 2, maybePCRead, maybePCWrite), @@ -164,6 +189,7 @@ def operands {{ maybePCRead, maybeIWPCWrite), 'Fa' : ('FloatReg', 'sf', 'ura', 'IsFloating', 2), 'Rb' : ('IntReg', 'uw', 'urb', 'IsInteger', 2, maybePCRead, maybePCWrite), + 'Rc' : ('IntReg', 'uw', 'urc', 
'IsInteger', 2, maybePCRead, maybePCWrite), #General Purpose Floating Point Reg Operands 'Fd': ('FloatReg', 'df', 'FD', 'IsFloating', 2), diff --git a/src/arch/arm/isa/templates/macromem.isa b/src/arch/arm/isa/templates/macromem.isa index 400342a29..5397a2637 100644 --- a/src/arch/arm/isa/templates/macromem.isa +++ b/src/arch/arm/isa/templates/macromem.isa @@ -74,7 +74,32 @@ def template MicroMemConstructor {{ //////////////////////////////////////////////////////////////////// // -// Integer = Integer op Immediate microops +// Neon load/store microops +// + +def template MicroNeonMemDeclare {{ + template <class Element> + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst, RegIndex _dest, + RegIndex _ura, uint32_t _imm, unsigned extraMemFlags) + : %(base_class)s("%(mnemonic)s", machInst, + %(op_class)s, _dest, _ura, _imm) + { + memAccessFlags |= extraMemFlags; + %(constructor)s; + } + + %(BasicExecDeclare)s + %(InitiateAccDeclare)s + %(CompleteAccDeclare)s + }; +}}; + +//////////////////////////////////////////////////////////////////// +// +// Integer = Integer op Integer microops // def template MicroIntDeclare {{ @@ -82,13 +107,130 @@ def template MicroIntDeclare {{ { public: %(class_name)s(ExtMachInst machInst, + RegIndex _ura, RegIndex _urb, RegIndex _urc); + %(BasicExecDeclare)s + }; +}}; + +def template MicroIntConstructor {{ + %(class_name)s::%(class_name)s(ExtMachInst machInst, + RegIndex _ura, + RegIndex _urb, + RegIndex _urc) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, + _ura, _urb, _urc) + { + %(constructor)s; + } +}}; + +def template MicroNeonMemExecDeclare {{ + template + Fault %(class_name)s<%(targs)s>::execute( + %(CPU_exec_context)s *, Trace::InstRecord *) const; + template + Fault %(class_name)s<%(targs)s>::initiateAcc( + %(CPU_exec_context)s *, Trace::InstRecord *) const; + template + Fault %(class_name)s<%(targs)s>::completeAcc(PacketPtr, + %(CPU_exec_context)s *, Trace::InstRecord *) const; +}}; + +def template MicroNeonExecDeclare {{ + template + Fault %(class_name)s<%(targs)s>::execute( + %(CPU_exec_context)s *, Trace::InstRecord *) const; +}}; + +//////////////////////////////////////////////////////////////////// +// +// Neon (de)interlacing microops +// + +def template MicroNeonMixDeclare {{ + template <class Element> + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst, RegIndex _dest, RegIndex _op1, + uint8_t _step) : + %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, + _dest, _op1, _step) + { + %(constructor)s; + } + + %(BasicExecDeclare)s + }; +}}; + +def template MicroNeonMixExecute {{ + template <class Element> + Fault %(class_name)s<Element>::execute(%(CPU_exec_context)s *xc, + Trace::InstRecord *traceData) const + { + Fault fault = NoFault; + uint64_t resTemp = 0; + resTemp = resTemp; + %(op_decl)s; + %(op_rd)s; + + if (%(predicate_test)s) + { + %(code)s; + if (fault == NoFault) + { + %(op_wb)s; + } + } + + if (fault == NoFault && machInst.itstateMask != 0) { + xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate); + } + + return fault; + } +}}; + +//////////////////////////////////////////////////////////////////// +// +// Neon (un)packing microops using a particular lane +// + +def template MicroNeonMixLaneDeclare {{ + template <class Element> + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst, RegIndex _dest, RegIndex _op1, + uint8_t _step, unsigned _lane) : + %(base_class)s("%(mnemonic)s", 
machInst, %(op_class)s, + _dest, _op1, _step, _lane) + { + %(constructor)s; + } + + %(BasicExecDeclare)s + }; +}}; + +//////////////////////////////////////////////////////////////////// +// +// Integer = Integer op Immediate microops +// + +def template MicroIntImmDeclare {{ + class %(class_name)s : public %(base_class)s + { + public: + %(class_name)s(ExtMachInst machInst, RegIndex _ura, RegIndex _urb, uint8_t _imm); %(BasicExecDeclare)s }; }}; -def template MicroIntConstructor {{ +def template MicroIntImmConstructor {{ %(class_name)s::%(class_name)s(ExtMachInst machInst, RegIndex _ura, RegIndex _urb, @@ -132,6 +274,52 @@ def template MacroMemConstructor {{ }}; +def template VMemMultDeclare {{ +class %(class_name)s : public %(base_class)s +{ + public: + // Constructor + %(class_name)s(ExtMachInst machInst, unsigned width, + RegIndex rn, RegIndex vd, unsigned regs, unsigned inc, + uint32_t size, uint32_t align, RegIndex rm); + %(BasicExecPanic)s +}; +}}; + +def template VMemMultConstructor {{ +%(class_name)s::%(class_name)s(ExtMachInst machInst, unsigned width, + RegIndex rn, RegIndex vd, unsigned regs, unsigned inc, + uint32_t size, uint32_t align, RegIndex rm) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, width, + rn, vd, regs, inc, size, align, rm) +{ + %(constructor)s; +} +}}; + +def template VMemSingleDeclare {{ +class %(class_name)s : public %(base_class)s +{ + public: + // Constructor + %(class_name)s(ExtMachInst machInst, bool all, unsigned width, + RegIndex rn, RegIndex vd, unsigned regs, unsigned inc, + uint32_t size, uint32_t align, RegIndex rm, unsigned lane = 0); + %(BasicExecPanic)s +}; +}}; + +def template VMemSingleConstructor {{ +%(class_name)s::%(class_name)s(ExtMachInst machInst, bool all, unsigned width, + RegIndex rn, RegIndex vd, unsigned regs, unsigned inc, + uint32_t size, uint32_t align, RegIndex rm, unsigned lane) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, all, width, + rn, vd, regs, inc, size, align, rm, lane) +{ + %(constructor)s; +} +}}; + def template MacroVFPMemDeclare {{ /** * Static instructions class for a store multiple instruction diff --git a/src/arch/arm/isa/templates/mem.isa b/src/arch/arm/isa/templates/mem.isa index 84cd1dd8f..686a8b0aa 100644 --- a/src/arch/arm/isa/templates/mem.isa +++ b/src/arch/arm/isa/templates/mem.isa @@ -180,6 +180,42 @@ def template LoadExecute {{ } }}; +def template NeonLoadExecute {{ + template <class Element> + Fault %(class_name)s<Element>::execute( + %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const + { + Addr EA; + Fault fault = NoFault; + + %(op_decl)s; + %(mem_decl)s; + %(op_rd)s; + %(ea_code)s; + + MemUnion memUnion; + uint8_t *dataPtr = memUnion.bytes; + + if (%(predicate_test)s) + { + if (fault == NoFault) { + fault = xc->readBytes(EA, dataPtr, %(size)d, memAccessFlags); + %(memacc_code)s; + } + + if (fault == NoFault) { + %(op_wb)s; + } + } + + if (fault == NoFault && machInst.itstateMask != 0) { + xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate); + } + + return fault; + } +}}; + def template StoreExecute {{ Fault %(class_name)s::execute(%(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const @@ -217,6 +253,46 @@ def template StoreExecute {{ } }}; +def template NeonStoreExecute {{ + template <class Element> + Fault %(class_name)s<Element>::execute( + %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const + { + Addr EA; + Fault fault = NoFault; + + %(op_decl)s; + %(mem_decl)s; + %(op_rd)s; + %(ea_code)s; + + MemUnion memUnion; + uint8_t *dataPtr = 
memUnion.bytes; + + if (%(predicate_test)s) + { + if (fault == NoFault) { + %(memacc_code)s; + } + + if (fault == NoFault) { + fault = xc->writeBytes(dataPtr, %(size)d, EA, + memAccessFlags, NULL); + } + + if (fault == NoFault) { + %(op_wb)s; + } + } + + if (fault == NoFault && machInst.itstateMask != 0) { + xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate); + } + + return fault; + } +}}; + def template StoreExExecute {{ Fault %(class_name)s::execute(%(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const @@ -336,6 +412,45 @@ def template StoreInitiateAcc {{ } }}; +def template NeonStoreInitiateAcc {{ + template <class Element> + Fault %(class_name)s<Element>::initiateAcc( + %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const + { + Addr EA; + Fault fault = NoFault; + + %(op_decl)s; + %(mem_decl)s; + %(op_rd)s; + %(ea_code)s; + + if (%(predicate_test)s) + { + MemUnion memUnion; + if (fault == NoFault) { + %(memacc_code)s; + } + + if (fault == NoFault) { + fault = xc->writeBytes(memUnion.bytes, %(size)d, EA, + memAccessFlags, NULL); + } + + // Need to write back any potential address register update + if (fault == NoFault) { + %(op_wb)s; + } + } + + if (fault == NoFault && machInst.itstateMask != 0) { + xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate); + } + + return fault; + } +}}; + def template LoadInitiateAcc {{ Fault %(class_name)s::initiateAcc(%(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const @@ -363,6 +478,31 @@ def template LoadInitiateAcc {{ } }}; +def template NeonLoadInitiateAcc {{ + template <class Element> + Fault %(class_name)s<Element>::initiateAcc( + %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const + { + Addr EA; + Fault fault = NoFault; + + %(op_src_decl)s; + %(op_rd)s; + %(ea_code)s; + + if (%(predicate_test)s) + { + if (fault == NoFault) { + fault = xc->readBytes(EA, NULL, %(size)d, memAccessFlags); + } + } else if (fault == NoFault && machInst.itstateMask != 0) { + xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate); + } + + return fault; + } +}}; + def template LoadCompleteAcc {{ Fault %(class_name)s::completeAcc(PacketPtr pkt, %(CPU_exec_context)s *xc, @@ -395,6 +535,40 @@ def template LoadCompleteAcc {{ } }}; +def template NeonLoadCompleteAcc {{ + template <class Element> + Fault %(class_name)s<Element>::completeAcc( + PacketPtr pkt, %(CPU_exec_context)s *xc, + Trace::InstRecord *traceData) const + { + Fault fault = NoFault; + + %(mem_decl)s; + %(op_decl)s; + %(op_rd)s; + + if (%(predicate_test)s) + { + // ARM instructions will not have a pkt if the predicate is false + MemUnion &memUnion = *(MemUnion *)pkt->getPtr<uint8_t>(); + + if (fault == NoFault) { + %(memacc_code)s; + } + + if (fault == NoFault) { + %(op_wb)s; + } + } + + if (fault == NoFault && machInst.itstateMask != 0) { + xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate); + } + + return fault; + } +}}; + def template StoreCompleteAcc {{ Fault %(class_name)s::completeAcc(PacketPtr pkt, %(CPU_exec_context)s *xc, @@ -420,6 +594,32 @@ def template StoreCompleteAcc {{ } }}; +def template NeonStoreCompleteAcc {{ + template <class Element> + Fault %(class_name)s<Element>::completeAcc( + PacketPtr pkt, %(CPU_exec_context)s *xc, + Trace::InstRecord *traceData) const + { + Fault fault = NoFault; + + %(op_decl)s; + %(op_rd)s; + + if (%(predicate_test)s) + { + if (fault == NoFault) { + %(op_wb)s; + } + } + + if (fault == NoFault && machInst.itstateMask != 0) { + xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate); + } + + return fault; + } +}}; + def template 
StoreExCompleteAcc {{ Fault %(class_name)s::completeAcc(PacketPtr pkt, %(CPU_exec_context)s *xc, diff --git a/src/arch/arm/isa/templates/neon.isa b/src/arch/arm/isa/templates/neon.isa new file mode 100644 index 000000000..e402979dc --- /dev/null +++ b/src/arch/arm/isa/templates/neon.isa @@ -0,0 +1,227 @@ +// -*- mode:c++ -*- + +// Copyright (c) 2010 ARM Limited +// All rights reserved +// +// The license below extends only to copyright in the software and shall +// not be construed as granting a license to any other intellectual +// property including but not limited to intellectual property relating +// to a hardware implementation of the functionality of the software +// licensed hereunder. You may use the software subject to the license +// terms below provided that you ensure that this notice is replicated +// unmodified and in its entirety in all distributions of the software, +// modified or unmodified, in source code or in binary form. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer; +// redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution; +// neither the name of the copyright holders nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Authors: Gabe Black + +def template NeonRegRegRegOpDeclare {{ +template <class _Element> +class %(class_name)s : public %(base_class)s +{ + protected: + typedef _Element Element; + public: + // Constructor + %(class_name)s(ExtMachInst machInst, + IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, + _dest, _op1, _op2) + { + %(constructor)s; + } + + %(BasicExecDeclare)s +}; +}}; + +def template NeonRegRegRegImmOpDeclare {{ +template <class _Element> +class %(class_name)s : public %(base_class)s +{ + protected: + typedef _Element Element; + public: + // Constructor + %(class_name)s(ExtMachInst machInst, + IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2, + uint64_t _imm) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, + _dest, _op1, _op2, _imm) + { + %(constructor)s; + } + + %(BasicExecDeclare)s +}; +}}; + +def template NeonRegRegImmOpDeclare {{ +template <class _Element> +class %(class_name)s : public %(base_class)s +{ + protected: + typedef _Element Element; + public: + // Constructor + %(class_name)s(ExtMachInst machInst, + IntRegIndex _dest, IntRegIndex _op1, uint64_t _imm) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, + _dest, _op1, _imm) + { + %(constructor)s; + } + + %(BasicExecDeclare)s +}; +}}; + +def template NeonRegImmOpDeclare {{ +template <class _Element> +class %(class_name)s : public %(base_class)s +{ + protected: + typedef _Element Element; + public: + // Constructor + %(class_name)s(ExtMachInst machInst, IntRegIndex _dest, uint64_t _imm) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _dest, _imm) + { + %(constructor)s; + } + + %(BasicExecDeclare)s +}; +}}; + +def template NeonRegRegOpDeclare {{ +template <class _Element> +class %(class_name)s : public %(base_class)s +{ + protected: + typedef _Element Element; + public: + // Constructor + %(class_name)s(ExtMachInst machInst, + IntRegIndex _dest, IntRegIndex _op1) + : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, + _dest, _op1) + { + %(constructor)s; + } + + %(BasicExecDeclare)s +}; +}}; + +def template NeonExecDeclare {{ + template + Fault %(class_name)s<%(targs)s>::execute( + %(CPU_exec_context)s *, Trace::InstRecord *) const; +}}; + +def template NeonEqualRegExecute {{ + template <class Element> + Fault %(class_name)s<Element>::execute(%(CPU_exec_context)s *xc, + Trace::InstRecord *traceData) const + { + Fault fault = NoFault; + %(op_decl)s; + %(op_rd)s; + + const unsigned rCount = %(r_count)d; + const unsigned eCount = rCount * sizeof(FloatRegBits) / sizeof(Element); + + union RegVect { + FloatRegBits regs[rCount]; + Element elements[eCount]; + }; + + if (%(predicate_test)s) + { + %(code)s; + if (fault == NoFault) + { + %(op_wb)s; + } + } + + if (fault == NoFault && machInst.itstateMask != 0) { + xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate); + } + + return fault; + } +}}; + +output header {{ + uint16_t nextBiggerType(uint8_t); + uint32_t nextBiggerType(uint16_t); + uint64_t nextBiggerType(uint32_t); + int16_t nextBiggerType(int8_t); + int32_t nextBiggerType(int16_t); + int64_t nextBiggerType(int32_t); +}}; + +def template NeonUnequalRegExecute {{ + template <class Element> + Fault %(class_name)s<Element>::execute(%(CPU_exec_context)s *xc, + Trace::InstRecord *traceData) const + { + typedef typeof(nextBiggerType((Element)0)) BigElement; + Fault fault = NoFault; + %(op_decl)s; + %(op_rd)s; + + const unsigned rCount = %(r_count)d; + const unsigned eCount = rCount * sizeof(FloatRegBits) / 
sizeof(Element);
+
+        union RegVect {
+            FloatRegBits regs[rCount];
+            Element elements[eCount];
+            BigElement bigElements[eCount / 2];
+        };
+
+        union BigRegVect {
+            FloatRegBits regs[2 * rCount];
+            BigElement elements[eCount];
+        };
+
+        if (%(predicate_test)s)
+        {
+            %(code)s;
+            if (fault == NoFault)
+            {
+                %(op_wb)s;
+            }
+        }
+
+        if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
diff --git a/src/arch/arm/isa/templates/templates.isa b/src/arch/arm/isa/templates/templates.isa
index 2584ec1f2..148139225 100644
--- a/src/arch/arm/isa/templates/templates.isa
+++ b/src/arch/arm/isa/templates/templates.isa
@@ -60,3 +60,6 @@
 //Templates for VFP instructions
 ##include "vfp.isa"
+
+//Templates for Neon instructions
+##include "neon.isa"
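
A note on the nextBiggerType declarations above: they are declared but never defined, because NeonUnequalRegExecute only uses them inside gcc's typeof to map an element type to the type twice its width. Below is a minimal standalone sketch of the same idiom, using C++11 decltype in place of typeof; the function pairwiseAdd and the surrounding scaffolding are illustrative, not part of this patch.

    #include <cstdint>
    #include <type_traits>

    // Declarations only: these overloads are never called at run time.
    // They exist so an unevaluated decltype() expression can map an
    // element type to the type twice its width via overload resolution.
    uint16_t nextBiggerType(uint8_t);
    uint32_t nextBiggerType(uint16_t);
    uint64_t nextBiggerType(uint32_t);

    // Widen two narrow elements and add them -- the core operation of a
    // long pairwise add such as vpaddl. (Name is illustrative.)
    template <class Element>
    decltype(nextBiggerType(Element(0))) pairwiseAdd(Element a, Element b)
    {
        typedef decltype(nextBiggerType(Element(0))) BigElement;
        return (BigElement)a + (BigElement)b; // widen before adding
    }

    static_assert(std::is_same<decltype(pairwiseAdd(uint8_t(0), uint8_t(0))),
                               uint16_t>::value,
                  "uint8_t widens to uint16_t");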
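
Similarly, the rounding shifts earlier in this patch (vrshrn, vqrshrn, vqrshrun, and friends) all compute ((srcElem1 >> (imm - 1)) >> 1) + rBit instead of shifting by imm in one step. The reason is that imm may equal the element width, and shifting an N-bit integer by N positions is undefined behavior in C++; splitting the shift keeps each step strictly smaller than the width for the whole 1..N range. A minimal restatement of that trick in isolation, assuming a 64-bit element (roundingShiftRight is an illustrative name, not gem5's):

    #include <cassert>
    #include <cstdint>

    // Right shift with round-half-up for 1 <= imm <= 64. Shifting a
    // uint64_t by 64 in one step would be undefined behavior, so the
    // shift is split into two steps that are each at most 63.
    uint64_t roundingShiftRight(uint64_t x, unsigned imm)
    {
        assert(imm >= 1 && imm <= 64);
        uint64_t rBit = (x >> (imm - 1)) & 0x1; // last bit shifted out
        return ((x >> (imm - 1)) >> 1) + rBit;
    }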