-rw-r--r--  src/arch/arm/insts/macromem.cc           |  684
-rw-r--r--  src/arch/arm/insts/macromem.hh           |  118
-rw-r--r--  src/arch/arm/insts/pred_inst.hh          |   14
-rw-r--r--  src/arch/arm/insts/static_inst.hh        |   22
-rw-r--r--  src/arch/arm/insts/vfp.cc                |  330
-rw-r--r--  src/arch/arm/insts/vfp.hh                |  123
-rw-r--r--  src/arch/arm/isa/decoder/thumb.isa       |    2
-rw-r--r--  src/arch/arm/isa/formats/fp.isa          | 1643
-rw-r--r--  src/arch/arm/isa/insts/fp.isa            |  176
-rw-r--r--  src/arch/arm/isa/insts/insts.isa         |    5
-rw-r--r--  src/arch/arm/isa/insts/macromem.isa      |  499
-rw-r--r--  src/arch/arm/isa/insts/neon.isa          | 3343
-rw-r--r--  src/arch/arm/isa/operands.isa            |   26
-rw-r--r--  src/arch/arm/isa/templates/macromem.isa  |  192
-rw-r--r--  src/arch/arm/isa/templates/mem.isa       |  200
-rw-r--r--  src/arch/arm/isa/templates/neon.isa      |  227
-rw-r--r--  src/arch/arm/isa/templates/templates.isa |    3
-rw-r--r--  src/arch/arm/tlb.hh                      |   10
18 files changed, 7069 insertions(+), 548 deletions(-)
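
Note: the bulk of this patch replaces the NEON load/store WarnUnimplemented stubs
with real macro-ops. Each VLDn/VSTn is cracked into one or two wide memory
microops that target a scratch register block starting at NumFloatArchRegs, an
optional writeback add (MicroAddUop when rm names a register, MicroAddiUop for
the fixed post-increment), and (de)interleave or unpack/pack microops that
shuffle elements between the scratch block and the architectural registers. As
a rough standalone illustration of that data movement (a plain C++ sketch, not
gem5 code; the function name deinterleave2 is invented for illustration), a
VLD2-style deinterleave splits memory data A0 B0 A1 B1 ... into two
per-register streams:

    // Sketch of the shuffle a VLD2-style deinterleave microop performs.
    // Memory holds structures interleaved as A0 B0 A1 B1 ...; the
    // deinterleave step splits them into the two destination registers.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    template <typename Elem>
    static void
    deinterleave2(const std::vector<Elem> &mem,
                  std::vector<Elem> &regA, std::vector<Elem> &regB)
    {
        for (std::size_t i = 0; i + 1 < mem.size(); i += 2) {
            regA.push_back(mem[i]);      // element for the first register
            regB.push_back(mem[i + 1]);  // element for the second register
        }
    }

    int main()
    {
        std::vector<uint8_t> mem = {0, 1, 2, 3, 4, 5, 6, 7};
        std::vector<uint8_t> a, b;
        deinterleave2(mem, a, b);
        for (unsigned v : a) std::printf("%u ", v);  // prints: 0 2 4 6
        std::printf("\n");
        for (unsigned v : b) std::printf("%u ", v);  // prints: 1 3 5 7
        std::printf("\n");
        return 0;
    }

The MicroDeintNeon*/MicroInterNeon* and MicroUnpack*/MicroPack* microops added
below perform the same shuffle on the scratch registers, templated on element
size; stores run the shuffle before the memory microops, loads after.
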
diff --git a/src/arch/arm/insts/macromem.cc b/src/arch/arm/insts/macromem.cc index 2a2412912..5602231f9 100644 --- a/src/arch/arm/insts/macromem.cc +++ b/src/arch/arm/insts/macromem.cc @@ -137,6 +137,647 @@ MacroMemOp::MacroMemOp(const char *mnem, ExtMachInst machInst, } } +VldMultOp::VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + numMicroops = (regs > 2) ? 2 : 1; + bool wb = (rm != 15); + bool deinterleave = (elems > 1); + + if (wb) numMicroops++; + if (deinterleave) numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex rMid = deinterleave ? NumFloatArchRegs : vd * 2; + + uint32_t noAlign = TLB::MustBeOne; + + unsigned uopIdx = 0; + switch (regs) { + case 4: + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon16Uop>( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 3: + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon8Uop>( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 2: + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + break; + case 1: + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon8Uop>( + size, machInst, rMid, rn, 0, align); + break; + default: + panic("Unrecognized number of registers %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, regs * 8); + } + } + if (deinterleave) { + switch (elems) { + case 4: + assert(regs == 4); + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon8Uop>( + size, machInst, vd * 2, rMid, inc * 2); + break; + case 3: + assert(regs == 3); + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon6Uop>( + size, machInst, vd * 2, rMid, inc * 2); + break; + case 2: + assert(regs == 4 || regs == 2); + if (regs == 4) { + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon4Uop>( + size, machInst, vd * 2, rMid, inc * 2); + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon4Uop>( + size, machInst, vd * 2 + 2, rMid + 4, inc * 2); + } else { + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon4Uop>( + size, machInst, vd * 2, rMid, inc * 2); + } + break; + default: + panic("Bad number of elements to deinterleave %d.\n", elems); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast<MicroOp *>(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VldSingleOp::VldSingleOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, bool all, unsigned elems, + RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, + RegIndex rm, unsigned lane) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + unsigned eBytes = (1 << size); + unsigned loadSize = eBytes * elems; + unsigned loadRegs M5_VAR_USED = (loadSize + sizeof(FloatRegBits) - 1) / + sizeof(FloatRegBits); + + assert(loadRegs > 0 && loadRegs <= 4); + + numMicroops = 1; + bool wb = (rm != 15); + + if (wb) 
numMicroops++; + numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex ufp0 = NumFloatArchRegs; + + unsigned uopIdx = 0; + switch (loadSize) { + case 1: + microOps[uopIdx++] = new MicroLdrNeon1Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 2: + if (eBytes == 2) { + microOps[uopIdx++] = new MicroLdrNeon2Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + } else { + microOps[uopIdx++] = new MicroLdrNeon2Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + } + break; + case 3: + microOps[uopIdx++] = new MicroLdrNeon3Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + switch (eBytes) { + case 1: + microOps[uopIdx++] = new MicroLdrNeon4Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 2: + microOps[uopIdx++] = new MicroLdrNeon4Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroLdrNeon4Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 6: + microOps[uopIdx++] = new MicroLdrNeon6Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 8: + switch (eBytes) { + case 2: + microOps[uopIdx++] = new MicroLdrNeon8Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroLdrNeon8Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 12: + microOps[uopIdx++] = new MicroLdrNeon12Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + case 16: + microOps[uopIdx++] = new MicroLdrNeon16Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + default: + panic("Unrecognized load size %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, loadSize); + } + } + switch (elems) { + case 4: + assert(regs == 4); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to8Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to8Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to8Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to8Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon4to8Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon4to8Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 3: + assert(regs == 3); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to6Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to6Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to6Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to6Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon4to6Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon4to6Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; 
+ } + break; + case 2: + assert(regs == 2); + assert(loadRegs <= 2); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 1: + assert(regs == 1 || (all && regs == 2)); + assert(loadRegs <= 2); + for (unsigned offset = 0; offset < regs; offset++) { + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop<uint8_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop<uint8_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop<uint16_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop<uint16_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop<uint32_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop<uint32_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + } + break; + default: + panic("Bad number of elements to unpack %d.\n", elems); + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast<MicroOp *>(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VstMultOp::VstMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + numMicroops = (regs > 2) ? 2 : 1; + bool wb = (rm != 15); + bool interleave = (elems > 1); + + if (wb) numMicroops++; + if (interleave) numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + uint32_t noAlign = TLB::MustBeOne; + + RegIndex rMid = interleave ? 
NumFloatArchRegs : vd * 2; + + unsigned uopIdx = 0; + if (interleave) { + switch (elems) { + case 4: + assert(regs == 4); + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon8Uop>( + size, machInst, rMid, vd * 2, inc * 2); + break; + case 3: + assert(regs == 3); + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon6Uop>( + size, machInst, rMid, vd * 2, inc * 2); + break; + case 2: + assert(regs == 4 || regs == 2); + if (regs == 4) { + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon4Uop>( + size, machInst, rMid, vd * 2, inc * 2); + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon4Uop>( + size, machInst, rMid + 4, vd * 2 + 2, inc * 2); + } else { + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon4Uop>( + size, machInst, rMid, vd * 2, inc * 2); + } + break; + default: + panic("Bad number of elements to interleave %d.\n", elems); + } + } + switch (regs) { + case 4: + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon16Uop>( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 3: + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon8Uop>( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 2: + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + break; + case 1: + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon8Uop>( + size, machInst, rMid, rn, 0, align); + break; + default: + panic("Unrecognized number of registers %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, regs * 8); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast<MicroOp *>(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VstSingleOp::VstSingleOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, bool all, unsigned elems, + RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, + RegIndex rm, unsigned lane) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(!all); + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + unsigned eBytes = (1 << size); + unsigned storeSize = eBytes * elems; + unsigned storeRegs M5_VAR_USED = (storeSize + sizeof(FloatRegBits) - 1) / + sizeof(FloatRegBits); + + assert(storeRegs > 0 && storeRegs <= 4); + + numMicroops = 1; + bool wb = (rm != 15); + + if (wb) numMicroops++; + numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex ufp0 = NumFloatArchRegs; + + unsigned uopIdx = 0; + switch (elems) { + case 4: + assert(regs == 4); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon8to2Uop<uint8_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon8to2Uop<uint16_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon8to4Uop<uint32_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 3: + assert(regs == 3); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon6to2Uop<uint8_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new 
MicroPackNeon6to2Uop<uint16_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon6to4Uop<uint32_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 2: + assert(regs == 2); + assert(storeRegs <= 2); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon4to2Uop<uint8_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon4to2Uop<uint16_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon4to2Uop<uint32_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 1: + assert(regs == 1 || (all && regs == 2)); + assert(storeRegs <= 2); + for (unsigned offset = 0; offset < regs; offset++) { + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon2to2Uop<uint8_t>( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon2to2Uop<uint16_t>( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon2to2Uop<uint32_t>( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + } + break; + default: + panic("Bad number of elements to pack %d.\n", elems); + } + switch (storeSize) { + case 1: + microOps[uopIdx++] = new MicroStrNeon1Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 2: + if (eBytes == 2) { + microOps[uopIdx++] = new MicroStrNeon2Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + } else { + microOps[uopIdx++] = new MicroStrNeon2Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + } + break; + case 3: + microOps[uopIdx++] = new MicroStrNeon3Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + switch (eBytes) { + case 1: + microOps[uopIdx++] = new MicroStrNeon4Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 2: + microOps[uopIdx++] = new MicroStrNeon4Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroStrNeon4Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 6: + microOps[uopIdx++] = new MicroStrNeon6Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 8: + switch (eBytes) { + case 2: + microOps[uopIdx++] = new MicroStrNeon8Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroStrNeon8Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 12: + microOps[uopIdx++] = new MicroStrNeon12Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + case 16: + microOps[uopIdx++] = new MicroStrNeon16Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + default: + panic("Unrecognized store size %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, storeSize); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast<MicroOp *>(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, IntRegIndex rn, RegIndex vd, bool single, bool up, @@ -169,17 +810,25 @@ 
MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, bool tempUp = up; for (int j = 0; j < count; j++) { if (load) { - microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, - tempUp, addr); - if (!single) - microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, tempUp, - addr + (up ? 4 : -4)); + if (single) { + microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, + tempUp, addr); + } else { + microOps[i++] = new MicroLdrDBFpUop(machInst, vd++, rn, + tempUp, addr); + microOps[i++] = new MicroLdrDTFpUop(machInst, vd++, rn, tempUp, + addr + (up ? 4 : -4)); + } } else { - microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, - tempUp, addr); - if (!single) - microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, tempUp, - addr + (up ? 4 : -4)); + if (single) { + microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, + tempUp, addr); + } else { + microOps[i++] = new MicroStrDBFpUop(machInst, vd++, rn, + tempUp, addr); + microOps[i++] = new MicroStrDTFpUop(machInst, vd++, rn, tempUp, + addr + (up ? 4 : -4)); + } } if (!tempUp) { addr -= (single ? 4 : 8); @@ -216,7 +865,7 @@ MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, } std::string -MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +MicroIntImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); @@ -229,6 +878,19 @@ MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const } std::string +MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss); + printReg(ss, ura); + ss << ", "; + printReg(ss, urb); + ss << ", "; + printReg(ss, urc); + return ss.str(); +} + +std::string MicroMemOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; diff --git a/src/arch/arm/insts/macromem.hh b/src/arch/arm/insts/macromem.hh index 003f5a3fd..923e9c0a1 100644 --- a/src/arch/arm/insts/macromem.hh +++ b/src/arch/arm/insts/macromem.hh @@ -80,16 +80,66 @@ class MicroOp : public PredOp }; /** + * Microops for Neon loads/stores + */ +class MicroNeonMemOp : public MicroOp +{ + protected: + RegIndex dest, ura; + uint32_t imm; + unsigned memAccessFlags; + + MicroNeonMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + RegIndex _dest, RegIndex _ura, uint32_t _imm) + : MicroOp(mnem, machInst, __opClass), + dest(_dest), ura(_ura), imm(_imm), + memAccessFlags(TLB::MustBeOne) + { + } +}; + +/** + * Microops for Neon load/store (de)interleaving + */ +class MicroNeonMixOp : public MicroOp +{ + protected: + RegIndex dest, op1; + uint32_t step; + + MicroNeonMixOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + RegIndex _dest, RegIndex _op1, uint32_t _step) + : MicroOp(mnem, machInst, __opClass), + dest(_dest), op1(_op1), step(_step) + { + } +}; + +class MicroNeonMixLaneOp : public MicroNeonMixOp +{ + protected: + unsigned lane; + + MicroNeonMixLaneOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, RegIndex _dest, RegIndex _op1, + uint32_t _step, unsigned _lane) + : MicroNeonMixOp(mnem, machInst, __opClass, _dest, _op1, _step), + lane(_lane) + { + } +}; + +/** * Microops of the form IntRegA = IntRegB op Imm */ -class MicroIntOp : public MicroOp +class MicroIntImmOp : public MicroOp { protected: RegIndex ura, urb; uint8_t imm; - MicroIntOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, - RegIndex _ura, RegIndex _urb, uint8_t _imm) + MicroIntImmOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, 
+                  RegIndex _ura, RegIndex _urb, uint8_t _imm)
         : MicroOp(mnem, machInst, __opClass),
           ura(_ura), urb(_urb), imm(_imm)
     {
@@ -99,9 +149,27 @@ class MicroIntOp
 };
 
 /**
+ * Microops of the form IntRegA = IntRegB op IntRegC
+ */
+class MicroIntOp : public MicroOp
+{
+  protected:
+    RegIndex ura, urb, urc;
+
+    MicroIntOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+               RegIndex _ura, RegIndex _urb, RegIndex _urc)
+        : MicroOp(mnem, machInst, __opClass),
+          ura(_ura), urb(_urb), urc(_urc)
+    {
+    }
+
+    std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+/**
  * Memory microops which use IntReg + Imm addressing
  */
-class MicroMemOp : public MicroIntOp
+class MicroMemOp : public MicroIntImmOp
 {
   protected:
     bool up;
@@ -109,7 +177,7 @@ class MicroMemOp : public MicroIntOp
     MicroMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
                RegIndex _ura, RegIndex _urb, bool _up, uint8_t _imm)
-        : MicroIntOp(mnem, machInst, __opClass, _ura, _urb, _imm),
+        : MicroIntImmOp(mnem, machInst, __opClass, _ura, _urb, _imm),
           up(_up), memAccessFlags(TLB::MustBeOne | TLB::AlignWord)
     {
     }
@@ -129,6 +197,46 @@ class MacroMemOp : public PredMacroOp
 };
 
 /**
+ * Base classes for microcoded Neon load instructions.
+ */
+class VldMultOp : public PredMacroOp
+{
+  protected:
+    VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+              unsigned elems, RegIndex rn, RegIndex vd, unsigned regs,
+              unsigned inc, uint32_t size, uint32_t align, RegIndex rm);
+};
+
+class VldSingleOp : public PredMacroOp
+{
+  protected:
+    VldSingleOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+                bool all, unsigned elems, RegIndex rn, RegIndex vd,
+                unsigned regs, unsigned inc, uint32_t size,
+                uint32_t align, RegIndex rm, unsigned lane);
+};
+
+/**
+ * Base classes for microcoded Neon store instructions.
+ */
+class VstMultOp : public PredMacroOp
+{
+  protected:
+    VstMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+              unsigned width, RegIndex rn, RegIndex vd, unsigned regs,
+              unsigned inc, uint32_t size, uint32_t align, RegIndex rm);
+};
+
+class VstSingleOp : public PredMacroOp
+{
+  protected:
+    VstSingleOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+                bool all, unsigned elems, RegIndex rn, RegIndex vd,
+                unsigned regs, unsigned inc, uint32_t size,
+                uint32_t align, RegIndex rm, unsigned lane);
+};
+
+/**
  * Base class for microcoded floating point memory instructions.
  */
 class MacroVFPMemOp : public PredMacroOp
diff --git a/src/arch/arm/insts/pred_inst.hh b/src/arch/arm/insts/pred_inst.hh
index 2cb383ad3..b7d4c4709 100644
--- a/src/arch/arm/insts/pred_inst.hh
+++ b/src/arch/arm/insts/pred_inst.hh
@@ -118,24 +118,26 @@ simd_modified_imm(bool op, uint8_t cmode, uint8_t data)
         break;
       case 0xe:
         if (op) {
-            bigData = (bigData << 0) | (bigData << 8) |
-                      (bigData << 16) | (bigData << 24) |
-                      (bigData << 32) | (bigData << 40) |
-                      (bigData << 48) | (bigData << 56);
-        } else {
             bigData = 0;
             for (int i = 7; i >= 0; i--) {
                 if (bits(data, i)) {
-                    bigData |= (0xFF << (i * 8));
+                    bigData |= (ULL(0xFF) << (i * 8));
                 }
             }
+        } else {
+            bigData = (bigData << 0) | (bigData << 8) |
+                      (bigData << 16) | (bigData << 24) |
+                      (bigData << 32) | (bigData << 40) |
+                      (bigData << 48) | (bigData << 56);
         }
+        break;
       case 0xf:
         if (!op) {
             uint64_t bVal = bits(bigData, 6) ?
(0x1F) : (0x20); bigData = (bits(bigData, 5, 0) << 19) | (bVal << 25) | (bits(bigData, 7) << 31); bigData |= (bigData << 32); + break; } // Fall through default: diff --git a/src/arch/arm/insts/static_inst.hh b/src/arch/arm/insts/static_inst.hh index 5a1993b86..e98f85a3b 100644 --- a/src/arch/arm/insts/static_inst.hh +++ b/src/arch/arm/insts/static_inst.hh @@ -251,6 +251,28 @@ class ArmStaticInst : public StaticInst } } + template<class T, class E> + static inline T + cSwap(T val, bool big) + { + const unsigned count = sizeof(T) / sizeof(E); + union { + T tVal; + E eVals[count]; + } conv; + conv.tVal = htog(val); + if (big) { + for (unsigned i = 0; i < count; i++) { + conv.eVals[i] = gtobe(conv.eVals[i]); + } + } else { + for (unsigned i = 0; i < count; i++) { + conv.eVals[i] = gtole(conv.eVals[i]); + } + } + return gtoh(conv.tVal); + } + // Perform an interworking branch. template<class XC> static inline void diff --git a/src/arch/arm/insts/vfp.cc b/src/arch/arm/insts/vfp.cc index 1968a59a9..f689204d9 100644 --- a/src/arch/arm/insts/vfp.cc +++ b/src/arch/arm/insts/vfp.cc @@ -91,6 +91,20 @@ FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const return ss.str(); } +std::string +FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss); + printReg(ss, dest + FP_Base_DepTag); + ss << ", "; + printReg(ss, op1 + FP_Base_DepTag); + ss << ", "; + printReg(ss, op2 + FP_Base_DepTag); + ccprintf(ss, ", #%d", imm); + return ss.str(); +} + namespace ArmISA { @@ -117,7 +131,7 @@ prepFpState(uint32_t rMode) } void -finishVfp(FPSCR &fpscr, VfpSavedState state) +finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush) { int exceptions = fetestexcept(FeAllExceptions); bool underflow = false; @@ -134,7 +148,7 @@ finishVfp(FPSCR &fpscr, VfpSavedState state) underflow = true; fpscr.ufc = 1; } - if ((exceptions & FeInexact) && !(underflow && fpscr.fz)) { + if ((exceptions & FeInexact) && !(underflow && flush)) { fpscr.ixc = 1; } fesetround(state); @@ -142,7 +156,7 @@ finishVfp(FPSCR &fpscr, VfpSavedState state) template <class fpType> fpType -fixDest(FPSCR fpscr, fpType val, fpType op1) +fixDest(bool flush, bool defaultNan, fpType val, fpType op1) { int fpClass = std::fpclassify(val); fpType junk = 0.0; @@ -150,12 +164,12 @@ fixDest(FPSCR fpscr, fpType val, fpType op1) const bool single = (sizeof(val) == sizeof(float)); const uint64_t qnan = single ? 
0x7fc00000 : ULL(0x7ff8000000000000); const bool nan = std::isnan(op1); - if (!nan || (fpscr.dn == 1)) { + if (!nan || defaultNan) { val = bitsToFp(qnan, junk); } else if (nan) { val = bitsToFp(fpToBits(op1) | qnan, junk); } - } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { + } else if (fpClass == FP_SUBNORMAL && flush == 1) { // Turn val into a zero with the correct sign; uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); val = bitsToFp(fpToBits(val) & bitMask, junk); @@ -166,13 +180,13 @@ fixDest(FPSCR fpscr, fpType val, fpType op1) } template -float fixDest<float>(FPSCR fpscr, float val, float op1); +float fixDest<float>(bool flush, bool defaultNan, float val, float op1); template -double fixDest<double>(FPSCR fpscr, double val, double op1); +double fixDest<double>(bool flush, bool defaultNan, double val, double op1); template <class fpType> fpType -fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) +fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) { int fpClass = std::fpclassify(val); fpType junk = 0.0; @@ -183,7 +197,7 @@ fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) const bool nan2 = std::isnan(op2); const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); - if ((!nan1 && !nan2) || (fpscr.dn == 1)) { + if ((!nan1 && !nan2) || defaultNan) { val = bitsToFp(qnan, junk); } else if (signal1) { val = bitsToFp(fpToBits(op1) | qnan, junk); @@ -194,7 +208,7 @@ fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) } else if (nan2) { val = op2; } - } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { + } else if (fpClass == FP_SUBNORMAL && flush) { // Turn val into a zero with the correct sign; uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); val = bitsToFp(fpToBits(val) & bitMask, junk); @@ -205,15 +219,17 @@ fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) } template -float fixDest<float>(FPSCR fpscr, float val, float op1, float op2); +float fixDest<float>(bool flush, bool defaultNan, + float val, float op1, float op2); template -double fixDest<double>(FPSCR fpscr, double val, double op1, double op2); +double fixDest<double>(bool flush, bool defaultNan, + double val, double op1, double op2); template <class fpType> fpType -fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) +fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) { - fpType mid = fixDest(fpscr, val, op1, op2); + fpType mid = fixDest(flush, defaultNan, val, op1, op2); const bool single = (sizeof(fpType) == sizeof(float)); const fpType junk = 0.0; if ((single && (val == bitsToFp(0x00800000, junk) || @@ -228,7 +244,7 @@ fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) temp = op1 / op2; if (flushToZero(temp)) { feraiseexcept(FeUnderflow); - if (fpscr.fz) { + if (flush) { feclearexcept(FeInexact); mid = temp; } @@ -239,9 +255,11 @@ fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) } template -float fixDivDest<float>(FPSCR fpscr, float val, float op1, float op2); +float fixDivDest<float>(bool flush, bool defaultNan, + float val, float op1, float op2); template -double fixDivDest<double>(FPSCR fpscr, double val, double op1, double op2); +double fixDivDest<double>(bool flush, bool defaultNan, + double val, double op1, double op2); float fixFpDFpSDest(FPSCR fpscr, double val) @@ -255,7 +273,7 @@ fixFpDFpSDest(FPSCR fpscr, double val) (bits(valBits, 63) << 31); op1 = bitsToFp(op1Bits, junk); } - float mid = fixDest(fpscr, (float)val, op1); + 
float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1); if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) == (FeUnderflow | FeInexact)) { feclearexcept(FeInexact); @@ -291,7 +309,7 @@ fixFpSFpDDest(FPSCR fpscr, float val) ((uint64_t)bits(valBits, 31) << 63); op1 = bitsToFp(op1Bits, junk); } - double mid = fixDest(fpscr, (double)val, op1); + double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1); if (mid == bitsToFp(ULL(0x0010000000000000), junk) || mid == bitsToFp(ULL(0x8010000000000000), junk)) { __asm__ __volatile__("" : "=m" (val) : "m" (val)); @@ -311,11 +329,10 @@ fixFpSFpDDest(FPSCR fpscr, float val) return mid; } -float -vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) +uint16_t +vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, + uint32_t rMode, bool ahp, float op) { - float junk = 0.0; - uint32_t destBits = fpToBits(dest); uint32_t opBits = fpToBits(op); // Extract the operand. bool neg = bits(opBits, 31); @@ -331,11 +348,11 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) // Signalling nan. fpscr.ioc = 1; } - if (fpscr.ahp) { + if (ahp) { mantissa = 0; exponent = 0; fpscr.ioc = 1; - } else if (fpscr.dn) { + } else if (defaultNan) { mantissa = (1 << 9); exponent = 0x1f; neg = false; @@ -346,7 +363,7 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) } else { // Infinities. exponent = 0x1F; - if (fpscr.ahp) { + if (ahp) { fpscr.ioc = 1; mantissa = 0x3ff; } else { @@ -364,14 +381,14 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) // Denormalized. // If flush to zero is on, this shouldn't happen. - assert(fpscr.fz == 0); + assert(!flush); // Check for underflow if (inexact || fpscr.ufe) fpscr.ufc = 1; // Handle rounding. - unsigned mode = fpscr.rMode; + unsigned mode = rMode; if ((mode == VfpRoundUpward && !neg && extra) || (mode == VfpRoundDown && neg && extra) || (mode == VfpRoundNearest && @@ -416,7 +433,7 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) } // Handle rounding. - unsigned mode = fpscr.rMode; + unsigned mode = rMode; bool nonZero = topOne || !restZeros; if ((mode == VfpRoundUpward && !neg && nonZero) || (mode == VfpRoundDown && neg && nonZero) || @@ -432,7 +449,7 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) } // Deal with overflow - if (fpscr.ahp) { + if (ahp) { if (exponent >= 0x20) { exponent = 0x1f; mantissa = 0x3ff; @@ -468,27 +485,17 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) replaceBits(result, 14, 10, exponent); if (neg) result |= (1 << 15); - if (top) - replaceBits(destBits, 31, 16, result); - else - replaceBits(destBits, 15, 0, result); - return bitsToFp(destBits, junk); + return result; } float -vcvtFpHFpS(FPSCR &fpscr, float op, bool top) +vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) { float junk = 0.0; - uint32_t opBits = fpToBits(op); - // Extract the operand. - if (top) - opBits = bits(opBits, 31, 16); - else - opBits = bits(opBits, 15, 0); // Extract the bitfields. - bool neg = bits(opBits, 15); - uint32_t exponent = bits(opBits, 14, 10); - uint32_t mantissa = bits(opBits, 9, 0); + bool neg = bits(op, 15); + uint32_t exponent = bits(op, 14, 10); + uint32_t mantissa = bits(op, 9, 0); // Do the conversion. if (exponent == 0) { if (mantissa != 0) { @@ -500,7 +507,7 @@ vcvtFpHFpS(FPSCR &fpscr, float op, bool top) } } mantissa = mantissa << (23 - 10); - } else if (exponent == 0x1f && !fpscr.ahp) { + } else if (exponent == 0x1f && !ahp) { // Infinities and nans. 
exponent = 0xff; if (mantissa != 0) { @@ -511,7 +518,7 @@ vcvtFpHFpS(FPSCR &fpscr, float op, bool top) fpscr.ioc = 1; mantissa |= (1 << 22); } - if (fpscr.dn) { + if (defaultNan) { mantissa &= ~mask(22); neg = false; } @@ -624,7 +631,8 @@ vfpFpSToFixed(float val, bool isSigned, bool half, } float -vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) +vfpUFixedToFpS(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -633,11 +641,12 @@ vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (float)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); } float -vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm) +vfpSFixedToFpS(bool flush, bool defaultNan, + int32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -646,7 +655,7 @@ vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (float)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); } uint64_t @@ -743,7 +752,8 @@ vfpFpDToFixed(double val, bool isSigned, bool half, } double -vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) +vfpUFixedToFpD(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -752,11 +762,12 @@ vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (double)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); } double -vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm) +vfpSFixedToFpD(bool flush, bool defaultNan, + int32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -765,14 +776,211 @@ vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (double)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); +} + +// This function implements a magic formula taken from the architecture +// reference manual. It was originally called recip_sqrt_estimate. +static double +recipSqrtEstimate(double a) +{ + int64_t q0, q1, s; + double r; + if (a < 0.5) { + q0 = (int64_t)(a * 512.0); + r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0); + } else { + q1 = (int64_t)(a * 256.0); + r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0); + } + s = (int64_t)(256.0 * r + 0.5); + return (double)s / 256.0; } +// This function is only intended for use in Neon instructions because +// it ignores certain bits in the FPSCR. 
+float +fprSqrtEstimate(FPSCR &fpscr, float op) +{ + const uint32_t qnan = 0x7fc00000; + float junk = 0.0; + int fpClass = std::fpclassify(op); + if (fpClass == FP_NAN) { + if ((fpToBits(op) & qnan) != qnan) + fpscr.ioc = 1; + return bitsToFp(qnan, junk); + } else if (fpClass == FP_ZERO) { + fpscr.dzc = 1; + // Return infinity with the same sign as the operand. + return bitsToFp((std::signbit(op) << 31) | + (0xFF << 23) | (0 << 0), junk); + } else if (std::signbit(op)) { + // Set invalid op bit. + fpscr.ioc = 1; + return bitsToFp(qnan, junk); + } else if (fpClass == FP_INFINITE) { + return 0.0; + } else { + uint64_t opBits = fpToBits(op); + double scaled; + if (bits(opBits, 23)) { + scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | + (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63), + (double)0.0); + } else { + scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | + (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63), + (double)0.0); + } + uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2; + + uint64_t estimate = fpToBits(recipSqrtEstimate(scaled)); + + return bitsToFp((bits(estimate, 63) << 31) | + (bits(resultExp, 7, 0) << 23) | + (bits(estimate, 51, 29) << 0), junk); + } +} + +uint32_t +unsignedRSqrtEstimate(uint32_t op) +{ + if (bits(op, 31, 30) == 0) { + return -1; + } else { + double dpOp; + if (bits(op, 31)) { + dpOp = bitsToFp((ULL(0) << 63) | + (ULL(0x3fe) << 52) | + (bits((uint64_t)op, 30, 0) << 21) | + (0 << 0), (double)0.0); + } else { + dpOp = bitsToFp((ULL(0) << 63) | + (ULL(0x3fd) << 52) | + (bits((uint64_t)op, 29, 0) << 22) | + (0 << 0), (double)0.0); + } + uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp)); + return (1 << 31) | bits(estimate, 51, 21); + } +} + +// This function implements a magic formula taken from the architecture +// reference manual. It was originally called recip_estimate. + +static double +recipEstimate(double a) +{ + int64_t q, s; + double r; + q = (int64_t)(a * 512.0); + r = 1.0 / (((double)q + 0.5) / 512.0); + s = (int64_t)(256.0 * r + 0.5); + return (double)s / 256.0; +} + +// This function is only intended for use in Neon instructions because +// it ignores certain bits in the FPSCR. +float +fpRecipEstimate(FPSCR &fpscr, float op) +{ + const uint32_t qnan = 0x7fc00000; + float junk = 0.0; + int fpClass = std::fpclassify(op); + if (fpClass == FP_NAN) { + if ((fpToBits(op) & qnan) != qnan) + fpscr.ioc = 1; + return bitsToFp(qnan, junk); + } else if (fpClass == FP_INFINITE) { + return bitsToFp(std::signbit(op) << 31, junk); + } else if (fpClass == FP_ZERO) { + fpscr.dzc = 1; + // Return infinity with the same sign as the operand. 
+ return bitsToFp((std::signbit(op) << 31) | + (0xFF << 23) | (0 << 0), junk); + } else if (fabs(op) >= pow(2.0, 126)) { + fpscr.ufc = 1; + return bitsToFp(std::signbit(op) << 31, junk); + } else { + uint64_t opBits = fpToBits(op); + double scaled; + scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | + (ULL(0x3fe) << 52) | (ULL(0) << 63), + (double)0.0); + uint64_t resultExp = 253 - bits(opBits, 30, 23); + + uint64_t estimate = fpToBits(recipEstimate(scaled)); + + return bitsToFp((bits(opBits, 31) << 31) | + (bits(resultExp, 7, 0) << 23) | + (bits(estimate, 51, 29) << 0), junk); + } +} + +uint32_t +unsignedRecipEstimate(uint32_t op) +{ + if (bits(op, 31) == 0) { + return -1; + } else { + double dpOp; + dpOp = bitsToFp((ULL(0) << 63) | + (ULL(0x3fe) << 52) | + (bits((uint64_t)op, 30, 0) << 21) | + (0 << 0), (double)0.0); + uint64_t estimate = fpToBits(recipEstimate(dpOp)); + return (1 << 31) | bits(estimate, 51, 21); + } +} + +template <class fpType> +fpType +FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, + fpType op1, fpType op2) const +{ + done = true; + fpType junk = 0.0; + fpType dest = 0.0; + const bool single = (sizeof(fpType) == sizeof(float)); + const uint64_t qnan = + single ? 0x7fc00000 : ULL(0x7ff8000000000000); + const bool nan1 = std::isnan(op1); + const bool nan2 = std::isnan(op2); + const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); + const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); + if (nan1 || nan2) { + if (defaultNan) { + dest = bitsToFp(qnan, junk); + } else if (signal1) { + dest = bitsToFp(fpToBits(op1) | qnan, junk); + } else if (signal2) { + dest = bitsToFp(fpToBits(op2) | qnan, junk); + } else if (nan1) { + dest = op1; + } else if (nan2) { + dest = op2; + } + if (signal1 || signal2) { + fpscr.ioc = 1; + } + } else { + done = false; + } + return dest; +} + +template +float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, + float op1, float op2) const; +template +double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, + double op1, double op2) const; + template <class fpType> fpType FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType (*func)(fpType, fpType), - bool flush, uint32_t rMode) const + bool flush, bool defaultNan, uint32_t rMode) const { const bool single = (sizeof(fpType) == sizeof(float)); fpType junk = 0.0; @@ -795,7 +1003,7 @@ FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, const bool nan2 = std::isnan(op2); const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); - if ((!nan1 && !nan2) || (fpscr.dn == 1)) { + if ((!nan1 && !nan2) || (defaultNan == 1)) { dest = bitsToFp(qnan, junk); } else if (signal1) { dest = bitsToFp(fpToBits(op1) | qnan, junk); @@ -828,18 +1036,18 @@ FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, dest = temp; } } - finishVfp(fpscr, state); + finishVfp(fpscr, state, flush); return dest; } template float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2, float (*func)(float, float), - bool flush, uint32_t rMode) const; + bool flush, bool defaultNan, uint32_t rMode) const; template double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2, double (*func)(double, double), - bool flush, uint32_t rMode) const; + bool flush, bool defaultNan, uint32_t rMode) const; template <class fpType> fpType @@ -890,7 +1098,7 @@ FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType), dest = temp; } } - finishVfp(fpscr, state); + finishVfp(fpscr, state, flush); return dest; } diff --git 
a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh index 57636bbfc..964b62673 100644 --- a/src/arch/arm/insts/vfp.hh +++ b/src/arch/arm/insts/vfp.hh @@ -192,10 +192,20 @@ bitsToFp(uint64_t bits, double junk) return val.fp; } +template <class fpType> +static bool +isSnan(fpType val) +{ + const bool single = (sizeof(fpType) == sizeof(float)); + const uint64_t qnan = + single ? 0x7fc00000 : ULL(0x7ff8000000000000); + return std::isnan(val) && ((fpToBits(val) & qnan) != qnan); +} + typedef int VfpSavedState; VfpSavedState prepFpState(uint32_t rMode); -void finishVfp(FPSCR &fpscr, VfpSavedState state); +void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush); template <class fpType> fpType fixDest(FPSCR fpscr, fpType val, fpType op1); @@ -209,8 +219,9 @@ fpType fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2); float fixFpDFpSDest(FPSCR fpscr, double val); double fixFpSFpDDest(FPSCR fpscr, float val); -float vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top); -float vcvtFpHFpS(FPSCR &fpscr, float op, bool top); +uint16_t vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, + uint32_t rMode, bool ahp, float op); +float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op); static inline double makeDouble(uint32_t low, uint32_t high) @@ -233,13 +244,23 @@ highFromDouble(double val) uint64_t vfpFpSToFixed(float val, bool isSigned, bool half, uint8_t imm, bool rzero = true); -float vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm); -float vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm); +float vfpUFixedToFpS(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm); +float vfpSFixedToFpS(bool flush, bool defaultNan, + int32_t val, bool half, uint8_t imm); uint64_t vfpFpDToFixed(double val, bool isSigned, bool half, uint8_t imm, bool rzero = true); -double vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm); -double vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm); +double vfpUFixedToFpD(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm); +double vfpSFixedToFpD(bool flush, bool defaultNan, + int32_t val, bool half, uint8_t imm); + +float fprSqrtEstimate(FPSCR &fpscr, float op); +uint32_t unsignedRSqrtEstimate(uint32_t op); + +float fpRecipEstimate(FPSCR &fpscr, float op); +uint32_t unsignedRecipEstimate(uint32_t op); class VfpMacroOp : public PredMacroOp { @@ -312,6 +333,66 @@ fpMulD(double a, double b) return a * b; } +static inline float +fpMaxS(float a, float b) +{ + // Handle comparisons of +0 and -0. + if (!std::signbit(a) && std::signbit(b)) + return a; + return fmaxf(a, b); +} + +static inline float +fpMinS(float a, float b) +{ + // Handle comparisons of +0 and -0. 
+ if (std::signbit(a) && !std::signbit(b)) + return a; + return fminf(a, b); +} + +static inline float +fpRSqrtsS(float a, float b) +{ + int fpClassA = std::fpclassify(a); + int fpClassB = std::fpclassify(b); + float aXb; + int fpClassAxB; + + if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) || + (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) { + return 1.5; + } + aXb = a*b; + fpClassAxB = std::fpclassify(aXb); + if(fpClassAxB == FP_SUBNORMAL) { + feraiseexcept(FeUnderflow); + return 1.5; + } + return (3.0 - (a * b)) / 2.0; +} + +static inline float +fpRecpsS(float a, float b) +{ + int fpClassA = std::fpclassify(a); + int fpClassB = std::fpclassify(b); + float aXb; + int fpClassAxB; + + if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) || + (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) { + return 2.0; + } + aXb = a*b; + fpClassAxB = std::fpclassify(aXb); + if(fpClassAxB == FP_SUBNORMAL) { + feraiseexcept(FeUnderflow); + return 2.0; + } + return 2.0 - (a * b); +} + class FpOp : public PredOp { protected: @@ -364,9 +445,14 @@ class FpOp : public PredOp template <class fpType> fpType + processNans(FPSCR &fpscr, bool &done, bool defaultNan, + fpType op1, fpType op2) const; + + template <class fpType> + fpType binaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType (*func)(fpType, fpType), - bool flush, uint32_t rMode) const; + bool flush, bool defaultNan, uint32_t rMode) const; template <class fpType> fpType @@ -445,6 +531,27 @@ class FpRegRegRegOp : public FpOp std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const; }; +class FpRegRegRegImmOp : public FpOp +{ + protected: + IntRegIndex dest; + IntRegIndex op1; + IntRegIndex op2; + uint64_t imm; + + FpRegRegRegImmOp(const char *mnem, ExtMachInst _machInst, + OpClass __opClass, IntRegIndex _dest, + IntRegIndex _op1, IntRegIndex _op2, + uint64_t _imm, VfpMicroMode mode = VfpNotAMicroop) : + FpOp(mnem, _machInst, __opClass), + dest(_dest), op1(_op1), op2(_op2), imm(_imm) + { + setVfpMicroFlags(mode, flags); + } + + std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const; +}; + } #endif //__ARCH_ARM_INSTS_VFP_HH__ diff --git a/src/arch/arm/isa/decoder/thumb.isa b/src/arch/arm/isa/decoder/thumb.isa index 65ea7e30c..d0f5b8646 100644 --- a/src/arch/arm/isa/decoder/thumb.isa +++ b/src/arch/arm/isa/decoder/thumb.isa @@ -88,7 +88,7 @@ decode BIGTHUMB { 0xf: McrMrc15::mcrMrc15(); } } - 0x3: WarnUnimpl::Advanced_SIMD(); + 0x3: ThumbNeonData::ThumbNeonData(); default: decode LTCOPROC { 0xa, 0xb: ExtensionRegLoadStore::extensionRegLoadStre(); 0xf: decode HTOPCODE_9_4 { diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa index 0a5f77e6e..1482c2119 100644 --- a/src/arch/arm/isa/formats/fp.isa +++ b/src/arch/arm/isa/formats/fp.isa @@ -45,6 +45,52 @@ // Floating Point operate instructions // +output header {{ + + template<template <typename T> class Base> + StaticInstPtr + newNeonMemInst(const unsigned size, + const ExtMachInst &machInst, + const RegIndex dest, const RegIndex ra, + const uint32_t imm, const unsigned extraMemFlags) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, ra, imm, extraMemFlags); + case 1: + return new Base<uint16_t>(machInst, dest, ra, imm, extraMemFlags); + case 2: + return new Base<uint32_t>(machInst, dest, ra, imm, extraMemFlags); + case 3: + return new Base<uint64_t>(machInst, dest, ra, imm, extraMemFlags); + default: + panic("Unrecognized width %d for Neon mem inst.\n", (1 << size)); + } + } + + template<template <typename T> class 
Base> + StaticInstPtr + newNeonMixInst(const unsigned size, + const ExtMachInst &machInst, + const RegIndex dest, const RegIndex op1, + const uint32_t step) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1, step); + case 1: + return new Base<uint16_t>(machInst, dest, op1, step); + case 2: + return new Base<uint32_t>(machInst, dest, op1, step); + case 3: + return new Base<uint64_t>(machInst, dest, op1, step); + default: + panic("Unrecognized width %d for Neon mem inst.\n", (1 << size)); + } + } + +}}; + let {{ header_output = ''' StaticInstPtr @@ -59,116 +105,233 @@ let {{ decodeNeonMem(ExtMachInst machInst) { const uint32_t b = bits(machInst, 11, 8); - const bool a = bits(machInst, 23); - const bool l = bits(machInst, 21); + const bool single = bits(machInst, 23); + const bool singleAll = single && (bits(b, 3, 2) == 3); + const bool load = bits(machInst, 21); - if (l) { - // Load instructions. - if (a) { - if (bits(b, 3, 2) != 3) { - switch (bits(b, 1, 0)) { - case 0x0: - return new WarnUnimplemented("vld1 single", machInst); - case 0x1: - return new WarnUnimplemented("vld2 single", machInst); - case 0x2: - return new WarnUnimplemented("vld3 single", machInst); - case 0x3: - return new WarnUnimplemented("vld4 single", machInst); - } - } else { - switch (bits(b, 1, 0)) { - case 0x0: - return new WarnUnimplemented("vld1 single all", - machInst); - case 0x1: - return new WarnUnimplemented("vld2 single all", - machInst); - case 0x2: - return new WarnUnimplemented("vld3 single all", - machInst); - case 0x3: - return new WarnUnimplemented("vld4 single all", - machInst); + unsigned width = 0; + + if (single) { + width = bits(b, 1, 0) + 1; + } else { + switch (bits(b, 3, 1)) { + case 0x0: width = 4; + break; + case 0x1: width = (b & 0x1) ? 2 : 1; + break; + case 0x2: width = 3; + break; + case 0x3: width = 1; + break; + case 0x4: width = 2; + break; + case 0x5: + if ((b & 0x1) == 0) { + width = 1; + break; + } + // Fall through on purpose. + default: + return new Unknown(machInst); + } + } + assert(width > 0 && width <= 4); + + const RegIndex rm = (RegIndex)(uint32_t)bits(machInst, 3, 0); + const RegIndex rn = (RegIndex)(uint32_t)bits(machInst, 19, 16); + const RegIndex vd = (RegIndex)(uint32_t)(bits(machInst, 15, 12) | + bits(machInst, 22) << 4); + const uint32_t type = bits(machInst, 11, 8); + uint32_t size = 0; + uint32_t align = 0; + unsigned inc = 1; + unsigned regs = 1; + unsigned lane = 0; + if (single) { + if (singleAll) { + size = bits(machInst, 7, 6); + bool t = bits(machInst, 5); + unsigned eBytes = (1 << size); + align = (eBytes - 1) | TLB::AllowUnaligned; + if (width == 1) { + regs = t ? 2 : 1; + inc = 1; + } else { + regs = width; + inc = t ? 
2 : 1; + } + switch (width) { + case 1: + case 2: + if (bits(machInst, 4)) + align = width * eBytes - 1; + break; + case 3: + break; + case 4: + if (size == 3) { + if (bits(machInst, 4) == 0) + return new Unknown(machInst); + size = 2; + align = 0xf; + } else if (size == 2) { + if (bits(machInst, 4)) + align = 7; + } else { + if (bits(machInst, 4)) + align = 4 * eBytes - 1; } + break; } } else { - switch (bits(b, 3, 1)) { - case 0x0: - return new WarnUnimplemented("vld4 multiple", machInst); - case 0x2: - return new WarnUnimplemented("vld3 multiple", machInst); - case 0x3: - return new WarnUnimplemented("vld1 multiple", machInst); - case 0x4: - return new WarnUnimplemented("vld2 multiple", machInst); - case 0x1: - if (b & 0x1) { - return new WarnUnimplemented("vld2 multiple", machInst); - } else { - return new WarnUnimplemented("vld1 multiple", machInst); + size = bits(machInst, 11, 10); + unsigned eBytes = (1 << size); + align = (eBytes - 1) | TLB::AllowUnaligned; + regs = width; + unsigned indexAlign = bits(machInst, 7, 4); + // If width is 1, inc is always 1. That's overridden later. + switch (size) { + case 0: + inc = 1; + lane = bits(indexAlign, 3, 1); + break; + case 1: + inc = bits(indexAlign, 1) ? 2 : 1; + lane = bits(indexAlign, 3, 2); + break; + case 2: + inc = bits(indexAlign, 2) ? 2 : 1; + lane = bits(indexAlign, 3); + break; + } + // Override inc for width of 1. + if (width == 1) { + inc = 1; + } + switch (width) { + case 1: + switch (size) { + case 0: + break; + case 1: + if (bits(indexAlign, 0)) + align = 1; + break; + case 2: + if (bits(indexAlign, 1, 0)) + align = 3; + break; } - case 0x5: - if ((b & 0x1) == 0) { - return new WarnUnimplemented("vld1 multiple", machInst); - } else { + break; + case 2: + if (bits(indexAlign, 0)) + align = (2 * eBytes) - 1; + break; + case 3: + break; + case 4: + switch (size) { + case 0: + case 1: + if (bits(indexAlign, 0)) + align = (4 * eBytes) - 1; + break; + case 2: + if (bits(indexAlign, 0)) + align = (4 << bits(indexAlign, 1, 0)) - 1; break; } + break; } } + if (size == 0x3) { + return new Unknown(machInst); + } } else { - // Store instructions. - if (a) { - if (bits(b, 3, 2) != 3) { - switch (bits(b, 1, 0)) { - case 0x0: - return new WarnUnimplemented("vst1 single", machInst); - case 0x1: - return new WarnUnimplemented("vst2 single", machInst); - case 0x2: - return new WarnUnimplemented("vst3 single", machInst); - case 0x3: - return new WarnUnimplemented("vst4 single", machInst); - } - } else { - switch (bits(b, 1, 0)) { - case 0x0: - return new WarnUnimplemented("vst1 single all", - machInst); - case 0x1: - return new WarnUnimplemented("vst2 single all", - machInst); - case 0x2: - return new WarnUnimplemented("vst3 single all", - machInst); - case 0x3: - return new WarnUnimplemented("vst4 single all", - machInst); - } + size = bits(machInst, 7, 6); + align = bits(machInst, 5, 4); + if (align == 0) { + // @align wasn't specified, so alignment can be turned off. + align = ((1 << size) - 1) | TLB::AllowUnaligned; + } else { + align = ((4 << align) - 1); + } + switch (width) { + case 1: + switch (type) { + case 0x7: regs = 1; + break; + case 0xa: regs = 2; + break; + case 0x6: regs = 3; + break; + case 0x2: regs = 4; + break; + default: + return new Unknown(machInst); } + break; + case 2: + // Regs doesn't behave exactly as it does in the manual + // because they loop over regs registers twice and we break + // it down in the macroop. 
+ switch (type) { + case 0x8: regs = 2; inc = 1; + break; + case 0x9: regs = 2; inc = 2; + break; + case 0x3: regs = 4; inc = 2; + break; + default: + return new Unknown(machInst); + } + break; + case 3: + regs = 3; + switch (type) { + case 0x4: inc = 1; + break; + case 0x5: inc = 2;; + break; + default: + return new Unknown(machInst); + } + break; + case 4: + regs = 4; + switch (type) { + case 0: inc = 1; + break; + case 1: inc = 2; + break; + default: + return new Unknown(machInst); + } + break; + } + } + + if (load) { + // Load instructions. + if (single) { + return new VldSingle(machInst, singleAll, width, rn, vd, + regs, inc, size, align, rm, lane); } else { - switch (bits(b, 3, 1)) { - case 0x0: - return new WarnUnimplemented("vst4 multiple", machInst); - case 0x2: - return new WarnUnimplemented("vst3 multiple", machInst); - case 0x3: - return new WarnUnimplemented("vst1 multiple", machInst); - case 0x4: - return new WarnUnimplemented("vst2 multiple", machInst); - case 0x1: - if (b & 0x1) { - return new WarnUnimplemented("vst2 multiple", machInst); - } else { - return new WarnUnimplemented("vst1 multiple", machInst); - } - case 0x5: - if ((b & 0x1) == 0) { - return new WarnUnimplemented("vst1 multiple", machInst); - } else { - break; - } + return new VldMult(machInst, width, rn, vd, + regs, inc, size, align, rm); + } + } else { + // Store instructions. + if (single) { + if (singleAll) { + return new Unknown(machInst); + } else { + return new VstSingle(machInst, false, width, rn, vd, + regs, inc, size, align, rm, lane); } + } else { + return new VstMult(machInst, width, rn, vd, + regs, inc, size, align, rm); } } return new Unknown(machInst); @@ -183,153 +346,243 @@ let {{ const uint32_t a = bits(machInst, 11, 8); const bool b = bits(machInst, 4); const uint32_t c = bits(machInst, 21, 20); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vn = + (IntRegIndex)(2 * (bits(machInst, 19, 16) | + (bits(machInst, 7) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + const unsigned size = bits(machInst, 21, 20); + const bool q = bits(machInst, 6); + if (q && ((vd & 0x1) || (vn & 0x1) || (vm & 0x1))) + return new Unknown(machInst); switch (a) { case 0x0: if (b) { - if (bits(machInst, 9) == 0) { - return new WarnUnimplemented("vhadd", machInst); + if (u) { + return decodeNeonUThreeReg<VqaddUD, VqaddUQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vhsub", machInst); + return decodeNeonSThreeReg<VqaddSD, VqaddSQ>( + q, size, machInst, vd, vn, vm); } } else { - return new WarnUnimplemented("vqadd", machInst); + if (size == 3) + return new Unknown(machInst); + return decodeNeonUSThreeReg<VhaddD, VhaddQ>( + q, u, size, machInst, vd, vn, vm); } case 0x1: if (!b) { - return new WarnUnimplemented("vrhadd", machInst); + return decodeNeonUSThreeReg<VrhaddD, VrhaddQ>( + q, u, size, machInst, vd, vn, vm); } else { if (u) { switch (c) { case 0: - return new WarnUnimplemented("veor", machInst); + if (q) { + return new VeorQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VeorD<uint64_t>(machInst, vd, vn, vm); + } case 1: - return new WarnUnimplemented("vbsl", machInst); + if (q) { + return new VbslQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VbslD<uint64_t>(machInst, vd, vn, vm); + } case 2: - return new WarnUnimplemented("vbit", machInst); + if (q) { + return new VbitQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new 
VbitD<uint64_t>(machInst, vd, vn, vm); + } case 3: - return new WarnUnimplemented("vbif", machInst); + if (q) { + return new VbifQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VbifD<uint64_t>(machInst, vd, vn, vm); + } } } else { switch (c) { case 0: - return new WarnUnimplemented("vand (reg)", machInst); + if (q) { + return new VandQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VandD<uint64_t>(machInst, vd, vn, vm); + } case 1: - return new WarnUnimplemented("vbic (reg)", machInst); + if (q) { + return new VbicQ<uint64_t>(machInst, vd, vn, vm); + } else { + return new VbicD<uint64_t>(machInst, vd, vn, vm); + } case 2: - { - const IntRegIndex n = (IntRegIndex)( - (uint32_t)bits(machInst, 19, 16) | - (uint32_t)(bits(machInst, 7) << 4)); - const IntRegIndex m = (IntRegIndex)( - (uint32_t)bits(machInst, 3, 0) | - (uint32_t)(bits(machInst, 5) << 4)); - if (n == m) { - return new WarnUnimplemented("vmov (reg)", - machInst); + if (vn == vm) { + if (q) { + return new VmovQ<uint64_t>( + machInst, vd, vn, vm); + } else { + return new VmovD<uint64_t>( + machInst, vd, vn, vm); + } + } else { + if (q) { + return new VorrQ<uint64_t>( + machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vorr (reg)", - machInst); + return new VorrD<uint64_t>( + machInst, vd, vn, vm); } } case 3: - return new WarnUnimplemented("vorn (reg)", machInst); + if (q) { + return new VornQ<uint64_t>( + machInst, vd, vn, vm); + } else { + return new VornD<uint64_t>( + machInst, vd, vn, vm); + } } } } case 0x2: if (b) { - return new WarnUnimplemented("vqsub", machInst); - } else { - if (bits(machInst, 9) == 0) { - return new WarnUnimplemented("vhadd", machInst); + if (u) { + return decodeNeonUThreeReg<VqsubUD, VqsubUQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vhsub", machInst); + return decodeNeonSThreeReg<VqsubSD, VqsubSQ>( + q, size, machInst, vd, vn, vm); } + } else { + if (size == 3) + return new Unknown(machInst); + return decodeNeonUSThreeReg<VhsubD, VhsubQ>( + q, u, size, machInst, vd, vn, vm); } case 0x3: if (b) { - return new WarnUnimplemented("vcge (reg)", machInst); + return decodeNeonUSThreeReg<VcgeD, VcgeQ>( + q, u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vcgt (reg)", machInst); + return decodeNeonUSThreeReg<VcgtD, VcgtQ>( + q, u, size, machInst, vd, vn, vm); } case 0x4: if (b) { - return new WarnUnimplemented("vqshl (reg)", machInst); + if (u) { + return decodeNeonUThreeReg<VqshlUD, VqshlUQ>( + q, size, machInst, vd, vm, vn); + } else { + return decodeNeonSThreeReg<VqshlSD, VqshlSQ>( + q, size, machInst, vd, vm, vn); + } } else { - return new WarnUnimplemented("vshl (reg)", machInst); + return decodeNeonUSThreeReg<VshlD, VshlQ>( + q, u, size, machInst, vd, vm, vn); } case 0x5: if (b) { - return new WarnUnimplemented("vqrshl", machInst); + if (u) { + return decodeNeonUThreeReg<VqrshlUD, VqrshlUQ>( + q, size, machInst, vd, vm, vn); + } else { + return decodeNeonSThreeReg<VqrshlSD, VqrshlSQ>( + q, size, machInst, vd, vm, vn); + } } else { - return new WarnUnimplemented("vrshl", machInst); + return decodeNeonUSThreeReg<VrshlD, VrshlQ>( + q, u, size, machInst, vd, vm, vn); } case 0x6: if (b) { - return new WarnUnimplemented("vmin (int)", machInst); + return decodeNeonUSThreeReg<VminD, VminQ>( + q, u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vmax (int)", machInst); + return decodeNeonUSThreeReg<VmaxD, VmaxQ>( + q, u, size, machInst, vd, vn, vm); } case 0x7: if (b) { - return new 
WarnUnimplemented("vaba", machInst); + return decodeNeonUSThreeReg<VabaD, VabaQ>( + q, u, size, machInst, vd, vn, vm); } else { if (bits(machInst, 23) == 1) { - if (bits(machInst, 6) == 1) { + if (q) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vabdl (int)", machInst); + return decodeNeonUSThreeUSReg<Vabdl>( + u, size, machInst, vd, vn, vm); } } else { - return new WarnUnimplemented("vabd (int)", machInst); + return decodeNeonUSThreeReg<VabdD, VabdQ>( + q, u, size, machInst, vd, vn, vm); } } case 0x8: if (b) { if (u) { - return new WarnUnimplemented("vceq (reg)", machInst); + return decodeNeonUThreeReg<VceqD, VceqQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vtst", machInst); + return decodeNeonUThreeReg<VtstD, VtstQ>( + q, size, machInst, vd, vn, vm); } } else { if (u) { - return new WarnUnimplemented("vsub (int)", machInst); + return decodeNeonUThreeReg<NVsubD, NVsubQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vadd (int)", machInst); + return decodeNeonUThreeReg<NVaddD, NVaddQ>( + q, size, machInst, vd, vn, vm); } } case 0x9: if (b) { if (u) { - return new WarnUnimplemented("vmul (poly)", machInst); + return decodeNeonUThreeReg<NVmulpD, NVmulpQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vmul (int)", machInst); + return decodeNeonSThreeReg<NVmulD, NVmulQ>( + q, size, machInst, vd, vn, vm); } } else { if (u) { - return new WarnUnimplemented("vmls (int)", machInst); + return decodeNeonUSThreeReg<NVmlsD, NVmlsQ>( + q, u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vmla (int)", machInst); + return decodeNeonUSThreeReg<NVmlaD, NVmlaQ>( + q, u, size, machInst, vd, vn, vm); } } case 0xa: if (b) { - return new WarnUnimplemented("vpmin (int)", machInst); + return decodeNeonUSThreeReg<VpminD, VpminQ>( + q, u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vpmax (int)", machInst); + return decodeNeonUSThreeReg<VpmaxD, VpmaxQ>( + q, u, size, machInst, vd, vn, vm); } case 0xb: if (b) { if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vpadd (int)", machInst); + return decodeNeonUThreeReg<NVpaddD, NVpaddQ>( + q, size, machInst, vd, vn, vm); } } else { if (u) { - return new WarnUnimplemented("vqrdmulh", machInst); + return decodeNeonSThreeSReg<VqrdmulhD, VqrdmulhQ>( + q, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vqdmulh", machInst); + return decodeNeonSThreeSReg<VqdmulhD, VqdmulhQ>( + q, size, machInst, vd, vn, vm); } } case 0xc: @@ -338,29 +591,57 @@ let {{ if (b) { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vmul (fp)", machInst); + if (q) { + return new NVmulQFp<float>(machInst, vd, vn, vm); + } else { + return new NVmulDFp<float>(machInst, vd, vn, vm); + } } else { return new Unknown(machInst); } } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vmla (fp)", machInst); + if (q) { + return new NVmlaQFp<float>(machInst, vd, vn, vm); + } else { + return new NVmlaDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vmls (fp)", machInst); + if (q) { + return new NVmlsQFp<float>(machInst, vd, vn, vm); + } else { + return new NVmlsDFp<float>(machInst, vd, vn, vm); + } } } } else { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vpadd (fp)", machInst); + if (q) { + return new VpaddQFp<float>(machInst, vd, vn, vm); + } else { + return new VpaddDFp<float>(machInst, vd, vn, vm); + } } else { - return new 
WarnUnimplemented("vabd (fp)", machInst); + if (q) { + return new VabdQFp<float>(machInst, vd, vn, vm); + } else { + return new VabdDFp<float>(machInst, vd, vn, vm); + } } } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vadd (fp)", machInst); + if (q) { + return new VaddQFp<float>(machInst, vd, vn, vm); + } else { + return new VaddDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vsub (fp)", machInst); + if (q) { + return new VsubQFp<float>(machInst, vd, vn, vm); + } else { + return new VsubDFp<float>(machInst, vd, vn, vm); + } } } } @@ -368,9 +649,17 @@ let {{ if (b) { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vacge", machInst); + if (q) { + return new VacgeQFp<float>(machInst, vd, vn, vm); + } else { + return new VacgeDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vacgt", machInst); + if (q) { + return new VacgtQFp<float>(machInst, vd, vn, vm); + } else { + return new VacgtDFp<float>(machInst, vd, vn, vm); + } } } else { return new Unknown(machInst); @@ -378,13 +667,25 @@ let {{ } else { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vcge (reg)", machInst); + if (q) { + return new VcgeQFp<float>(machInst, vd, vn, vm); + } else { + return new VcgeDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vcgt (reg)", machInst); + if (q) { + return new VcgtQFp<float>(machInst, vd, vn, vm); + } else { + return new VcgtDFp<float>(machInst, vd, vn, vm); + } } } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vceq (reg)", machInst); + if (q) { + return new VceqQFp<float>(machInst, vd, vn, vm); + } else { + return new VceqDFp<float>(machInst, vd, vn, vm); + } } else { return new Unknown(machInst); } @@ -396,23 +697,47 @@ let {{ return new Unknown(machInst); } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vrecps", machInst); + if (q) { + return new VrecpsQFp<float>(machInst, vd, vn, vm); + } else { + return new VrecpsDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vrsqrts", machInst); + if (q) { + return new VrsqrtsQFp<float>(machInst, vd, vn, vm); + } else { + return new VrsqrtsDFp<float>(machInst, vd, vn, vm); + } } } } else { if (u) { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vpmax (fp)", machInst); + if (q) { + return new VpmaxQFp<float>(machInst, vd, vn, vm); + } else { + return new VpmaxDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vpmin (fp)", machInst); + if (q) { + return new VpminQFp<float>(machInst, vd, vn, vm); + } else { + return new VpminDFp<float>(machInst, vd, vn, vm); + } } } else { if (bits(c, 1) == 0) { - return new WarnUnimplemented("vmax (fp)", machInst); + if (q) { + return new VmaxQFp<float>(machInst, vd, vn, vm); + } else { + return new VmaxDFp<float>(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vmin (fp)", machInst); + if (q) { + return new VminQFp<float>(machInst, vd, vn, vm); + } else { + return new VminDFp<float>(machInst, vd, vn, vm); + } } } } @@ -423,50 +748,94 @@ let {{ static StaticInstPtr decodeNeonOneRegModImm(ExtMachInst machInst) { + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const bool q = bits(machInst, 6); const bool op = bits(machInst, 5); - const uint32_t cmode = bits(machInst, 11, 8); + const uint8_t cmode = bits(machInst, 11, 8); + const uint8_t imm = ((THUMB ? 
bits(machInst, 28) : + bits(machInst, 24)) << 7) | + (bits(machInst, 18, 16) << 4) | + (bits(machInst, 3, 0) << 0); + const uint64_t bigImm = simd_modified_imm(op, cmode, imm); if (op) { if (bits(cmode, 3) == 0) { if (bits(cmode, 0) == 0) { - return new WarnUnimplemented("vmov (imm)", machInst); + if (q) + return new NVmvniQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmvniD<uint64_t>(machInst, vd, bigImm); } else { - return new WarnUnimplemented("vorr (imm)", machInst); + if (q) + return new NVbiciQ<uint64_t>(machInst, vd, bigImm); + else + return new NVbiciD<uint64_t>(machInst, vd, bigImm); } } else { if (bits(cmode, 2) == 1) { - return new WarnUnimplemented("vmov (imm)", machInst); + switch (bits(cmode, 1, 0)) { + case 0: + case 1: + if (q) + return new NVmvniQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmvniD<uint64_t>(machInst, vd, bigImm); + case 2: + if (q) + return new NVmoviQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmoviD<uint64_t>(machInst, vd, bigImm); + case 3: + return new Unknown(machInst); + } } else { if (bits(cmode, 0) == 0) { - return new WarnUnimplemented("vmov (imm)", machInst); + if (q) + return new NVmvniQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmvniD<uint64_t>(machInst, vd, bigImm); } else { - return new WarnUnimplemented("vorr (imm)", machInst); + if (q) + return new NVbiciQ<uint64_t>(machInst, vd, bigImm); + else + return new NVbiciD<uint64_t>(machInst, vd, bigImm); } } } } else { if (bits(cmode, 3) == 0) { if (bits(cmode, 0) == 0) { - return new WarnUnimplemented("vmvn (imm)", machInst); + if (q) + return new NVmoviQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmoviD<uint64_t>(machInst, vd, bigImm); } else { - return new WarnUnimplemented("vbic (imm)", machInst); + if (q) + return new NVorriQ<uint64_t>(machInst, vd, bigImm); + else + return new NVorriD<uint64_t>(machInst, vd, bigImm); } } else { if (bits(cmode, 2) == 1) { - switch (bits(cmode, 1, 0)) { - case 0: - case 1: - return new WarnUnimplemented("vmvn (imm)", machInst); - case 2: - return new WarnUnimplemented("vmov (imm)", machInst); - case 3: - return new Unknown(machInst); - } - return new WarnUnimplemented("vmov (imm)", machInst); + if (q) + return new NVmoviQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmoviD<uint64_t>(machInst, vd, bigImm); } else { if (bits(cmode, 0) == 0) { - return new WarnUnimplemented("vmvn (imm)", machInst); + if (q) + return new NVmoviQ<uint64_t>(machInst, vd, bigImm); + else + return new NVmoviD<uint64_t>(machInst, vd, bigImm); } else { - return new WarnUnimplemented("vbic (imm)", machInst); + if (q) + return new NVorriQ<uint64_t>(machInst, vd, bigImm); + else + return new NVorriD<uint64_t>(machInst, vd, bigImm); } } } @@ -481,70 +850,149 @@ let {{ const bool u = THUMB ? bits(machInst, 28) : bits(machInst, 24); const bool b = bits(machInst, 6); const bool l = bits(machInst, 7); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + unsigned imm6 = bits(machInst, 21, 16); + unsigned imm = ((l ? 
1 : 0) << 6) | imm6; + unsigned size = 3; + unsigned lShiftAmt = 0; + unsigned bitSel; + for (bitSel = 1 << 6; true; bitSel >>= 1) { + if (bitSel & imm) + break; + else if (!size) + return new Unknown(machInst); + size--; + } + lShiftAmt = imm6 & ~bitSel; + unsigned rShiftAmt = 0; + if (a != 0xe && a != 0xf) { + if (size > 2) + rShiftAmt = 64 - imm6; + else + rShiftAmt = 2 * (8 << size) - imm6; + } switch (a) { case 0x0: - return new WarnUnimplemented("vshr", machInst); + return decodeNeonUSTwoShiftReg<NVshrD, NVshrQ>( + b, u, size, machInst, vd, vm, rShiftAmt); case 0x1: - return new WarnUnimplemented("vsra", machInst); + return decodeNeonUSTwoShiftReg<NVsraD, NVsraQ>( + b, u, size, machInst, vd, vm, rShiftAmt); case 0x2: - return new WarnUnimplemented("vrshr", machInst); + return decodeNeonUSTwoShiftReg<NVrshrD, NVrshrQ>( + b, u, size, machInst, vd, vm, rShiftAmt); case 0x3: - return new WarnUnimplemented("vrsra", machInst); + return decodeNeonUSTwoShiftReg<NVrsraD, NVrsraQ>( + b, u, size, machInst, vd, vm, rShiftAmt); case 0x4: if (u) { - return new WarnUnimplemented("vsri", machInst); + return decodeNeonUTwoShiftReg<NVsriD, NVsriQ>( + b, size, machInst, vd, vm, rShiftAmt); } else { return new Unknown(machInst); } case 0x5: if (u) { - return new WarnUnimplemented("vsli", machInst); + return decodeNeonUTwoShiftReg<NVsliD, NVsliQ>( + b, size, machInst, vd, vm, lShiftAmt); } else { - return new WarnUnimplemented("vshl (imm)", machInst); + return decodeNeonUTwoShiftReg<NVshlD, NVshlQ>( + b, size, machInst, vd, vm, lShiftAmt); } case 0x6: case 0x7: - return new WarnUnimplemented("vqshl, vqshlu (imm)", machInst); + if (u) { + if (a == 0x6) { + return decodeNeonSTwoShiftReg<NVqshlusD, NVqshlusQ>( + b, size, machInst, vd, vm, lShiftAmt); + } else { + return decodeNeonUTwoShiftReg<NVqshluD, NVqshluQ>( + b, size, machInst, vd, vm, lShiftAmt); + } + } else { + return decodeNeonSTwoShiftReg<NVqshlD, NVqshlQ>( + b, size, machInst, vd, vm, lShiftAmt); + } case 0x8: if (l) { return new Unknown(machInst); } else if (u) { - if (b) { - return new WarnUnimplemented("vqrshrn, vqrshrun", machInst); - } else { - return new WarnUnimplemented("vqshrn, vqshrun", machInst); - } + return decodeNeonSTwoShiftSReg<NVqshruns, NVqrshruns>( + b, size, machInst, vd, vm, rShiftAmt); } else { - if (b) { - return new WarnUnimplemented("vrshrn", machInst); - } else { - return new WarnUnimplemented("vshrn", machInst); - } + return decodeNeonUTwoShiftSReg<NVshrn, NVrshrn>( + b, size, machInst, vd, vm, rShiftAmt); } case 0x9: if (l) { return new Unknown(machInst); - } else if (b) { - return new WarnUnimplemented("vqrshrn, vqrshrun", machInst); + } else if (u) { + return decodeNeonUTwoShiftSReg<NVqshrun, NVqrshrun>( + b, size, machInst, vd, vm, rShiftAmt); } else { - return new WarnUnimplemented("vqshrn, vqshrun", machInst); + return decodeNeonSTwoShiftSReg<NVqshrn, NVqrshrn>( + b, size, machInst, vd, vm, rShiftAmt); } case 0xa: if (l || b) { return new Unknown(machInst); } else { - // If the shift amount is zero, it's vmovl. 
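The {l, imm6} split computed above can be exercised in isolation; a minimal sketch using an arbitrarily chosen encoding (variable names follow the decoder, the input value is made up):

    #include <cstdio>
    int main() {
        // The highest set bit of {l, imm6} selects the element size
        // (0 = 8b, 1 = 16b, 2 = 32b, 3 = 64b); the bits below it carry the
        // shift amount. An all-zero imm is rejected by the decoder, so the
        // scan below always terminates.
        unsigned l = 0, imm6 = 0x0a;              // example: bits 3 and 1 set
        unsigned imm = (l << 6) | imm6;
        unsigned size = 3, bitSel = 1 << 6;
        while (!(bitSel & imm)) { bitSel >>= 1; size--; }
        unsigned lShiftAmt = imm6 & ~bitSel;      // zero here would mean vmovl
        unsigned rShiftAmt = (size > 2) ? 64 - imm6 : 2 * (8u << size) - imm6;
        std::printf("size=%u lsh=%u rsh=%u\n", size, lShiftAmt, rShiftAmt);
        return 0;
    }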
- return new WarnUnimplemented("vshll, vmovl", machInst); + return decodeNeonUSTwoShiftSReg<NVmovl, NVshll>( + lShiftAmt, u, size, machInst, vd, vm, lShiftAmt); } case 0xe: + if (l) { + return new Unknown(machInst); + } else { + if (bits(imm6, 5) == 0) + return new Unknown(machInst); + if (u) { + if (b) { + return new NVcvtu2fpQ<float>( + machInst, vd, vm, 64 - imm6); + } else { + return new NVcvtu2fpD<float>( + machInst, vd, vm, 64 - imm6); + } + } else { + if (b) { + return new NVcvts2fpQ<float>( + machInst, vd, vm, 64 - imm6); + } else { + return new NVcvts2fpD<float>( + machInst, vd, vm, 64 - imm6); + } + } + } case 0xf: if (l) { return new Unknown(machInst); - } else if (a == 0xe) { - return new WarnUnimplemented("vcvt (fixed to fp)", machInst); - } else if (a == 0xf) { - return new WarnUnimplemented("vcvt (fp to fixed)", machInst); + } else { + if (bits(imm6, 5) == 0) + return new Unknown(machInst); + if (u) { + if (b) { + return new NVcvt2ufxQ<float>( + machInst, vd, vm, 64 - imm6); + } else { + return new NVcvt2ufxD<float>( + machInst, vd, vm, 64 - imm6); + } + } else { + if (b) { + return new NVcvt2sfxQ<float>( + machInst, vd, vm, 64 - imm6); + } else { + return new NVcvt2sfxD<float>( + machInst, vd, vm, 64 - imm6); + } + } } } return new Unknown(machInst); @@ -555,74 +1003,89 @@ let {{ { const bool u = THUMB ? bits(machInst, 28) : bits(machInst, 24); const uint32_t a = bits(machInst, 11, 8); - + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vn = + (IntRegIndex)(2 * (bits(machInst, 19, 16) | + (bits(machInst, 7) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + const unsigned size = bits(machInst, 21, 20); switch (a) { case 0x0: - return new WarnUnimplemented("vaddl", machInst); + return decodeNeonUSThreeUSReg<Vaddl>( + u, size, machInst, vd, vn, vm); case 0x1: - return new WarnUnimplemented("vaddw", machInst); + return decodeNeonUSThreeUSReg<Vaddw>( + u, size, machInst, vd, vn, vm); case 0x2: - return new WarnUnimplemented("vsubl", machInst); + return decodeNeonUSThreeUSReg<Vsubl>( + u, size, machInst, vd, vn, vm); case 0x3: - return new WarnUnimplemented("vsubw", machInst); + return decodeNeonUSThreeUSReg<Vsubw>( + u, size, machInst, vd, vn, vm); case 0x4: if (u) { - return new WarnUnimplemented("vraddhn", machInst); + return decodeNeonUThreeUSReg<Vraddhn>( + size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vaddhn", machInst); + return decodeNeonUThreeUSReg<Vaddhn>( + size, machInst, vd, vn, vm); } case 0x5: - return new WarnUnimplemented("vabal", machInst); + return decodeNeonUSThreeUSReg<Vabal>( + u, size, machInst, vd, vn, vm); case 0x6: if (u) { - return new WarnUnimplemented("vrsubhn", machInst); + return decodeNeonUThreeUSReg<Vrsubhn>( + size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vsubhn", machInst); + return decodeNeonUThreeUSReg<Vsubhn>( + size, machInst, vd, vn, vm); } case 0x7: if (bits(machInst, 23)) { - return new WarnUnimplemented("vabdl (int)", machInst); + return decodeNeonUSThreeUSReg<Vabdl>( + u, size, machInst, vd, vn, vm); } else { - return new WarnUnimplemented("vabd (int)", machInst); + return decodeNeonUSThreeReg<VabdD, VabdQ>( + bits(machInst, 6), u, size, machInst, vd, vn, vm); } case 0x8: - return new WarnUnimplemented("vmlal (int)", machInst); + return decodeNeonUSThreeUSReg<Vmlal>( + u, size, machInst, vd, vn, vm); case 0xa: - return new WarnUnimplemented("vmlsl (int)", 
machInst); + return decodeNeonUSThreeUSReg<Vmlsl>( + u, size, machInst, vd, vn, vm); case 0x9: - if (bits(machInst, 23) == 0) { - if (bits(machInst, 4) == 0) { - if (u) { - return new WarnUnimplemented("vmls (int)", machInst); - } else { - return new WarnUnimplemented("vmla (int)", machInst); - } - } else { - if (u) { - return new WarnUnimplemented("vmul (poly)", machInst); - } else { - return new WarnUnimplemented("vmul (int)", machInst); - } - } + if (u) { + return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmlal", machInst); + return decodeNeonSThreeUSReg<Vqdmlal>( + size, machInst, vd, vn, vm); } case 0xb: - if (!u) { + if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmlsl", machInst); + return decodeNeonSThreeUSReg<Vqdmlsl>( + size, machInst, vd, vn, vm); } case 0xc: - return new WarnUnimplemented("vmull (int)", machInst); + return decodeNeonUSThreeUSReg<Vmull>( + u, size, machInst, vd, vn, vm); case 0xd: - if (!u) { + if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmull", machInst); + return decodeNeonSThreeUSReg<Vqdmull>( + size, machInst, vd, vn, vm); } case 0xe: - return new WarnUnimplemented("vmull (poly)", machInst); + return decodeNeonUThreeUSReg<Vmullp>( + size, machInst, vd, vn, vm); } return new Unknown(machInst); } @@ -632,48 +1095,256 @@ let {{ { const bool u = THUMB ? bits(machInst, 28) : bits(machInst, 24); const uint32_t a = bits(machInst, 11, 8); - + const unsigned size = bits(machInst, 21, 20); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vn = + (IntRegIndex)(2 * (bits(machInst, 19, 16) | + (bits(machInst, 7) << 4))); + const IntRegIndex vm = (size == 2) ? + (IntRegIndex)(2 * bits(machInst, 3, 0)) : + (IntRegIndex)(2 * bits(machInst, 2, 0)); + const unsigned index = (size == 2) ? 
(unsigned)bits(machInst, 5) : + (bits(machInst, 3) | (bits(machInst, 5) << 1)); switch (a) { case 0x0: - return new WarnUnimplemented("vmla (int scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new VmlasQ<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmlasQ<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VmlasD<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmlasD<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x1: - return new WarnUnimplemented("vmla (fp scalar)", machInst); + if (u) + return new VmlasQFp<float>(machInst, vd, vn, vm, index); + else + return new VmlasDFp<float>(machInst, vd, vn, vm, index); case 0x4: - return new WarnUnimplemented("vmls (int scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new VmlssQ<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmlssQ<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VmlssD<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmlssD<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x5: - return new WarnUnimplemented("vmls (fp scalar)", machInst); + if (u) + return new VmlssQFp<float>(machInst, vd, vn, vm, index); + else + return new VmlssDFp<float>(machInst, vd, vn, vm, index); case 0x2: - return new WarnUnimplemented("vmlal (scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new Vmlals<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmlals<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new Vmlals<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmlals<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x6: - return new WarnUnimplemented("vmlsl (scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new Vmlsls<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmlsls<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new Vmlsls<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmlsls<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x3: if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmlal", machInst); + switch (size) { + case 1: + return new Vqdmlals<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vqdmlals<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } } case 0x7: if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmlsl", machInst); + switch (size) { + case 1: + return new Vqdmlsls<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vqdmlsls<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } } case 0x8: - return new WarnUnimplemented("vmul (int scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new VmulsQ<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new VmulsQ<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VmulsD<uint16_t>(machInst, vd, vn, vm, 
index); + case 2: + return new VmulsD<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0x9: - return new WarnUnimplemented("vmul (fp scalar)", machInst); + if (u) + return new VmulsQFp<float>(machInst, vd, vn, vm, index); + else + return new VmulsDFp<float>(machInst, vd, vn, vm, index); case 0xa: - return new WarnUnimplemented("vmull (scalar)", machInst); + if (u) { + switch (size) { + case 1: + return new Vmulls<uint16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmulls<uint32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new Vmulls<int16_t>(machInst, vd, vn, vm, index); + case 2: + return new Vmulls<int32_t>(machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0xb: if (u) { return new Unknown(machInst); } else { - return new WarnUnimplemented("vqdmull", machInst); + switch (size) { + case 1: + return new Vqdmulls<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new Vqdmulls<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } } case 0xc: - return new WarnUnimplemented("vqdmulh", machInst); + if (u) { + switch (size) { + case 1: + return new VqdmulhsQ<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new VqdmulhsQ<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VqdmulhsD<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new VqdmulhsD<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } case 0xd: - return new WarnUnimplemented("vqrdmulh", machInst); + if (u) { + switch (size) { + case 1: + return new VqrdmulhsQ<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new VqrdmulhsQ<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 1: + return new VqrdmulhsD<int16_t>( + machInst, vd, vn, vm, index); + case 2: + return new VqrdmulhsD<int32_t>( + machInst, vd, vn, vm, index); + default: + return new Unknown(machInst); + } + } } return new Unknown(machInst); } @@ -683,85 +1354,234 @@ let {{ { const uint32_t a = bits(machInst, 17, 16); const uint32_t b = bits(machInst, 10, 6); + const bool q = bits(machInst, 6); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + const unsigned size = bits(machInst, 19, 18); switch (a) { case 0x0: switch (bits(b, 4, 1)) { case 0x0: - return new WarnUnimplemented("vrev64", machInst); + switch (size) { + case 0: + if (q) { + return new NVrev64Q<uint8_t>(machInst, vd, vm); + } else { + return new NVrev64D<uint8_t>(machInst, vd, vm); + } + case 1: + if (q) { + return new NVrev64Q<uint16_t>(machInst, vd, vm); + } else { + return new NVrev64D<uint16_t>(machInst, vd, vm); + } + case 2: + if (q) { + return new NVrev64Q<uint32_t>(machInst, vd, vm); + } else { + return new NVrev64D<uint32_t>(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } case 0x1: - return new WarnUnimplemented("vrev32", machInst); + switch (size) { + 
case 0: + if (q) { + return new NVrev32Q<uint8_t>(machInst, vd, vm); + } else { + return new NVrev32D<uint8_t>(machInst, vd, vm); + } + case 1: + if (q) { + return new NVrev32Q<uint16_t>(machInst, vd, vm); + } else { + return new NVrev32D<uint16_t>(machInst, vd, vm); + } + default: + return new Unknown(machInst); + } case 0x2: - return new WarnUnimplemented("vrev16", machInst); + if (size != 0) { + return new Unknown(machInst); + } else if (q) { + return new NVrev16Q<uint8_t>(machInst, vd, vm); + } else { + return new NVrev16D<uint8_t>(machInst, vd, vm); + } case 0x4: + return decodeNeonSTwoMiscSReg<NVpaddlD, NVpaddlQ>( + q, size, machInst, vd, vm); case 0x5: - return new WarnUnimplemented("vpaddl", machInst); + return decodeNeonUTwoMiscSReg<NVpaddlD, NVpaddlQ>( + q, size, machInst, vd, vm); case 0x8: - return new WarnUnimplemented("vcls", machInst); + return decodeNeonSTwoMiscReg<NVclsD, NVclsQ>( + q, size, machInst, vd, vm); case 0x9: - return new WarnUnimplemented("vclz", machInst); + return decodeNeonSTwoMiscReg<NVclzD, NVclzQ>( + q, size, machInst, vd, vm); case 0xa: - return new WarnUnimplemented("vcnt", machInst); + return decodeNeonUTwoMiscReg<NVcntD, NVcntQ>( + q, size, machInst, vd, vm); case 0xb: - return new WarnUnimplemented("vmvn (reg)", machInst); + if (q) + return new NVmvnQ<uint64_t>(machInst, vd, vm); + else + return new NVmvnD<uint64_t>(machInst, vd, vm); case 0xc: + return decodeNeonSTwoMiscSReg<NVpadalD, NVpadalQ>( + q, size, machInst, vd, vm); case 0xd: - return new WarnUnimplemented("vpadal", machInst); + return decodeNeonUTwoMiscSReg<NVpadalD, NVpadalQ>( + q, size, machInst, vd, vm); case 0xe: - return new WarnUnimplemented("vqabs", machInst); + return decodeNeonSTwoMiscReg<NVqabsD, NVqabsQ>( + q, size, machInst, vd, vm); case 0xf: - return new WarnUnimplemented("vqneg", machInst); + return decodeNeonSTwoMiscReg<NVqnegD, NVqnegQ>( + q, size, machInst, vd, vm); default: return new Unknown(machInst); } case 0x1: switch (bits(b, 3, 1)) { case 0x0: - return new WarnUnimplemented("vcgt (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVcgtQFp<float>(machInst, vd, vm); + } else { + return new NVcgtDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVcgtD, NVcgtQ>( + q, size, machInst, vd, vm); + } case 0x1: - return new WarnUnimplemented("vcge (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVcgeQFp<float>(machInst, vd, vm); + } else { + return new NVcgeDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVcgeD, NVcgeQ>( + q, size, machInst, vd, vm); + } case 0x2: - return new WarnUnimplemented("vceq (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVceqQFp<float>(machInst, vd, vm); + } else { + return new NVceqDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVceqD, NVceqQ>( + q, size, machInst, vd, vm); + } case 0x3: - return new WarnUnimplemented("vcle (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVcleQFp<float>(machInst, vd, vm); + } else { + return new NVcleDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVcleD, NVcleQ>( + q, size, machInst, vd, vm); + } case 0x4: - return new WarnUnimplemented("vclt (imm #0)", machInst); + if (bits(b, 4)) { + if (q) { + return new NVcltQFp<float>(machInst, vd, vm); + } else { + return new NVcltDFp<float>(machInst, vd, vm); + } + } else { + return decodeNeonSTwoMiscReg<NVcltD, NVcltQ>( + q, size, machInst, vd, vm); + } case 0x6: - return new 
WarnUnimplemented("vabs (imm #0)", machInst); + if (bits(machInst, 10)) { + if (q) + return new NVabsQFp<float>(machInst, vd, vm); + else + return new NVabsDFp<float>(machInst, vd, vm); + } else { + return decodeNeonSTwoMiscReg<NVabsD, NVabsQ>( + q, size, machInst, vd, vm); + } case 0x7: - return new WarnUnimplemented("vneg (imm #0)", machInst); + if (bits(machInst, 10)) { + if (q) + return new NVnegQFp<float>(machInst, vd, vm); + else + return new NVnegDFp<float>(machInst, vd, vm); + } else { + return decodeNeonSTwoMiscReg<NVnegD, NVnegQ>( + q, size, machInst, vd, vm); + } } case 0x2: switch (bits(b, 4, 1)) { case 0x0: - return new WarnUnimplemented("vswp", machInst); + if (q) + return new NVswpQ<uint64_t>(machInst, vd, vm); + else + return new NVswpD<uint64_t>(machInst, vd, vm); case 0x1: - return new WarnUnimplemented("vtrn", machInst); + return decodeNeonUTwoMiscReg<NVtrnD, NVtrnQ>( + q, size, machInst, vd, vm); case 0x2: - return new WarnUnimplemented("vuzp", machInst); + return decodeNeonUTwoMiscReg<NVuzpD, NVuzpQ>( + q, size, machInst, vd, vm); case 0x3: - return new WarnUnimplemented("vzip", machInst); + return decodeNeonUTwoMiscReg<NVzipD, NVzipQ>( + q, size, machInst, vd, vm); case 0x4: if (b == 0x8) { - return new WarnUnimplemented("vmovn", machInst); + return decodeNeonUTwoMiscUSReg<NVmovn>( + size, machInst, vd, vm); } else { - return new WarnUnimplemented("vqmovun", machInst); + return decodeNeonSTwoMiscUSReg<NVqmovuns>( + size, machInst, vd, vm); } case 0x5: - return new WarnUnimplemented("vqmovn", machInst); + if (q) { + return decodeNeonUTwoMiscUSReg<NVqmovun>( + size, machInst, vd, vm); + } else { + return decodeNeonSTwoMiscUSReg<NVqmovn>( + size, machInst, vd, vm); + } case 0x6: if (b == 0xc) { - return new WarnUnimplemented("vshll", machInst); + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + unsigned size = bits(machInst, 19, 18); + return decodeNeonSTwoShiftUSReg<NVshll>( + size, machInst, vd, vm, 8 << size); } else { return new Unknown(machInst); } case 0xc: case 0xe: if (b == 0x18) { - return new WarnUnimplemented("vcvt (single to half)", - machInst); + if (size != 1 || (vm % 2)) + return new Unknown(machInst); + return new NVcvts2h<uint16_t>(machInst, vd, vm); } else if (b == 0x1c) { - return new WarnUnimplemented("vcvt (half to single)", - machInst); + if (size != 1 || (vd % 2)) + return new Unknown(machInst); + return new NVcvth2s<uint16_t>(machInst, vd, vm); } else { return new Unknown(machInst); } @@ -770,11 +1590,75 @@ let {{ } case 0x3: if (bits(b, 4, 3) == 0x3) { - return new WarnUnimplemented("vcvt (fp and int)", machInst); + if ((q && (vd % 2 || vm % 2)) || size != 2) { + return new Unknown(machInst); + } else { + if (bits(b, 2)) { + if (bits(b, 1)) { + if (q) { + return new NVcvt2ufxQ<float>( + machInst, vd, vm, 0); + } else { + return new NVcvt2ufxD<float>( + machInst, vd, vm, 0); + } + } else { + if (q) { + return new NVcvt2sfxQ<float>( + machInst, vd, vm, 0); + } else { + return new NVcvt2sfxD<float>( + machInst, vd, vm, 0); + } + } + } else { + if (bits(b, 1)) { + if (q) { + return new NVcvtu2fpQ<float>( + machInst, vd, vm, 0); + } else { + return new NVcvtu2fpD<float>( + machInst, vd, vm, 0); + } + } else { + if (q) { + return new NVcvts2fpQ<float>( + machInst, vd, vm, 0); + } else { + return new NVcvts2fpD<float>( + machInst, vd, vm, 0); + } + } + } + } } else if ((b & 0x1a) == 0x10) { - return new 
WarnUnimplemented("vrecpe", machInst); + if (bits(b, 2)) { + if (q) { + return new NVrecpeQFp<float>(machInst, vd, vm); + } else { + return new NVrecpeDFp<float>(machInst, vd, vm); + } + } else { + if (q) { + return new NVrecpeQ<uint32_t>(machInst, vd, vm); + } else { + return new NVrecpeD<uint32_t>(machInst, vd, vm); + } + } } else if ((b & 0x1a) == 0x12) { - return new WarnUnimplemented("vrsqrte", machInst); + if (bits(b, 2)) { + if (q) { + return new NVrsqrteQFp<float>(machInst, vd, vm); + } else { + return new NVrsqrteDFp<float>(machInst, vd, vm); + } + } else { + if (q) { + return new NVrsqrteQ<uint32_t>(machInst, vd, vm); + } else { + return new NVrsqrteD<uint32_t>(machInst, vd, vm); + } + } } else { return new Unknown(machInst); } @@ -799,29 +1683,76 @@ let {{ } } else if ((c & 0x9) == 9) { return decodeNeonTwoRegAndShift(machInst); - } else if ((c & 0x5) == 0) { - if (bits(a, 3, 2) != 0x3) { + } else if (bits(a, 2, 1) != 0x3) { + if ((c & 0x5) == 0) { return decodeNeonThreeRegDiffLengths(machInst); - } - } else if ((c & 0x5) == 4) { - if (bits(a, 3, 2) != 0x3) { + } else if ((c & 0x5) == 4) { return decodeNeonTwoRegScalar(machInst); } } else if ((a & 0x16) == 0x16) { + const IntRegIndex vd = + (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + const IntRegIndex vn = + (IntRegIndex)(2 * (bits(machInst, 19, 16) | + (bits(machInst, 7) << 4))); + const IntRegIndex vm = + (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); if (!u) { if (bits(c, 0) == 0) { - return new WarnUnimplemented("vext", machInst); + unsigned imm4 = bits(machInst, 11, 8); + bool q = bits(machInst, 6); + if (imm4 >= 16 && !q) + return new Unknown(machInst); + if (q) { + return new NVextQ<uint8_t>(machInst, vd, vn, vm, imm4); + } else { + return new NVextD<uint8_t>(machInst, vd, vn, vm, imm4); + } } } else if (bits(b, 3) == 0 && bits(c, 0) == 0) { return decodeNeonTwoRegMisc(machInst); } else if (bits(b, 3, 2) == 0x2 && bits(c, 0) == 0) { + unsigned length = bits(machInst, 9, 8) + 1; + if ((uint32_t)vn / 2 + length > 32) + return new Unknown(machInst); if (bits(machInst, 6) == 0) { - return new WarnUnimplemented("vtbl", machInst); + switch (length) { + case 1: + return new NVtbl1(machInst, vd, vn, vm); + case 2: + return new NVtbl2(machInst, vd, vn, vm); + case 3: + return new NVtbl3(machInst, vd, vn, vm); + case 4: + return new NVtbl4(machInst, vd, vn, vm); + } } else { - return new WarnUnimplemented("vtbx", machInst); + switch (length) { + case 1: + return new NVtbx1(machInst, vd, vn, vm); + case 2: + return new NVtbx2(machInst, vd, vn, vm); + case 3: + return new NVtbx3(machInst, vd, vn, vm); + case 4: + return new NVtbx4(machInst, vd, vn, vm); + } } } else if (b == 0xc && (c & 0x9) == 0) { - return new WarnUnimplemented("vdup (scalar)", machInst); + unsigned imm4 = bits(machInst, 19, 16); + if (bits(imm4, 2, 0) == 0) + return new Unknown(machInst); + unsigned size = 0; + while ((imm4 & 0x1) == 0) { + size++; + imm4 >>= 1; + } + unsigned index = imm4 >> 1; + const bool q = bits(machInst, 6); + return decodeNeonUTwoShiftSReg<NVdupD, NVdupQ>( + q, size, machInst, vd, vm, index); } } return new Unknown(machInst); @@ -837,7 +1768,7 @@ def format ThumbNeonMem() {{ def format ThumbNeonData() {{ decode_block = ''' - return decodeNeonMem(machInst); + return decodeNeonData(machInst); ''' }}; @@ -893,7 +1824,7 @@ let {{ break; case 0x1: { - if (offset == 0 || vd + offset > NumFloatArchRegs) { + if (offset == 0 || vd + offset/2 > NumFloatArchRegs) { break; } switch (bits(opcode, 
1, 0)) { @@ -1044,40 +1975,51 @@ let {{ if (bits(a, 2) == 0) { uint32_t vd = (bits(machInst, 7) << 5) | (bits(machInst, 19, 16) << 1); - uint32_t index, size; + // Handle accessing each single precision half of the vector. + vd += bits(machInst, 21); const IntRegIndex rt = (IntRegIndex)(uint32_t)bits(machInst, 15, 12); if (bits(machInst, 22) == 1) { - size = 8; - index = (bits(machInst, 21) << 2) | - bits(machInst, 6, 5); + return new VmovCoreRegB(machInst, (IntRegIndex)vd, + rt, bits(machInst, 6, 5)); } else if (bits(machInst, 5) == 1) { - size = 16; - index = (bits(machInst, 21) << 1) | - bits(machInst, 6); + return new VmovCoreRegH(machInst, (IntRegIndex)vd, + rt, bits(machInst, 6)); } else if (bits(machInst, 6) == 0) { - size = 32; - index = bits(machInst, 21); + return new VmovCoreRegW(machInst, (IntRegIndex)vd, rt); } else { return new Unknown(machInst); } - if (index >= (32 / size)) { - index -= (32 / size); - vd++; - } - switch (size) { - case 8: - return new VmovCoreRegB(machInst, (IntRegIndex)vd, - rt, index); - case 16: - return new VmovCoreRegH(machInst, (IntRegIndex)vd, - rt, index); - case 32: - return new VmovCoreRegW(machInst, (IntRegIndex)vd, rt); - } } else if (bits(b, 1) == 0) { - // A8-594 - return new WarnUnimplemented("vdup", machInst); + bool q = bits(machInst, 21); + unsigned be = (bits(machInst, 22) << 1) | (bits(machInst, 5)); + IntRegIndex vd = (IntRegIndex)(2 * (uint32_t) + (bits(machInst, 19, 16) | (bits(machInst, 7) << 4))); + IntRegIndex rt = (IntRegIndex)(uint32_t) + bits(machInst, 15, 12); + if (q) { + switch (be) { + case 0: + return new NVdupQGpr<uint32_t>(machInst, vd, rt); + case 1: + return new NVdupQGpr<uint16_t>(machInst, vd, rt); + case 2: + return new NVdupQGpr<uint8_t>(machInst, vd, rt); + case 3: + return new Unknown(machInst); + } + } else { + switch (be) { + case 0: + return new NVdupDGpr<uint32_t>(machInst, vd, rt); + case 1: + return new NVdupDGpr<uint16_t>(machInst, vd, rt); + case 2: + return new NVdupDGpr<uint8_t>(machInst, vd, rt); + case 3: + return new Unknown(machInst); + } + } } } else if (l == 1 && c == 0) { if (a == 0) { @@ -1128,30 +2070,14 @@ let {{ } else { uint32_t vd = (bits(machInst, 7) << 5) | (bits(machInst, 19, 16) << 1); - uint32_t index, size; + // Handle indexing into each single precision half of the vector. 
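The vd adjustment that follows folds the word-select bit of the D-register lane index into a single-precision register number; a small sketch of that arithmetic (the field values are an arbitrary example, not taken from a real encoding):

    #include <cstdio>
    int main() {
        // A D register is two consecutive S registers, so encoding bit 21
        // can pick the word half directly in the S-register index, leaving
        // the immediate to carry only the byte/halfword lane within a word.
        unsigned bit7 = 0, bits19_16 = 0x1, bit21 = 1, bits6_5 = 2;
        unsigned vd = (bit7 << 5) | (bits19_16 << 1);  // S-reg index: s2
        vd += bit21;                                   // high word: s3
        unsigned byteLane = bits6_5;                   // byte within the word
        std::printf("s%u, byte lane %u\n", vd, byteLane);
        return 0;
    }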
+ vd += bits(machInst, 21); + uint32_t index; const IntRegIndex rt = (IntRegIndex)(uint32_t)bits(machInst, 15, 12); const bool u = (bits(machInst, 23) == 1); if (bits(machInst, 22) == 1) { - size = 8; - index = (bits(machInst, 21) << 2) | - bits(machInst, 6, 5); - } else if (bits(machInst, 5) == 1) { - size = 16; - index = (bits(machInst, 21) << 1) | - bits(machInst, 6); - } else if (bits(machInst, 6) == 0 && !u) { - size = 32; - index = bits(machInst, 21); - } else { - return new Unknown(machInst); - } - if (index >= (32 / size)) { - index -= (32 / size); - vd++; - } - switch (size) { - case 8: + index = bits(machInst, 6, 5); if (u) { return new VmovRegCoreUB(machInst, rt, (IntRegIndex)vd, index); @@ -1159,7 +2085,8 @@ let {{ return new VmovRegCoreSB(machInst, rt, (IntRegIndex)vd, index); } - case 16: + } else if (bits(machInst, 5) == 1) { + index = bits(machInst, 6); if (u) { return new VmovRegCoreUH(machInst, rt, (IntRegIndex)vd, index); @@ -1167,8 +2094,10 @@ let {{ return new VmovRegCoreSH(machInst, rt, (IntRegIndex)vd, index); } - case 32: + } else if (bits(machInst, 6) == 0 && !u) { return new VmovRegCoreW(machInst, rt, (IntRegIndex)vd); + } else { + return new Unknown(machInst); } } return new Unknown(machInst); diff --git a/src/arch/arm/isa/insts/fp.isa b/src/arch/arm/isa/insts/fp.isa index c4682b66c..9748c8a49 100644 --- a/src/arch/arm/isa/insts/fp.isa +++ b/src/arch/arm/isa/insts/fp.isa @@ -282,7 +282,7 @@ let {{ exec_output += PredOpExecute.subst(vmovRegQIop); vmovCoreRegBCode = ''' - FpDest.uw = insertBits(FpDest.uw, imm * 8, imm * 8 + 7, Op1.ub); + FpDest.uw = insertBits(FpDest.uw, imm * 8 + 7, imm * 8, Op1.ub); ''' vmovCoreRegBIop = InstObjParams("vmov", "VmovCoreRegB", "FpRegRegImmOp", { "code": vmovCoreRegBCode, @@ -292,7 +292,7 @@ let {{ exec_output += PredOpExecute.subst(vmovCoreRegBIop); vmovCoreRegHCode = ''' - FpDest.uw = insertBits(FpDest.uw, imm * 16, imm * 16 + 15, Op1.uh); + FpDest.uw = insertBits(FpDest.uw, imm * 16 + 15, imm * 16, Op1.uh); ''' vmovCoreRegHIop = InstObjParams("vmov", "VmovCoreRegH", "FpRegRegImmOp", { "code": vmovCoreRegHCode, @@ -312,7 +312,8 @@ let {{ exec_output += PredOpExecute.subst(vmovCoreRegWIop); vmovRegCoreUBCode = ''' - Dest = bits(FpOp1.uw, imm * 8, imm * 8 + 7); + assert(imm < 4); + Dest = bits(FpOp1.uw, imm * 8 + 7, imm * 8); ''' vmovRegCoreUBIop = InstObjParams("vmov", "VmovRegCoreUB", "FpRegRegImmOp", { "code": vmovRegCoreUBCode, @@ -322,7 +323,8 @@ let {{ exec_output += PredOpExecute.subst(vmovRegCoreUBIop); vmovRegCoreUHCode = ''' - Dest = bits(FpOp1.uw, imm * 16, imm * 16 + 15); + assert(imm < 2); + Dest = bits(FpOp1.uw, imm * 16 + 15, imm * 16); ''' vmovRegCoreUHIop = InstObjParams("vmov", "VmovRegCoreUH", "FpRegRegImmOp", { "code": vmovRegCoreUHCode, @@ -332,7 +334,8 @@ let {{ exec_output += PredOpExecute.subst(vmovRegCoreUHIop); vmovRegCoreSBCode = ''' - Dest = sext<8>(bits(FpOp1.uw, imm * 8, imm * 8 + 7)); + assert(imm < 4); + Dest = sext<8>(bits(FpOp1.uw, imm * 8 + 7, imm * 8)); ''' vmovRegCoreSBIop = InstObjParams("vmov", "VmovRegCoreSB", "FpRegRegImmOp", { "code": vmovRegCoreSBCode, @@ -342,7 +345,8 @@ let {{ exec_output += PredOpExecute.subst(vmovRegCoreSBIop); vmovRegCoreSHCode = ''' - Dest = sext<16>(bits(FpOp1.uw, imm * 16, imm * 16 + 15)); + assert(imm < 2); + Dest = sext<16>(bits(FpOp1.uw, imm * 16 + 15, imm * 16)); ''' vmovRegCoreSHIop = InstObjParams("vmov", "VmovRegCoreSH", "FpRegRegImmOp", { "code": vmovRegCoreSHCode, @@ -396,7 +400,7 @@ let {{ Fpscr = fpscr; ''' singleBinOp = "binaryOp(fpscr, FpOp1, FpOp2," + 
\ - "%(func)s, fpscr.fz, fpscr.rMode)" + "%(func)s, fpscr.fz, fpscr.dn, fpscr.rMode)" singleUnaryOp = "unaryOp(fpscr, FpOp1, %(func)s, fpscr.fz, fpscr.rMode)" doubleCode = ''' FPSCR fpscr = Fpscr; @@ -408,7 +412,7 @@ let {{ doubleBinOp = ''' binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - %(func)s, fpscr.fz, fpscr.rMode); + %(func)s, fpscr.fz, fpscr.dn, fpscr.rMode); ''' doubleUnaryOp = ''' unaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), %(func)s, @@ -499,8 +503,9 @@ let {{ vmlaSCode = ''' FPSCR fpscr = Fpscr; float mid = binaryOp(fpscr, FpOp1, FpOp2, - fpMulS, fpscr.fz, fpscr.rMode); - FpDest = binaryOp(fpscr, FpDest, mid, fpAddS, fpscr.fz, fpscr.rMode); + fpMulS, fpscr.fz, fpscr.dn, fpscr.rMode); + FpDest = binaryOp(fpscr, FpDest, mid, fpAddS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vmlaSIop = InstObjParams("vmlas", "VmlaS", "FpRegRegRegOp", @@ -514,9 +519,10 @@ let {{ FPSCR fpscr = Fpscr; double mid = binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, fpscr.rMode); double dest = binaryOp(fpscr, dbl(FpDestP0.uw, FpDestP1.uw), - mid, fpAddD, fpscr.fz, fpscr.rMode); + mid, fpAddD, fpscr.fz, + fpscr.dn, fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -531,8 +537,9 @@ let {{ vmlsSCode = ''' FPSCR fpscr = Fpscr; float mid = binaryOp(fpscr, FpOp1, FpOp2, - fpMulS, fpscr.fz, fpscr.rMode); - FpDest = binaryOp(fpscr, FpDest, -mid, fpAddS, fpscr.fz, fpscr.rMode); + fpMulS, fpscr.fz, fpscr.dn, fpscr.rMode); + FpDest = binaryOp(fpscr, FpDest, -mid, fpAddS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vmlsSIop = InstObjParams("vmlss", "VmlsS", "FpRegRegRegOp", @@ -546,9 +553,10 @@ let {{ FPSCR fpscr = Fpscr; double mid = binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, fpscr.rMode); double dest = binaryOp(fpscr, dbl(FpDestP0.uw, FpDestP1.uw), - -mid, fpAddD, fpscr.fz, fpscr.rMode); + -mid, fpAddD, fpscr.fz, + fpscr.dn, fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -563,8 +571,9 @@ let {{ vnmlaSCode = ''' FPSCR fpscr = Fpscr; float mid = binaryOp(fpscr, FpOp1, FpOp2, - fpMulS, fpscr.fz, fpscr.rMode); - FpDest = binaryOp(fpscr, -FpDest, -mid, fpAddS, fpscr.fz, fpscr.rMode); + fpMulS, fpscr.fz, fpscr.dn, fpscr.rMode); + FpDest = binaryOp(fpscr, -FpDest, -mid, fpAddS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vnmlaSIop = InstObjParams("vnmlas", "VnmlaS", "FpRegRegRegOp", @@ -578,9 +587,10 @@ let {{ FPSCR fpscr = Fpscr; double mid = binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, fpscr.rMode); double dest = binaryOp(fpscr, -dbl(FpDestP0.uw, FpDestP1.uw), - -mid, fpAddD, fpscr.fz, fpscr.rMode); + -mid, fpAddD, fpscr.fz, + fpscr.dn, fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -595,8 +605,9 @@ let {{ vnmlsSCode = ''' FPSCR fpscr = Fpscr; float mid = binaryOp(fpscr, FpOp1, FpOp2, - fpMulS, fpscr.fz, fpscr.rMode); - FpDest = binaryOp(fpscr, -FpDest, mid, fpAddS, fpscr.fz, fpscr.rMode); + fpMulS, fpscr.fz, fpscr.dn, fpscr.rMode); + FpDest = binaryOp(fpscr, -FpDest, mid, fpAddS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vnmlsSIop = InstObjParams("vnmlss", "VnmlsS", "FpRegRegRegOp", @@ -610,9 +621,10 @@ let {{ FPSCR fpscr = Fpscr; double mid = 
binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, fpscr.rMode); double dest = binaryOp(fpscr, -dbl(FpDestP0.uw, FpDestP1.uw), - mid, fpAddD, fpscr.fz, fpscr.rMode); + mid, fpAddD, fpscr.fz, + fpscr.dn, fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -626,7 +638,8 @@ let {{ vnmulSCode = ''' FPSCR fpscr = Fpscr; - FpDest = -binaryOp(fpscr, FpOp1, FpOp2, fpMulS, fpscr.fz, fpscr.rMode); + FpDest = -binaryOp(fpscr, FpOp1, FpOp2, fpMulS, + fpscr.fz, fpscr.dn, fpscr.rMode); Fpscr = fpscr; ''' vnmulSIop = InstObjParams("vnmuls", "VnmulS", "FpRegRegRegOp", @@ -640,7 +653,8 @@ let {{ FPSCR fpscr = Fpscr; double dest = -binaryOp(fpscr, dbl(FpOp1P0.uw, FpOp1P1.uw), dbl(FpOp2P0.uw, FpOp2P1.uw), - fpMulD, fpscr.fz, fpscr.rMode); + fpMulD, fpscr.fz, fpscr.dn, + fpscr.rMode); Fpscr = fpscr; FpDestP0.uw = dblLow(dest); FpDestP1.uw = dblHi(dest); @@ -665,7 +679,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1.uw) : "m" (FpOp1.uw)); FpDest = FpOp1.uw; __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtUIntFpSIop = InstObjParams("vcvt", "VcvtUIntFpS", "FpRegRegOp", @@ -681,7 +695,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1P0.uw) : "m" (FpOp1P0.uw)); double cDest = (uint64_t)FpOp1P0.uw; __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -699,7 +713,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1.sw) : "m" (FpOp1.sw)); FpDest = FpOp1.sw; __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtSIntFpSIop = InstObjParams("vcvt", "VcvtSIntFpS", "FpRegRegOp", @@ -715,7 +729,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1P0.sw) : "m" (FpOp1P0.sw)); double cDest = FpOp1P0.sw; __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -734,7 +748,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uw = vfpFpSToFixed(FpOp1, false, false, 0, false); __asm__ __volatile__("" :: "m" (FpDest.uw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpUIntSRIop = InstObjParams("vcvt", "VcvtFpUIntSR", "FpRegRegOp", @@ -752,7 +766,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t result = vfpFpDToFixed(cOp1, false, false, 0, false); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; ''' @@ -770,7 +784,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sw = vfpFpSToFixed(FpOp1, true, false, 0, false); __asm__ __volatile__("" :: "m" (FpDest.sw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSIntSRIop = InstObjParams("vcvtr", "VcvtFpSIntSR", "FpRegRegOp", @@ -788,7 +802,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); int64_t result = vfpFpDToFixed(cOp1, true, false, 0, false); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; ''' @@ -807,7 +821,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uw = vfpFpSToFixed(FpOp1, false, false, 
0); __asm__ __volatile__("" :: "m" (FpDest.uw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpUIntSIop = InstObjParams("vcvt", "VcvtFpUIntS", "FpRegRegOp", @@ -826,7 +840,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t result = vfpFpDToFixed(cOp1, false, false, 0); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; ''' @@ -845,7 +859,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sw = vfpFpSToFixed(FpOp1, true, false, 0); __asm__ __volatile__("" :: "m" (FpDest.sw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSIntSIop = InstObjParams("vcvt", "VcvtFpSIntS", "FpRegRegOp", @@ -864,7 +878,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); int64_t result = vfpFpDToFixed(cOp1, true, false, 0); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; ''' @@ -882,7 +896,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); double cDest = fixFpSFpDDest(Fpscr, FpOp1); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -902,7 +916,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); FpDest = fixFpDFpSDest(Fpscr, cOp1); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpDFpSIop = InstObjParams("vcvt", "VcvtFpDFpS", "FpRegRegOp", @@ -917,9 +931,10 @@ let {{ vfpFlushToZero(fpscr, FpOp1); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); - FpDest = vcvtFpHFpS(fpscr, FpOp1, true); + FpDest = vcvtFpHFpS(fpscr, fpscr.dn, fpscr.ahp, + bits(fpToBits(FpOp1), 31, 16)); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpHTFpSIop = InstObjParams("vcvtt", "VcvtFpHTFpS", "FpRegRegOp", @@ -933,9 +948,10 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); - FpDest = vcvtFpHFpS(fpscr, FpOp1, false); + FpDest = vcvtFpHFpS(fpscr, fpscr.dn, fpscr.ahp, + bits(fpToBits(FpOp1), 15, 0)); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpHBFpSIop = InstObjParams("vcvtb", "VcvtFpHBFpS", "FpRegRegOp", @@ -949,11 +965,13 @@ let {{ FPSCR fpscr = Fpscr; vfpFlushToZero(fpscr, FpOp1); VfpSavedState state = prepFpState(fpscr.rMode); - __asm__ __volatile__("" : "=m" (FpOp1), "=m" (FpDest) - : "m" (FpOp1), "m" (FpDest)); - FpDest = vcvtFpSFpH(fpscr, FpOp1, FpDest, true); - __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + __asm__ __volatile__("" : "=m" (FpOp1), "=m" (FpDest.uw) + : "m" (FpOp1), "m" (FpDest.uw)); + FpDest.uw = insertBits(FpDest.uw, 31, 16, + vcvtFpSFpH(fpscr, fpscr.fz, fpscr.dn, + fpscr.rMode, fpscr.ahp, FpOp1)); + __asm__ __volatile__("" :: "m" (FpDest.uw)); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSFpHTIop = InstObjParams("vcvtt", "VcvtFpSFpHT", "FpRegRegOp", @@ -967,11 +985,13 @@ let {{ FPSCR fpscr = Fpscr; vfpFlushToZero(fpscr, FpOp1); VfpSavedState state = prepFpState(fpscr.rMode); - __asm__ __volatile__("" : "=m" (FpOp1), 
"=m" (FpDest) - : "m" (FpOp1), "m" (FpDest)); - FpDest = vcvtFpSFpH(fpscr, FpOp1, FpDest, false); - __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + __asm__ __volatile__("" : "=m" (FpOp1), "=m" (FpDest.uw) + : "m" (FpOp1), "m" (FpDest.uw)); + FpDest.uw = insertBits(FpDest.uw, 15, 0, + vcvtFpSFpH(fpscr, fpscr.fz, fpscr.dn, + fpscr.rMode, fpscr.ahp, FpOp1)); + __asm__ __volatile__("" :: "m" (FpDest.uw)); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSFpHBIop = InstObjParams("vcvtb", "VcvtFpSFpHB", "FpRegRegOp", @@ -1201,7 +1221,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sw = vfpFpSToFixed(FpOp1, true, false, imm); __asm__ __volatile__("" :: "m" (FpDest.sw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSFixedSIop = InstObjParams("vcvt", "VcvtFpSFixedS", "FpRegRegImmOp", @@ -1219,7 +1239,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t mid = vfpFpDToFixed(cOp1, true, false, imm); __asm__ __volatile__("" :: "m" (mid)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = mid; FpDestP1.uw = mid >> 32; @@ -1238,7 +1258,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uw = vfpFpSToFixed(FpOp1, false, false, imm); __asm__ __volatile__("" :: "m" (FpDest.uw)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpUFixedSIop = InstObjParams("vcvt", "VcvtFpUFixedS", "FpRegRegImmOp", @@ -1256,7 +1276,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t mid = vfpFpDToFixed(cOp1, false, false, imm); __asm__ __volatile__("" :: "m" (mid)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = mid; FpDestP1.uw = mid >> 32; @@ -1272,9 +1292,9 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1.sw) : "m" (FpOp1.sw)); - FpDest = vfpSFixedToFpS(Fpscr, FpOp1.sw, false, imm); + FpDest = vfpSFixedToFpS(fpscr.fz, fpscr.dn, FpOp1.sw, false, imm); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtSFixedFpSIop = InstObjParams("vcvt", "VcvtSFixedFpS", "FpRegRegImmOp", @@ -1289,9 +1309,9 @@ let {{ uint64_t mid = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (mid) : "m" (mid)); - double cDest = vfpSFixedToFpD(Fpscr, mid, false, imm); + double cDest = vfpSFixedToFpD(fpscr.fz, fpscr.dn, mid, false, imm); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -1307,9 +1327,9 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1.uw) : "m" (FpOp1.uw)); - FpDest = vfpUFixedToFpS(Fpscr, FpOp1.uw, false, imm); + FpDest = vfpUFixedToFpS(fpscr.fz, fpscr.dn, FpOp1.uw, false, imm); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtUFixedFpSIop = InstObjParams("vcvt", "VcvtUFixedFpS", "FpRegRegImmOp", @@ -1324,9 +1344,9 @@ let {{ uint64_t mid = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (mid) : "m" (mid)); - double cDest = vfpUFixedToFpD(Fpscr, mid, false, imm); 
+ double cDest = vfpUFixedToFpD(fpscr.fz, fpscr.dn, mid, false, imm); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -1345,7 +1365,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sh = vfpFpSToFixed(FpOp1, true, true, imm); __asm__ __volatile__("" :: "m" (FpDest.sh)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpSHFixedSIop = InstObjParams("vcvt", "VcvtFpSHFixedS", @@ -1364,7 +1384,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t result = vfpFpDToFixed(cOp1, true, true, imm); __asm__ __volatile__("" :: "m" (result)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = result; FpDestP1.uw = result >> 32; @@ -1384,7 +1404,7 @@ let {{ __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uh = vfpFpSToFixed(FpOp1, false, true, imm); __asm__ __volatile__("" :: "m" (FpDest.uh)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtFpUHFixedSIop = InstObjParams("vcvt", "VcvtFpUHFixedS", @@ -1403,7 +1423,7 @@ let {{ __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); uint64_t mid = vfpFpDToFixed(cOp1, false, true, imm); __asm__ __volatile__("" :: "m" (mid)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = mid; FpDestP1.uw = mid >> 32; @@ -1420,9 +1440,9 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1.sh) : "m" (FpOp1.sh)); - FpDest = vfpSFixedToFpS(Fpscr, FpOp1.sh, true, imm); + FpDest = vfpSFixedToFpS(fpscr.fz, fpscr.dn, FpOp1.sh, true, imm); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtSHFixedFpSIop = InstObjParams("vcvt", "VcvtSHFixedFpS", @@ -1438,9 +1458,9 @@ let {{ uint64_t mid = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (mid) : "m" (mid)); - double cDest = vfpSFixedToFpD(Fpscr, mid, true, imm); + double cDest = vfpSFixedToFpD(fpscr.fz, fpscr.dn, mid, true, imm); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); @@ -1457,9 +1477,9 @@ let {{ FPSCR fpscr = Fpscr; VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (FpOp1.uh) : "m" (FpOp1.uh)); - FpDest = vfpUFixedToFpS(Fpscr, FpOp1.uh, true, imm); + FpDest = vfpUFixedToFpS(fpscr.fz, fpscr.dn, FpOp1.uh, true, imm); __asm__ __volatile__("" :: "m" (FpDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; ''' vcvtUHFixedFpSIop = InstObjParams("vcvt", "VcvtUHFixedFpS", @@ -1475,9 +1495,9 @@ let {{ uint64_t mid = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); VfpSavedState state = prepFpState(fpscr.rMode); __asm__ __volatile__("" : "=m" (mid) : "m" (mid)); - double cDest = vfpUFixedToFpD(Fpscr, mid, true, imm); + double cDest = vfpUFixedToFpD(fpscr.fz, fpscr.dn, mid, true, imm); __asm__ __volatile__("" :: "m" (cDest)); - finishVfp(fpscr, state); + finishVfp(fpscr, state, fpscr.fz); Fpscr = fpscr; FpDestP0.uw = dblLow(cDest); FpDestP1.uw = dblHi(cDest); diff --git a/src/arch/arm/isa/insts/insts.isa b/src/arch/arm/isa/insts/insts.isa index a79557f3d..9c51f3cf0 100644 --- 
a/src/arch/arm/isa/insts/insts.isa +++ b/src/arch/arm/isa/insts/insts.isa @@ -70,5 +70,8 @@ //Divide ##include "div.isa" -//FP (VFP and Neon) +//VFP ##include "fp.isa" + +//Neon +##include "neon.isa" diff --git a/src/arch/arm/isa/insts/macromem.isa b/src/arch/arm/isa/insts/macromem.isa index ca2c7c6ab..652a929f1 100644 --- a/src/arch/arm/isa/insts/macromem.isa +++ b/src/arch/arm/isa/insts/macromem.isa @@ -57,11 +57,34 @@ let {{ microLdrFpUopCode = "Fa.uw = cSwap(Mem.uw, ((CPSR)Cpsr).e);" microLdrFpUopIop = InstObjParams('ldrfp_uop', 'MicroLdrFpUop', - 'MicroMemOp', - {'memacc_code': microLdrFpUopCode, - 'ea_code': 'EA = Rb + (up ? imm : -imm);', - 'predicate_test': predicateTest}, - ['IsMicroop']) + 'MicroMemOp', + {'memacc_code': microLdrFpUopCode, + 'ea_code': + 'EA = Rb + (up ? imm : -imm);', + 'predicate_test': predicateTest}, + ['IsMicroop']) + + microLdrDBFpUopCode = "Fa.uw = cSwap(Mem.uw, ((CPSR)Cpsr).e);" + microLdrDBFpUopIop = InstObjParams('ldrfp_uop', 'MicroLdrDBFpUop', + 'MicroMemOp', + {'memacc_code': microLdrFpUopCode, + 'ea_code': ''' + EA = Rb + (up ? imm : -imm) + + (((CPSR)Cpsr).e ? 4 : 0); + ''', + 'predicate_test': predicateTest}, + ['IsMicroop']) + + microLdrDTFpUopCode = "Fa.uw = cSwap(Mem.uw, ((CPSR)Cpsr).e);" + microLdrDTFpUopIop = InstObjParams('ldrfp_uop', 'MicroLdrDTFpUop', + 'MicroMemOp', + {'memacc_code': microLdrFpUopCode, + 'ea_code': ''' + EA = Rb + (up ? imm : -imm) - + (((CPSR)Cpsr).e ? 4 : 0); + ''', + 'predicate_test': predicateTest}, + ['IsMicroop']) microLdrRetUopCode = ''' CPSR cpsr = Cpsr; @@ -98,10 +121,36 @@ let {{ 'predicate_test': predicateTest}, ['IsMicroop']) + microStrDBFpUopCode = "Mem = cSwap(Fa.uw, ((CPSR)Cpsr).e);" + microStrDBFpUopIop = InstObjParams('strfp_uop', 'MicroStrDBFpUop', + 'MicroMemOp', + {'memacc_code': microStrFpUopCode, + 'postacc_code': "", + 'ea_code': ''' + EA = Rb + (up ? imm : -imm) + + (((CPSR)Cpsr).e ? 4 : 0); + ''', + 'predicate_test': predicateTest}, + ['IsMicroop']) + + microStrDTFpUopCode = "Mem = cSwap(Fa.uw, ((CPSR)Cpsr).e);" + microStrDTFpUopIop = InstObjParams('strfp_uop', 'MicroStrDTFpUop', + 'MicroMemOp', + {'memacc_code': microStrFpUopCode, + 'postacc_code': "", + 'ea_code': ''' + EA = Rb + (up ? imm : -imm) - + (((CPSR)Cpsr).e ? 4 : 0); + ''', + 'predicate_test': predicateTest}, + ['IsMicroop']) + header_output = decoder_output = exec_output = '' - loadIops = (microLdrUopIop, microLdrFpUopIop, microLdrRetUopIop) - storeIops = (microStrUopIop, microStrFpUopIop) + loadIops = (microLdrUopIop, microLdrRetUopIop, + microLdrFpUopIop, microLdrDBFpUopIop, microLdrDTFpUopIop) + storeIops = (microStrUopIop, microStrFpUopIop, + microStrDBFpUopIop, microStrDTFpUopIop) for iop in loadIops + storeIops: header_output += MicroMemDeclare.subst(iop) decoder_output += MicroMemConstructor.subst(iop) @@ -115,6 +164,403 @@ let {{ StoreCompleteAcc.subst(iop) }}; +let {{ + exec_output = header_output = '' + + eaCode = 'EA = Ra + imm;' + + for size in (1, 2, 3, 4, 6, 8, 12, 16): + # Set up the memory access. + regs = (size + 3) // 4 + subst = { "size" : size, "regs" : regs } + memDecl = ''' + union MemUnion { + uint8_t bytes[%(size)d]; + Element elements[%(size)d / sizeof(Element)]; + uint32_t floatRegBits[%(regs)d]; + }; + ''' % subst + + # Do endian conversion for all the elements. 
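# Standalone sanity check (illustrative only, not part of the patch) of the
# size-to-register mapping above: an access of `size` bytes occupies
# regs = (size + 3) // 4 single-precision registers, and when size is not a
# whole number of words the final register keeps only its low-order valid
# bits, via the mask applied in regSetCode further below. The conversion
# code that follows first fixes up element byte order.
def float_regs_and_tail_bits(size):
    regs = (size + 3) // 4
    return regs, 32 - 8 * (regs * 4 - size)

assert float_regs_and_tail_bits(6) == (2, 16)   # e.g. three 16-bit elements
assert float_regs_and_tail_bits(12) == (3, 32)  # whole words: all bits kept
assert float_regs_and_tail_bits(16) == (4, 32)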
+ convCode = ''' + const unsigned eCount = sizeof(memUnion.elements) / + sizeof(memUnion.elements[0]); + if (((CPSR)Cpsr).e) { + for (unsigned i = 0; i < eCount; i++) { + memUnion.elements[i] = gtobe(memUnion.elements[i]); + } + } else { + for (unsigned i = 0; i < eCount; i++) { + memUnion.elements[i] = gtole(memUnion.elements[i]); + } + } + ''' + + # Offload everything into registers + regSetCode = '' + for reg in range(regs): + mask = '' + if reg == regs - 1: + mask = ' & mask(%d)' % (32 - 8 * (regs * 4 - size)) + regSetCode += ''' + FpDestP%(reg)d.uw = gtoh(memUnion.floatRegBits[%(reg)d])%(mask)s; + ''' % { "reg" : reg, "mask" : mask } + + # Pull everything in from registers + regGetCode = '' + for reg in range(regs): + regGetCode += ''' + memUnion.floatRegBits[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + + loadMemAccCode = convCode + regSetCode + storeMemAccCode = regGetCode + convCode + + loadIop = InstObjParams('ldrneon%(size)d_uop' % subst, + 'MicroLdrNeon%(size)dUop' % subst, + 'MicroNeonMemOp', + { 'mem_decl' : memDecl, + 'size' : size, + 'memacc_code' : loadMemAccCode, + 'ea_code' : eaCode, + 'predicate_test' : predicateTest }, + [ 'IsMicroop', 'IsMemRef', 'IsLoad' ]) + storeIop = InstObjParams('strneon%(size)d_uop' % subst, + 'MicroStrNeon%(size)dUop' % subst, + 'MicroNeonMemOp', + { 'mem_decl' : memDecl, + 'size' : size, + 'memacc_code' : storeMemAccCode, + 'ea_code' : eaCode, + 'predicate_test' : predicateTest }, + [ 'IsMicroop', 'IsMemRef', 'IsStore' ]) + + exec_output += NeonLoadExecute.subst(loadIop) + \ + NeonLoadInitiateAcc.subst(loadIop) + \ + NeonLoadCompleteAcc.subst(loadIop) + \ + NeonStoreExecute.subst(storeIop) + \ + NeonStoreInitiateAcc.subst(storeIop) + \ + NeonStoreCompleteAcc.subst(storeIop) + header_output += MicroNeonMemDeclare.subst(loadIop) + \ + MicroNeonMemDeclare.subst(storeIop) +}}; + +let {{ + exec_output = '' + for eSize, type in (1, 'uint8_t'), \ + (2, 'uint16_t'), \ + (4, 'uint32_t'), \ + (8, 'uint64_t'): + size = eSize + # An instruction handles no more than 16 bytes and no more than + # 4 elements, or the number of elements needed to fill 8 or 16 bytes. 
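# Illustrative check (not from the patch) of which access sizes the loop
# below instantiates per element type: every type gets the 8- and 16-byte
# forms, plus each one-to-four element multiple that fits in 16 bytes. The
# union over all four element types is exactly the (1, 2, 3, 4, 6, 8, 12, 16)
# tuple the memory microops above were declared for.
expected = {1: {1, 2, 3, 4, 8, 16},
            2: {2, 4, 6, 8, 16},
            4: {4, 8, 12, 16},
            8: {8, 16}}
for eSize in (1, 2, 4, 8):
    szs = {16, 8}
    for count in (1, 2, 3, 4):
        if count * eSize <= 16:
            szs.add(count * eSize)
    assert szs == expected[eSize]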
+ sizes = set((16, 8)) + for count in 1, 2, 3, 4: + size = count * eSize + if size <= 16: + sizes.add(size) + for size in sizes: + substDict = { + "class_name" : "MicroLdrNeon%dUop" % size, + "targs" : type + } + exec_output += MicroNeonMemExecDeclare.subst(substDict) + substDict["class_name"] = "MicroStrNeon%dUop" % size + exec_output += MicroNeonMemExecDeclare.subst(substDict) + size += eSize +}}; + +//////////////////////////////////////////////////////////////////// +// +// Neon (de)interlacing microops +// + +let {{ + header_output = exec_output = '' + for dRegs in (2, 3, 4): + loadConv = '' + unloadConv = '' + for dReg in range(dRegs): + loadConv += ''' + conv1.cRegs[%(sReg0)d] = htog(FpOp1P%(sReg0)d.uw); + conv1.cRegs[%(sReg1)d] = htog(FpOp1P%(sReg1)d.uw); + ''' % { "sReg0" : (dReg * 2), "sReg1" : (dReg * 2 + 1) } + unloadConv += ''' + FpDestS%(dReg)dP0.uw = gtoh(conv2.cRegs[2 * %(dReg)d + 0]); + FpDestS%(dReg)dP1.uw = gtoh(conv2.cRegs[2 * %(dReg)d + 1]); + ''' % { "dReg" : dReg } + microDeintNeonCode = ''' + const unsigned dRegs = %(dRegs)d; + const unsigned regs = 2 * dRegs; + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + union convStruct { + FloatRegBits cRegs[regs]; + Element elements[dRegs * perDReg]; + } conv1, conv2; + + %(loadConv)s + + unsigned srcElem = 0; + for (unsigned destOffset = 0; + destOffset < perDReg; destOffset++) { + for (unsigned dReg = 0; dReg < dRegs; dReg++) { + conv2.elements[dReg * perDReg + destOffset] = + conv1.elements[srcElem++]; + } + } + + %(unloadConv)s + ''' % { "dRegs" : dRegs, + "loadConv" : loadConv, + "unloadConv" : unloadConv } + microDeintNeonIop = \ + InstObjParams('deintneon%duop' % (dRegs * 2), + 'MicroDeintNeon%dUop' % (dRegs * 2), + 'MicroNeonMixOp', + { 'predicate_test': predicateTest, + 'code' : microDeintNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixDeclare.subst(microDeintNeonIop) + exec_output += MicroNeonMixExecute.subst(microDeintNeonIop) + + loadConv = '' + unloadConv = '' + for dReg in range(dRegs): + loadConv += ''' + conv1.cRegs[2 * %(dReg)d + 0] = htog(FpOp1S%(dReg)dP0.uw); + conv1.cRegs[2 * %(dReg)d + 1] = htog(FpOp1S%(dReg)dP1.uw); + ''' % { "dReg" : dReg } + unloadConv += ''' + FpDestP%(sReg0)d.uw = gtoh(conv2.cRegs[%(sReg0)d]); + FpDestP%(sReg1)d.uw = gtoh(conv2.cRegs[%(sReg1)d]); + ''' % { "sReg0" : (dReg * 2), "sReg1" : (dReg * 2 + 1) } + microInterNeonCode = ''' + const unsigned dRegs = %(dRegs)d; + const unsigned regs = 2 * dRegs; + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + union convStruct { + FloatRegBits cRegs[regs]; + Element elements[dRegs * perDReg]; + } conv1, conv2; + + %(loadConv)s + + unsigned destElem = 0; + for (unsigned srcOffset = 0; + srcOffset < perDReg; srcOffset++) { + for (unsigned dReg = 0; dReg < dRegs; dReg++) { + conv2.elements[destElem++] = + conv1.elements[dReg * perDReg + srcOffset]; + } + } + + %(unloadConv)s + ''' % { "dRegs" : dRegs, + "loadConv" : loadConv, + "unloadConv" : unloadConv } + microInterNeonIop = \ + InstObjParams('interneon%duop' % (dRegs * 2), + 'MicroInterNeon%dUop' % (dRegs * 2), + 'MicroNeonMixOp', + { 'predicate_test': predicateTest, + 'code' : microInterNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixDeclare.subst(microInterNeonIop) + exec_output += MicroNeonMixExecute.subst(microInterNeonIop) +}}; + +let {{ + exec_output = '' + for type in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'): + for dRegs in (2, 3, 4): + Name = "MicroDeintNeon%dUop" % (dRegs * 2) + substDict = { 
"class_name" : Name, "targs" : type } + exec_output += MicroNeonExecDeclare.subst(substDict) + Name = "MicroInterNeon%dUop" % (dRegs * 2) + substDict = { "class_name" : Name, "targs" : type } + exec_output += MicroNeonExecDeclare.subst(substDict) +}}; + +//////////////////////////////////////////////////////////////////// +// +// Neon microops to pack/unpack a single lane +// + +let {{ + header_output = exec_output = '' + for sRegs in 1, 2: + baseLoadRegs = '' + for reg in range(sRegs): + baseLoadRegs += ''' + sourceRegs.fRegs[%(reg0)d] = htog(FpOp1P%(reg0)d.uw); + sourceRegs.fRegs[%(reg1)d] = htog(FpOp1P%(reg1)d.uw); + ''' % { "reg0" : (2 * reg + 0), + "reg1" : (2 * reg + 1) } + for dRegs in range(sRegs, 5): + unloadRegs = '' + loadRegs = baseLoadRegs + for reg in range(dRegs): + loadRegs += ''' + destRegs[%(reg)d].fRegs[0] = htog(FpDestS%(reg)dP0.uw); + destRegs[%(reg)d].fRegs[1] = htog(FpDestS%(reg)dP1.uw); + ''' % { "reg" : reg } + unloadRegs += ''' + FpDestS%(reg)dP0.uw = gtoh(destRegs[%(reg)d].fRegs[0]); + FpDestS%(reg)dP1.uw = gtoh(destRegs[%(reg)d].fRegs[1]); + ''' % { "reg" : reg } + microUnpackNeonCode = ''' + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + + union SourceRegs { + FloatRegBits fRegs[2 * %(sRegs)d]; + Element elements[%(sRegs)d * perDReg]; + } sourceRegs; + + union DestReg { + FloatRegBits fRegs[2]; + Element elements[perDReg]; + } destRegs[%(dRegs)d]; + + %(loadRegs)s + + for (unsigned i = 0; i < %(dRegs)d; i++) { + destRegs[i].elements[lane] = sourceRegs.elements[i]; + } + + %(unloadRegs)s + ''' % { "sRegs" : sRegs, "dRegs" : dRegs, + "loadRegs" : loadRegs, "unloadRegs" : unloadRegs } + + microUnpackNeonIop = \ + InstObjParams('unpackneon%dto%duop' % (sRegs * 2, dRegs * 2), + 'MicroUnpackNeon%dto%dUop' % + (sRegs * 2, dRegs * 2), + 'MicroNeonMixLaneOp', + { 'predicate_test': predicateTest, + 'code' : microUnpackNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixLaneDeclare.subst(microUnpackNeonIop) + exec_output += MicroNeonMixExecute.subst(microUnpackNeonIop) + + for sRegs in 1, 2: + loadRegs = '' + for reg in range(sRegs): + loadRegs += ''' + sourceRegs.fRegs[%(reg0)d] = htog(FpOp1P%(reg0)d.uw); + sourceRegs.fRegs[%(reg1)d] = htog(FpOp1P%(reg1)d.uw); + ''' % { "reg0" : (2 * reg + 0), + "reg1" : (2 * reg + 1) } + for dRegs in range(sRegs, 5): + unloadRegs = '' + for reg in range(dRegs): + unloadRegs += ''' + FpDestS%(reg)dP0.uw = gtoh(destRegs[%(reg)d].fRegs[0]); + FpDestS%(reg)dP1.uw = gtoh(destRegs[%(reg)d].fRegs[1]); + ''' % { "reg" : reg } + microUnpackAllNeonCode = ''' + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + + union SourceRegs { + FloatRegBits fRegs[2 * %(sRegs)d]; + Element elements[%(sRegs)d * perDReg]; + } sourceRegs; + + union DestReg { + FloatRegBits fRegs[2]; + Element elements[perDReg]; + } destRegs[%(dRegs)d]; + + %(loadRegs)s + + for (unsigned i = 0; i < %(dRegs)d; i++) { + for (unsigned j = 0; j < perDReg; j++) + destRegs[i].elements[j] = sourceRegs.elements[i]; + } + + %(unloadRegs)s + ''' % { "sRegs" : sRegs, "dRegs" : dRegs, + "loadRegs" : loadRegs, "unloadRegs" : unloadRegs } + + microUnpackAllNeonIop = \ + InstObjParams('unpackallneon%dto%duop' % (sRegs * 2, dRegs * 2), + 'MicroUnpackAllNeon%dto%dUop' % + (sRegs * 2, dRegs * 2), + 'MicroNeonMixOp', + { 'predicate_test': predicateTest, + 'code' : microUnpackAllNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixDeclare.subst(microUnpackAllNeonIop) + exec_output += MicroNeonMixExecute.subst(microUnpackAllNeonIop) + 
+ for dRegs in 1, 2: + unloadRegs = '' + for reg in range(dRegs): + unloadRegs += ''' + FpDestP%(reg0)d.uw = gtoh(destRegs.fRegs[%(reg0)d]); + FpDestP%(reg1)d.uw = gtoh(destRegs.fRegs[%(reg1)d]); + ''' % { "reg0" : (2 * reg + 0), + "reg1" : (2 * reg + 1) } + for sRegs in range(dRegs, 5): + loadRegs = '' + for reg in range(sRegs): + loadRegs += ''' + sourceRegs[%(reg)d].fRegs[0] = htog(FpOp1S%(reg)dP0.uw); + sourceRegs[%(reg)d].fRegs[1] = htog(FpOp1S%(reg)dP1.uw); + ''' % { "reg" : reg } + microPackNeonCode = ''' + const unsigned perDReg = (2 * sizeof(FloatRegBits)) / + sizeof(Element); + + union SourceReg { + FloatRegBits fRegs[2]; + Element elements[perDReg]; + } sourceRegs[%(sRegs)d]; + + union DestRegs { + FloatRegBits fRegs[2 * %(dRegs)d]; + Element elements[%(dRegs)d * perDReg]; + } destRegs; + + %(loadRegs)s + + for (unsigned i = 0; i < %(sRegs)d; i++) { + destRegs.elements[i] = sourceRegs[i].elements[lane]; + } + + %(unloadRegs)s + ''' % { "sRegs" : sRegs, "dRegs" : dRegs, + "loadRegs" : loadRegs, "unloadRegs" : unloadRegs } + + microPackNeonIop = \ + InstObjParams('packneon%dto%duop' % (sRegs * 2, dRegs * 2), + 'MicroPackNeon%dto%dUop' % + (sRegs * 2, dRegs * 2), + 'MicroNeonMixLaneOp', + { 'predicate_test': predicateTest, + 'code' : microPackNeonCode }, + ['IsMicroop']) + header_output += MicroNeonMixLaneDeclare.subst(microPackNeonIop) + exec_output += MicroNeonMixExecute.subst(microPackNeonIop) +}}; + +let {{ + exec_output = '' + for type in ('uint8_t', 'uint16_t', 'uint32_t'): + for sRegs in 1, 2: + for dRegs in range(sRegs, 5): + for format in ("MicroUnpackNeon%(sRegs)dto%(dRegs)dUop", + "MicroUnpackAllNeon%(sRegs)dto%(dRegs)dUop", + "MicroPackNeon%(dRegs)dto%(sRegs)dUop"): + Name = format % { "sRegs" : sRegs * 2, + "dRegs" : dRegs * 2 } + substDict = { "class_name" : Name, "targs" : type } + exec_output += MicroNeonExecDeclare.subst(substDict) +}}; + //////////////////////////////////////////////////////////////////// // // Integer = Integer op Immediate microops @@ -122,23 +568,32 @@ let {{ let {{ microAddiUopIop = InstObjParams('addi_uop', 'MicroAddiUop', - 'MicroIntOp', + 'MicroIntImmOp', {'code': 'Ra = Rb + imm;', 'predicate_test': predicateTest}, ['IsMicroop']) + microAddUopIop = InstObjParams('add_uop', 'MicroAddUop', + 'MicroIntOp', + {'code': 'Ra = Rb + Rc;', + 'predicate_test': predicateTest}, + ['IsMicroop']) + microSubiUopIop = InstObjParams('subi_uop', 'MicroSubiUop', - 'MicroIntOp', + 'MicroIntImmOp', {'code': 'Ra = Rb - imm;', 'predicate_test': predicateTest}, ['IsMicroop']) - header_output = MicroIntDeclare.subst(microAddiUopIop) + \ - MicroIntDeclare.subst(microSubiUopIop) - decoder_output = MicroIntConstructor.subst(microAddiUopIop) + \ - MicroIntConstructor.subst(microSubiUopIop) + header_output = MicroIntImmDeclare.subst(microAddiUopIop) + \ + MicroIntImmDeclare.subst(microSubiUopIop) + \ + MicroIntDeclare.subst(microAddUopIop) + decoder_output = MicroIntImmConstructor.subst(microAddiUopIop) + \ + MicroIntImmConstructor.subst(microSubiUopIop) + \ + MicroIntConstructor.subst(microAddUopIop) exec_output = PredOpExecute.subst(microAddiUopIop) + \ - PredOpExecute.subst(microSubiUopIop) + PredOpExecute.subst(microSubiUopIop) + \ + PredOpExecute.subst(microAddUopIop) }}; let {{ @@ -146,6 +601,22 @@ let {{ header_output = MacroMemDeclare.subst(iop) decoder_output = MacroMemConstructor.subst(iop) + iop = InstObjParams("vldmult", "VldMult", 'VldMultOp', "", []) + header_output += VMemMultDeclare.subst(iop) + decoder_output += VMemMultConstructor.subst(iop) + + iop = 
InstObjParams("vldsingle", "VldSingle", 'VldSingleOp', "", []) + header_output += VMemSingleDeclare.subst(iop) + decoder_output += VMemSingleConstructor.subst(iop) + + iop = InstObjParams("vstmult", "VstMult", 'VstMultOp', "", []) + header_output += VMemMultDeclare.subst(iop) + decoder_output += VMemMultConstructor.subst(iop) + + iop = InstObjParams("vstsingle", "VstSingle", 'VstSingleOp', "", []) + header_output += VMemSingleDeclare.subst(iop) + decoder_output += VMemSingleConstructor.subst(iop) + vfpIop = InstObjParams("vldmstm", "VLdmStm", 'MacroVFPMemOp', "", []) header_output += MacroVFPMemDeclare.subst(vfpIop) decoder_output += MacroVFPMemConstructor.subst(vfpIop) diff --git a/src/arch/arm/isa/insts/neon.isa b/src/arch/arm/isa/insts/neon.isa new file mode 100644 index 000000000..b629c6fe8 --- /dev/null +++ b/src/arch/arm/isa/insts/neon.isa @@ -0,0 +1,3343 @@ +// -*- mode:c++ -*- + +// Copyright (c) 2010 ARM Limited +// All rights reserved +// +// The license below extends only to copyright in the software and shall +// not be construed as granting a license to any other intellectual +// property including but not limited to intellectual property relating +// to a hardware implementation of the functionality of the software +// licensed hereunder. You may use the software subject to the license +// terms below provided that you ensure that this notice is replicated +// unmodified and in its entirety in all distributions of the software, +// modified or unmodified, in source code or in binary form. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer; +// redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution; +// neither the name of the copyright holders nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Authors: Gabe Black + +output header {{ + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUThreeUReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1, op2); + case 1: + return new Base<uint16_t>(machInst, dest, op1, op2); + case 2: + return new Base<uint32_t>(machInst, dest, op1, op2); + case 3: + return new Base<uint64_t>(machInst, dest, op1, op2); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonSThreeUReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + switch (size) { + case 0: + return new Base<int8_t>(machInst, dest, op1, op2); + case 1: + return new Base<int16_t>(machInst, dest, op1, op2); + case 2: + return new Base<int32_t>(machInst, dest, op1, op2); + case 3: + return new Base<int64_t>(machInst, dest, op1, op2); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUSThreeUReg(bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (notSigned) { + return decodeNeonUThreeUReg<Base>(size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeUReg<Base>(size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUThreeUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1, op2); + case 1: + return new Base<uint16_t>(machInst, dest, op1, op2); + case 2: + return new Base<uint32_t>(machInst, dest, op1, op2); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonSThreeUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + switch (size) { + case 0: + return new Base<int8_t>(machInst, dest, op1, op2); + case 1: + return new Base<int16_t>(machInst, dest, op1, op2); + case 2: + return new Base<int32_t>(machInst, dest, op1, op2); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUSThreeUSReg(bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (notSigned) { + return decodeNeonUThreeUSReg<Base>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeUSReg<Base>( + size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUThreeSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (q) { + return decodeNeonUThreeUSReg<BaseQ>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonUThreeUSReg<BaseD>( + size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSThreeSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (q) { + return decodeNeonSThreeUSReg<BaseQ>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeUSReg<BaseD>( + size, machInst, dest, op1, op2); + } + } + + template 
<template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUSThreeSReg(bool q, bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (notSigned) { + return decodeNeonUThreeSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUThreeReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (q) { + return decodeNeonUThreeUReg<BaseQ>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonUThreeUReg<BaseD>( + size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSThreeReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (q) { + return decodeNeonSThreeUReg<BaseQ>( + size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeUReg<BaseD>( + size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUSThreeReg(bool q, bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, IntRegIndex op2) + { + if (notSigned) { + return decodeNeonUThreeReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, op2); + } else { + return decodeNeonSThreeReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, op2); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUTwoShiftReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (q) { + switch (size) { + case 0: + return new BaseQ<uint8_t>(machInst, dest, op1, imm); + case 1: + return new BaseQ<uint16_t>(machInst, dest, op1, imm); + case 2: + return new BaseQ<uint32_t>(machInst, dest, op1, imm); + case 3: + return new BaseQ<uint64_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 0: + return new BaseD<uint8_t>(machInst, dest, op1, imm); + case 1: + return new BaseD<uint16_t>(machInst, dest, op1, imm); + case 2: + return new BaseD<uint32_t>(machInst, dest, op1, imm); + case 3: + return new BaseD<uint64_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSTwoShiftReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (q) { + switch (size) { + case 0: + return new BaseQ<int8_t>(machInst, dest, op1, imm); + case 1: + return new BaseQ<int16_t>(machInst, dest, op1, imm); + case 2: + return new BaseQ<int32_t>(machInst, dest, op1, imm); + case 3: + return new BaseQ<int64_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } else { + switch (size) { + case 0: + return new BaseD<int8_t>(machInst, dest, op1, imm); + case 1: + return new BaseD<int16_t>(machInst, dest, op1, imm); + case 2: + return new BaseD<int32_t>(machInst, dest, op1, imm); + case 3: + return new BaseD<int64_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } + } + + + template <template <typename 
T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUSTwoShiftReg(bool q, bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (notSigned) { + return decodeNeonUTwoShiftReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, imm); + } else { + return decodeNeonSTwoShiftReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, imm); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUTwoShiftUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1, imm); + case 1: + return new Base<uint16_t>(machInst, dest, op1, imm); + case 2: + return new Base<uint32_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUTwoShiftSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (q) { + return decodeNeonUTwoShiftUSReg<BaseQ>( + size, machInst, dest, op1, imm); + } else { + return decodeNeonUTwoShiftUSReg<BaseD>( + size, machInst, dest, op1, imm); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonSTwoShiftUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + switch (size) { + case 0: + return new Base<int8_t>(machInst, dest, op1, imm); + case 1: + return new Base<int16_t>(machInst, dest, op1, imm); + case 2: + return new Base<int32_t>(machInst, dest, op1, imm); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonSTwoShiftSReg(bool q, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (q) { + return decodeNeonSTwoShiftUSReg<BaseQ>( + size, machInst, dest, op1, imm); + } else { + return decodeNeonSTwoShiftUSReg<BaseD>( + size, machInst, dest, op1, imm); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + decodeNeonUSTwoShiftSReg(bool q, bool notSigned, unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1, uint64_t imm) + { + if (notSigned) { + return decodeNeonUTwoShiftSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, imm); + } else { + return decodeNeonSTwoShiftSReg<BaseD, BaseQ>( + q, size, machInst, dest, op1, imm); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonUTwoMiscUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + switch (size) { + case 0: + return new Base<uint8_t>(machInst, dest, op1); + case 1: + return new Base<uint16_t>(machInst, dest, op1); + case 2: + return new Base<uint32_t>(machInst, dest, op1); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class Base> + StaticInstPtr + decodeNeonSTwoMiscUSReg(unsigned size, + ExtMachInst machInst, IntRegIndex dest, + IntRegIndex op1) + { + switch (size) { + case 0: + return new Base<int8_t>(machInst, dest, op1); + case 1: + return new Base<int16_t>(machInst, dest, op1); + case 2: + return new Base<int32_t>(machInst, dest, op1); + default: + return new Unknown(machInst); + } + } + + template <template <typename T> class BaseD, + template <typename T> class BaseQ> + StaticInstPtr + 
decodeNeonUTwoMiscSReg(bool q, unsigned size,
+                           ExtMachInst machInst, IntRegIndex dest,
+                           IntRegIndex op1)
+    {
+        if (q) {
+            return decodeNeonUTwoMiscUSReg<BaseQ>(size, machInst, dest, op1);
+        } else {
+            return decodeNeonUTwoMiscUSReg<BaseD>(size, machInst, dest, op1);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonSTwoMiscSReg(bool q, unsigned size,
+                           ExtMachInst machInst, IntRegIndex dest,
+                           IntRegIndex op1)
+    {
+        if (q) {
+            return decodeNeonSTwoMiscUSReg<BaseQ>(size, machInst, dest, op1);
+        } else {
+            return decodeNeonSTwoMiscUSReg<BaseD>(size, machInst, dest, op1);
+        }
+    }
+
+    template <template <typename T> class Base>
+    StaticInstPtr
+    decodeNeonUTwoMiscUReg(unsigned size,
+                           ExtMachInst machInst, IntRegIndex dest,
+                           IntRegIndex op1)
+    {
+        switch (size) {
+          case 0:
+            return new Base<uint8_t>(machInst, dest, op1);
+          case 1:
+            return new Base<uint16_t>(machInst, dest, op1);
+          case 2:
+            return new Base<uint32_t>(machInst, dest, op1);
+          case 3:
+            return new Base<uint64_t>(machInst, dest, op1);
+          default:
+            return new Unknown(machInst);
+        }
+    }
+
+    template <template <typename T> class Base>
+    StaticInstPtr
+    decodeNeonSTwoMiscUReg(unsigned size,
+                           ExtMachInst machInst, IntRegIndex dest,
+                           IntRegIndex op1)
+    {
+        switch (size) {
+          case 0:
+            return new Base<int8_t>(machInst, dest, op1);
+          case 1:
+            return new Base<int16_t>(machInst, dest, op1);
+          case 2:
+            return new Base<int32_t>(machInst, dest, op1);
+          case 3:
+            return new Base<int64_t>(machInst, dest, op1);
+          default:
+            return new Unknown(machInst);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonSTwoMiscReg(bool q, unsigned size,
+                          ExtMachInst machInst, IntRegIndex dest,
+                          IntRegIndex op1)
+    {
+        if (q) {
+            return decodeNeonSTwoMiscUReg<BaseQ>(size, machInst, dest, op1);
+        } else {
+            return decodeNeonSTwoMiscUReg<BaseD>(size, machInst, dest, op1);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUTwoMiscReg(bool q, unsigned size,
+                          ExtMachInst machInst, IntRegIndex dest,
+                          IntRegIndex op1)
+    {
+        if (q) {
+            return decodeNeonUTwoMiscUReg<BaseQ>(size, machInst, dest, op1);
+        } else {
+            return decodeNeonUTwoMiscUReg<BaseD>(size, machInst, dest, op1);
+        }
+    }
+
+    template <template <typename T> class BaseD,
+              template <typename T> class BaseQ>
+    StaticInstPtr
+    decodeNeonUSTwoMiscSReg(bool q, bool notSigned, unsigned size,
+                            ExtMachInst machInst, IntRegIndex dest,
+                            IntRegIndex op1)
+    {
+        if (notSigned) {
+            return decodeNeonUTwoMiscSReg<BaseD, BaseQ>(
+                    q, size, machInst, dest, op1);
+        } else {
+            return decodeNeonSTwoMiscSReg<BaseD, BaseQ>(
+                    q, size, machInst, dest, op1);
+        }
+    }
+
+}};
+
+output exec {{
+    static float
+    vcgtFunc(float op1, float op2)
+    {
+        if (isSnan(op1) || isSnan(op2))
+            return 2.0;
+        return (op1 > op2) ? 0.0 : 1.0;
+    }
+
+    static float
+    vcgeFunc(float op1, float op2)
+    {
+        if (isSnan(op1) || isSnan(op2))
+            return 2.0;
+        return (op1 >= op2) ? 0.0 : 1.0;
+    }
+
+    static float
+    vceqFunc(float op1, float op2)
+    {
+        if (isSnan(op1) || isSnan(op2))
+            return 2.0;
+        return (op1 == op2) ? 0.0 : 1.0;
+    }
+
+    static float
+    vcleFunc(float op1, float op2)
+    {
+        if (isSnan(op1) || isSnan(op2))
+            return 2.0;
+        return (op1 <= op2) ? 0.0 : 1.0;
+    }
+
+    static float
+    vcltFunc(float op1, float op2)
+    {
+        if (isSnan(op1) || isSnan(op2))
+            return 2.0;
+        return (op1 < op2) ?
0.0 : 1.0; + } + + static float + vacgtFunc(float op1, float op2) + { + if (isSnan(op1) || isSnan(op2)) + return 2.0; + return (fabsf(op1) > fabsf(op2)) ? 0.0 : 1.0; + } + + static float + vacgeFunc(float op1, float op2) + { + if (isSnan(op1) || isSnan(op2)) + return 2.0; + return (fabsf(op1) >= fabsf(op2)) ? 0.0 : 1.0; + } +}}; + +let {{ + + header_output = "" + exec_output = "" + + smallUnsignedTypes = ("uint8_t", "uint16_t", "uint32_t") + unsignedTypes = smallUnsignedTypes + ("uint64_t",) + smallSignedTypes = ("int8_t", "int16_t", "int32_t") + signedTypes = smallSignedTypes + ("int64_t",) + smallTypes = smallUnsignedTypes + smallSignedTypes + allTypes = unsignedTypes + signedTypes + + def threeEqualRegInst(name, Name, types, rCount, op, + readDest=False, pairwise=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1, srcReg2, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + if pairwise: + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(2 * i < eCount ? + srcReg1.elements[2 * i] : + srcReg2.elements[2 * i - eCount]); + Element srcElem2 = gtoh(2 * i < eCount ? + srcReg1.elements[2 * i + 1] : + srcReg2.elements[2 * i + 1 - eCount]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + else: + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[i]); + Element srcElem2 = gtoh(srcReg2.elements[i]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def threeEqualRegInstFp(name, Name, types, rCount, op, + readDest=False, pairwise=False, toInt=False): + global header_output, exec_output + eWalkCode = ''' + typedef FloatReg FloatVect[rCount]; + FloatVect srcRegs1, srcRegs2; + ''' + if toInt: + eWalkCode += 'RegVect destRegs;\n' + else: + eWalkCode += 'FloatVect destRegs;\n' + for reg in range(rCount): + eWalkCode += ''' + srcRegs1[%(reg)d] = FpOp1P%(reg)d; + srcRegs2[%(reg)d] = FpOp2P%(reg)d; + ''' % { "reg" : reg } + if readDest: + if toInt: + eWalkCode += ''' + destRegs.regs[%(reg)d] = FpDestP%(reg)d.bits; + ''' % { "reg" : reg } + else: + eWalkCode += ''' + destRegs[%(reg)d] = FpDestP%(reg)d; + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destReg = destRegs[r];' + destType = 'FloatReg' + writeDest = 'destRegs[r] = destReg;' + if toInt: + destType = 'FloatRegBits' + writeDest = 'destRegs.regs[r] = destReg;' + if pairwise: + eWalkCode += ''' + for (unsigned r = 0; r < rCount; r++) { + FloatReg srcReg1 = (2 * r < rCount) ? 
+ srcRegs1[2 * r] : srcRegs2[2 * r - rCount]; + FloatReg srcReg2 = (2 * r < rCount) ? + srcRegs1[2 * r + 1] : srcRegs2[2 * r + 1 - rCount]; + %(destType)s destReg; + %(readDest)s + %(op)s + %(writeDest)s + } + ''' % { "op" : op, + "readDest" : readDestCode, + "destType" : destType, + "writeDest" : writeDest } + else: + eWalkCode += ''' + for (unsigned r = 0; r < rCount; r++) { + FloatReg srcReg1 = srcRegs1[r]; + FloatReg srcReg2 = srcRegs2[r]; + %(destType)s destReg; + %(readDest)s + %(op)s + %(writeDest)s + } + ''' % { "op" : op, + "readDest" : readDestCode, + "destType" : destType, + "writeDest" : writeDest } + for reg in range(rCount): + if toInt: + eWalkCode += ''' + FpDestP%(reg)d.uw = destRegs.regs[%(reg)d]; + ''' % { "reg" : reg } + else: + eWalkCode += ''' + FpDestP%(reg)d = destRegs[%(reg)d]; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "FpRegRegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def threeUnequalRegInst(name, Name, types, op, + bigSrc1, bigSrc2, bigDest, readDest): + global header_output, exec_output + src1Cnt = src2Cnt = destCnt = 2 + src1Prefix = src2Prefix = destPrefix = '' + if bigSrc1: + src1Cnt = 4 + src1Prefix = 'Big' + if bigSrc2: + src2Cnt = 4 + src2Prefix = 'Big' + if bigDest: + destCnt = 4 + destPrefix = 'Big' + eWalkCode = ''' + %sRegVect srcReg1; + %sRegVect srcReg2; + %sRegVect destReg; + ''' % (src1Prefix, src2Prefix, destPrefix) + for reg in range(src1Cnt): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + for reg in range(src2Cnt): + eWalkCode += ''' + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(destCnt): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]); + %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[i]); + %(destPrefix)sElement destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode, + "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix, + "destPrefix" : destPrefix } + for reg in range(destCnt): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegOp", + { "code": eWalkCode, + "r_count": 2, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def threeRegNarrowInst(name, Name, types, op, readDest=False): + threeUnequalRegInst(name, Name, types, op, + True, True, False, readDest) + + def threeRegLongInst(name, Name, types, op, readDest=False): + threeUnequalRegInst(name, Name, types, op, + False, False, True, readDest) + + def threeRegWideInst(name, Name, types, op, readDest=False): + threeUnequalRegInst(name, Name, types, op, + True, False, True, readDest) + + def twoEqualRegInst(name, Name, types, rCount, op, 
readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1, srcReg2, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + assert(imm >= 0 && imm < eCount); + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[i]); + Element srcElem2 = gtoh(srcReg2.elements[imm]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegLongInst(name, Name, types, op, readDest=False): + global header_output, exec_output + rCount = 2 + eWalkCode = ''' + RegVect srcReg1, srcReg2; + BigRegVect destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw);; + ''' % { "reg" : reg } + if readDest: + for reg in range(2 * rCount): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + assert(imm >= 0 && imm < eCount); + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[i]); + Element srcElem2 = gtoh(srcReg2.elements[imm]); + BigElement destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(2 * rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegImmOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoEqualRegInstFp(name, Name, types, rCount, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + typedef FloatReg FloatVect[rCount]; + FloatVect srcRegs1, srcRegs2, destRegs; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcRegs1[%(reg)d] = FpOp1P%(reg)d; + srcRegs2[%(reg)d] = FpOp2P%(reg)d; + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destRegs[%(reg)d] = FpDestP%(reg)d; + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destReg = destRegs[i];' + eWalkCode += ''' + assert(imm >= 0 && imm < rCount); + for (unsigned i = 0; i < rCount; i++) { + FloatReg srcReg1 = srcRegs1[i]; + FloatReg srcReg2 = srcRegs2[imm]; + FloatReg destReg; + %(readDest)s + %(op)s + destRegs[i] = destReg; + } + ''' % { "op" : op, "readDest" : readDestCode } 
+ for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d = destRegs[%(reg)d]; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "FpRegRegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegShiftInst(name, Name, types, rCount, op, + readDest=False, toInt=False, fromInt=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcRegs1, destRegs; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcRegs1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destRegs.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destRegs.elements[i]);' + if toInt: + readDestCode = 'destReg = gtoh(destRegs.regs[i]);' + readOpCode = 'Element srcElem1 = gtoh(srcRegs1.elements[i]);' + if fromInt: + readOpCode = 'FloatRegBits srcReg1 = gtoh(srcRegs1.regs[i]);' + declDest = 'Element destElem;' + writeDestCode = 'destRegs.elements[i] = htog(destElem);' + if toInt: + declDest = 'FloatRegBits destReg;' + writeDestCode = 'destRegs.regs[i] = htog(destReg);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + %(readOp)s + %(declDest)s + %(readDest)s + %(op)s + %(writeDest)s + } + ''' % { "readOp" : readOpCode, + "declDest" : declDest, + "readDest" : readDestCode, + "op" : op, + "writeDest" : writeDestCode } + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destRegs.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegNarrowShiftInst(name, Name, types, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + BigRegVect srcReg1; + RegVect destReg; + ''' + for reg in range(4): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(2): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + BigElement srcElem1 = gtoh(srcReg1.elements[i]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(2): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegImmOp", + { "code": eWalkCode, + "r_count": 2, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegImmOpDeclare.subst(iop) + exec_output += NeonUnequalRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + def twoRegLongShiftInst(name, Name, types, op, readDest=False): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1; + BigRegVect destReg; + 
        '''
+        for reg in range(2):
+            eWalkCode += '''
+            srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);
+            ''' % { "reg" : reg }
+        if readDest:
+            for reg in range(4):
+                eWalkCode += '''
+                destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw);
+                ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            Element srcElem1 = gtoh(srcReg1.elements[i]);
+            BigElement destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(4):
+            eWalkCode += '''
+            FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]);
+            ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "RegRegImmOp",
+                            { "code": eWalkCode,
+                              "r_count": 2,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegRegImmOpDeclare.subst(iop)
+        exec_output += NeonUnequalRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    def twoRegMiscInst(name, Name, types, rCount, op, readDest=False):
+        global header_output, exec_output
+        eWalkCode = '''
+        RegVect srcReg1, destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+            srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);
+            ''' % { "reg" : reg }
+            if readDest:
+                eWalkCode += '''
+                destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw);
+                ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            unsigned j = i;
+            Element srcElem1 = gtoh(srcReg1.elements[i]);
+            Element destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[j] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(rCount):
+            eWalkCode += '''
+            FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]);
+            ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "RegRegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegRegOpDeclare.subst(iop)
+        exec_output += NeonEqualRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    def twoRegMiscScInst(name, Name, types, rCount, op, readDest=False):
+        global header_output, exec_output
+        eWalkCode = '''
+        RegVect srcReg1, destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+            srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);
+            ''' % { "reg" : reg }
+            if readDest:
+                eWalkCode += '''
+                destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw);
+                ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            Element srcElem1 = gtoh(srcReg1.elements[imm]);
+            Element destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(rCount):
+            eWalkCode += '''
+            FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]);
+            ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "RegRegImmOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegRegImmOpDeclare.subst(iop)
+        exec_output += NeonEqualRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    def twoRegMiscScramble(name, Name, types, rCount, op, readDest=False):
+        global header_output, exec_output
+        eWalkCode = '''
+        RegVect srcReg1, destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+            srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);
+            destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw);
+            ''' % { "reg" : reg }
+        if readDest:
+            eWalkCode += '''
+            ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += op
+        for reg in range(rCount):
+            eWalkCode += '''
+            FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]);
+            FpOp1P%(reg)d.uw = gtoh(srcReg1.regs[%(reg)d]);
+            ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "RegRegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegRegOpDeclare.subst(iop)
+        exec_output += NeonEqualRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    def twoRegMiscInstFp(name, Name, types, rCount, op,
+                         readDest=False, toInt=False):
+        global header_output, exec_output
+        eWalkCode = '''
+        typedef FloatReg FloatVect[rCount];
+        FloatVect srcRegs1;
+        '''
+        if toInt:
+            eWalkCode += 'RegVect destRegs;\n'
+        else:
+            eWalkCode += 'FloatVect destRegs;\n'
+        for reg in range(rCount):
+            eWalkCode += '''
+            srcRegs1[%(reg)d] = FpOp1P%(reg)d;
+            ''' % { "reg" : reg }
+            if readDest:
+                if toInt:
+                    eWalkCode += '''
+                    destRegs.regs[%(reg)d] = FpDestP%(reg)d.bits;
+                    ''' % { "reg" : reg }
+                else:
+                    eWalkCode += '''
+                    destRegs[%(reg)d] = FpDestP%(reg)d;
+                    ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destReg = destRegs[r];'
+        destType = 'FloatReg'
+        writeDest = 'destRegs[r] = destReg;'
+        if toInt:
+            destType = 'FloatRegBits'
+            writeDest = 'destRegs.regs[r] = destReg;'
+        eWalkCode += '''
+        for (unsigned r = 0; r < rCount; r++) {
+            FloatReg srcReg1 = srcRegs1[r];
+            %(destType)s destReg;
+            %(readDest)s
+            %(op)s
+            %(writeDest)s
+        }
+        ''' % { "op" : op,
+                "readDest" : readDestCode,
+                "destType" : destType,
+                "writeDest" : writeDest }
+        for reg in range(rCount):
+            if toInt:
+                eWalkCode += '''
+                FpDestP%(reg)d.uw = destRegs.regs[%(reg)d];
+                ''' % { "reg" : reg }
+            else:
+                eWalkCode += '''
+                FpDestP%(reg)d = destRegs[%(reg)d];
+                ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "FpRegRegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegRegOpDeclare.subst(iop)
+        exec_output += NeonEqualRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    def twoRegCondenseInst(name, Name, types, rCount, op, readDest=False):
+        global header_output, exec_output
+        eWalkCode = '''
+        RegVect srcRegs;
+        BigRegVect destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+            srcRegs.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);
+            ''' % { "reg" : reg }
+            if readDest:
+                eWalkCode += '''
+                destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw);
+                ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount / 2; i++) {
+            Element srcElem1 = gtoh(srcRegs.elements[2 * i]);
+            Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]);
+            BigElement destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(rCount):
+            eWalkCode += '''
FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "RegRegOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegRegOpDeclare.subst(iop)
+        exec_output += NeonUnequalRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    def twoRegNarrowMiscInst(name, Name, types, op, readDest=False):
+        global header_output, exec_output
+        eWalkCode = '''
+        BigRegVect srcReg1;
+        RegVect destReg;
+        '''
+        for reg in range(4):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);
+        ''' % { "reg" : reg }
+        if readDest:
+            for reg in range(2):
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw);
+        ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            BigElement srcElem1 = gtoh(srcReg1.elements[i]);
+            Element destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(2):
+            eWalkCode += '''
+        FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "RegRegOp",
+                            { "code": eWalkCode,
+                              "r_count": 2,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegRegOpDeclare.subst(iop)
+        exec_output += NeonUnequalRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    def oneRegImmInst(name, Name, types, rCount, op, readDest=False):
+        global header_output, exec_output
+        eWalkCode = '''
+        RegVect destReg;
+        '''
+        if readDest:
+            for reg in range(rCount):
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw);
+        ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            Element destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(rCount):
+            eWalkCode += '''
+        FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "RegImmOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegImmOpDeclare.subst(iop)
+        exec_output += NeonEqualRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    def twoRegLongMiscInst(name, Name, types, op, readDest=False):
+        global header_output, exec_output
+        eWalkCode = '''
+        RegVect srcReg1;
+        BigRegVect destReg;
+        '''
+        for reg in range(2):
+            eWalkCode += '''
+        srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);
+        ''' % { "reg" : reg }
+        if readDest:
+            for reg in range(4):
+                eWalkCode += '''
+        destReg.regs[%(reg)d] = htog(FpDestP%(reg)d.uw);
+        ''' % { "reg" : reg }
+        readDestCode = ''
+        if readDest:
+            readDestCode = 'destElem = gtoh(destReg.elements[i]);'
+        eWalkCode += '''
+        for (unsigned i = 0; i < eCount; i++) {
+            Element srcElem1 = gtoh(srcReg1.elements[i]);
+            BigElement destElem;
+            %(readDest)s
+            %(op)s
+            destReg.elements[i] = htog(destElem);
+        }
+        ''' % { "op" : op, "readDest" : readDestCode }
+        for reg in range(4):
+            eWalkCode += '''
+        FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]);
+        ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "RegRegOp",
+                            { "code": eWalkCode,
+                              "r_count": 2,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegRegOpDeclare.subst(iop)
+        exec_output += NeonUnequalRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    vhaddCode = '''
+    Element carryBit =
+        (((unsigned)srcElem1 & 0x1) +
+         ((unsigned)srcElem2 & 0x1)) >> 1;
+    // Use division instead of a shift to ensure the sign extension works
+    // right. The compiler will figure out if it can be a shift. Mask the
+    // inputs so they get truncated correctly.
+    destElem = (((srcElem1 & ~(Element)1) / 2) +
+                ((srcElem2 & ~(Element)1) / 2)) + carryBit;
+    '''
+    threeEqualRegInst("vhadd", "VhaddD", allTypes, 2, vhaddCode)
+    threeEqualRegInst("vhadd", "VhaddQ", allTypes, 4, vhaddCode)
+
+    vrhaddCode = '''
+    Element carryBit =
+        (((unsigned)srcElem1 & 0x1) +
+         ((unsigned)srcElem2 & 0x1) + 1) >> 1;
+    // Use division instead of a shift to ensure the sign extension works
+    // right. The compiler will figure out if it can be a shift. Mask the
+    // inputs so they get truncated correctly.
+    destElem = (((srcElem1 & ~(Element)1) / 2) +
+                ((srcElem2 & ~(Element)1) / 2)) + carryBit;
+    '''
+    threeEqualRegInst("vrhadd", "VrhaddD", allTypes, 2, vrhaddCode)
+    threeEqualRegInst("vrhadd", "VrhaddQ", allTypes, 4, vrhaddCode)
+
+    vhsubCode = '''
+    Element borrowBit =
+        (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
+    // Use division instead of a shift to ensure the sign extension works
+    // right. The compiler will figure out if it can be a shift. Mask the
+    // inputs so they get truncated correctly.
+    destElem = (((srcElem1 & ~(Element)1) / 2) -
+                ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
+    '''
+    threeEqualRegInst("vhsub", "VhsubD", allTypes, 2, vhsubCode)
+    threeEqualRegInst("vhsub", "VhsubQ", allTypes, 4, vhsubCode)
+
+    vandCode = '''
+    destElem = srcElem1 & srcElem2;
+    '''
+    threeEqualRegInst("vand", "VandD", unsignedTypes, 2, vandCode)
+    threeEqualRegInst("vand", "VandQ", unsignedTypes, 4, vandCode)
+
+    vbicCode = '''
+    destElem = srcElem1 & ~srcElem2;
+    '''
+    threeEqualRegInst("vbic", "VbicD", unsignedTypes, 2, vbicCode)
+    threeEqualRegInst("vbic", "VbicQ", unsignedTypes, 4, vbicCode)
+
+    vorrCode = '''
+    destElem = srcElem1 | srcElem2;
+    '''
+    threeEqualRegInst("vorr", "VorrD", unsignedTypes, 2, vorrCode)
+    threeEqualRegInst("vorr", "VorrQ", unsignedTypes, 4, vorrCode)
+
+    threeEqualRegInst("vmov", "VmovD", unsignedTypes, 2, vorrCode)
+    threeEqualRegInst("vmov", "VmovQ", unsignedTypes, 4, vorrCode)
+
+    vornCode = '''
+    destElem = srcElem1 | ~srcElem2;
+    '''
+    threeEqualRegInst("vorn", "VornD", unsignedTypes, 2, vornCode)
+    threeEqualRegInst("vorn", "VornQ", unsignedTypes, 4, vornCode)
+
+    veorCode = '''
+    destElem = srcElem1 ^ srcElem2;
+    '''
+    threeEqualRegInst("veor", "VeorD", unsignedTypes, 2, veorCode)
+    threeEqualRegInst("veor", "VeorQ", unsignedTypes, 4, veorCode)
+
+    vbifCode = '''
+    destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);
+    '''
+    threeEqualRegInst("vbif", "VbifD", unsignedTypes, 2, vbifCode, True)
+    threeEqualRegInst("vbif", "VbifQ", unsignedTypes, 4, vbifCode, True)
+    vbitCode = '''
+    destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);
+    '''
+    threeEqualRegInst("vbit", "VbitD", unsignedTypes, 2, vbitCode, True)
+    threeEqualRegInst("vbit", "VbitQ", unsignedTypes, 4, vbitCode, True)
+    vbslCode = '''
+    destElem = (srcElem1 &
destElem) | (srcElem2 & ~destElem); + ''' + threeEqualRegInst("vbsl", "VbslD", unsignedTypes, 2, vbslCode, True) + threeEqualRegInst("vbsl", "VbslQ", unsignedTypes, 4, vbslCode, True) + + vmaxCode = ''' + destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2; + ''' + threeEqualRegInst("vmax", "VmaxD", allTypes, 2, vmaxCode) + threeEqualRegInst("vmax", "VmaxQ", allTypes, 4, vmaxCode) + + vminCode = ''' + destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2; + ''' + threeEqualRegInst("vmin", "VminD", allTypes, 2, vminCode) + threeEqualRegInst("vmin", "VminQ", allTypes, 4, vminCode) + + vaddCode = ''' + destElem = srcElem1 + srcElem2; + ''' + threeEqualRegInst("vadd", "NVaddD", unsignedTypes, 2, vaddCode) + threeEqualRegInst("vadd", "NVaddQ", unsignedTypes, 4, vaddCode) + + threeEqualRegInst("vpadd", "NVpaddD", unsignedTypes, + 2, vaddCode, pairwise=True) + threeEqualRegInst("vpadd", "NVpaddQ", unsignedTypes, + 4, vaddCode, pairwise=True) + vaddlwCode = ''' + destElem = (BigElement)srcElem1 + (BigElement)srcElem2; + ''' + threeRegLongInst("vaddl", "Vaddl", smallTypes, vaddlwCode) + threeRegWideInst("vaddw", "Vaddw", smallTypes, vaddlwCode) + vaddhnCode = ''' + destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInst("vaddhn", "Vaddhn", smallTypes, vaddhnCode) + vraddhnCode = ''' + destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 + + ((BigElement)1 << (sizeof(Element) * 8 - 1))) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInst("vraddhn", "Vraddhn", smallTypes, vraddhnCode) + + vsubCode = ''' + destElem = srcElem1 - srcElem2; + ''' + threeEqualRegInst("vsub", "NVsubD", unsignedTypes, 2, vsubCode) + threeEqualRegInst("vsub", "NVsubQ", unsignedTypes, 4, vsubCode) + vsublwCode = ''' + destElem = (BigElement)srcElem1 - (BigElement)srcElem2; + ''' + threeRegLongInst("vsubl", "Vsubl", smallTypes, vsublwCode) + threeRegWideInst("vsubw", "Vsubw", smallTypes, vsublwCode) + + vqaddUCode = ''' + destElem = srcElem1 + srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (destElem < srcElem1 || destElem < srcElem2) { + destElem = (Element)(-1); + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqadd", "VqaddUD", unsignedTypes, 2, vqaddUCode) + threeEqualRegInst("vqadd", "VqaddUQ", unsignedTypes, 4, vqaddUCode) + vsubhnCode = ''' + destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInst("vsubhn", "Vsubhn", smallTypes, vsubhnCode) + vrsubhnCode = ''' + destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 + + ((BigElement)1 << (sizeof(Element) * 8 - 1))) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInst("vrsubhn", "Vrsubhn", smallTypes, vrsubhnCode) + + vqaddSCode = ''' + destElem = srcElem1 + srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + bool negDest = (destElem < 0); + bool negSrc1 = (srcElem1 < 0); + bool negSrc2 = (srcElem2 < 0); + if ((negDest != negSrc1) && (negSrc1 == negSrc2)) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (negDest) + destElem -= 1; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqadd", "VqaddSD", signedTypes, 2, vqaddSCode) + threeEqualRegInst("vqadd", "VqaddSQ", signedTypes, 4, vqaddSCode) + + vqsubUCode = ''' + destElem = srcElem1 - srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (destElem > srcElem1) { + destElem = 0; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqsub", "VqsubUD", unsignedTypes, 2, vqsubUCode) + threeEqualRegInst("vqsub", "VqsubUQ", unsignedTypes, 4, vqsubUCode) + + vqsubSCode = ''' + 
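+    // Signed saturating subtract. Overflow is only possible when the
+    // operands have opposite signs, and it shows up as the result taking
+    // the opposite sign from srcElem1; in that case clamp to the most
+    // positive or most negative Element value and set the cumulative
+    // saturation bit (fpscr.qc), mirroring the vqaddSCode logic above.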
destElem = srcElem1 - srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + bool negDest = (destElem < 0); + bool negSrc1 = (srcElem1 < 0); + bool posSrc2 = (srcElem2 >= 0); + if ((negDest != negSrc1) && (negSrc1 == posSrc2)) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (negDest) + destElem -= 1; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqsub", "VqsubSD", signedTypes, 2, vqsubSCode) + threeEqualRegInst("vqsub", "VqsubSQ", signedTypes, 4, vqsubSCode) + + vcgtCode = ''' + destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0; + ''' + threeEqualRegInst("vcgt", "VcgtD", allTypes, 2, vcgtCode) + threeEqualRegInst("vcgt", "VcgtQ", allTypes, 4, vcgtCode) + + vcgeCode = ''' + destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0; + ''' + threeEqualRegInst("vcge", "VcgeD", allTypes, 2, vcgeCode) + threeEqualRegInst("vcge", "VcgeQ", allTypes, 4, vcgeCode) + + vceqCode = ''' + destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0; + ''' + threeEqualRegInst("vceq", "VceqD", unsignedTypes, 2, vceqCode) + threeEqualRegInst("vceq", "VceqQ", unsignedTypes, 4, vceqCode) + + vshlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + } else { + if (shiftAmt >= sizeof(Element) * 8) { + destElem = 0; + } else { + destElem = srcElem1 << shiftAmt; + } + } + ''' + threeEqualRegInst("vshl", "VshlD", allTypes, 2, vshlCode) + threeEqualRegInst("vshl", "VshlQ", allTypes, 4, vshlCode) + + vrshlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + Element rBit = 0; + if (shiftAmt <= sizeof(Element) * 8) + rBit = bits(srcElem1, shiftAmt - 1); + if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0) + rBit = 1; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + destElem += rBit; + } else if (shiftAmt > 0) { + if (shiftAmt >= sizeof(Element) * 8) { + destElem = 0; + } else { + destElem = srcElem1 << shiftAmt; + } + } else { + destElem = srcElem1; + } + ''' + threeEqualRegInst("vrshl", "VrshlD", allTypes, 2, vrshlCode) + threeEqualRegInst("vrshl", "VrshlQ", allTypes, 4, vrshlCode) + + vqshlUCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. 
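+        // (Right shifts of negative signed values are implementation
+        // defined in C++; if the host compiler produced a logical,
+        // zero-filling shift, the OR below restores the sign bits.)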
+ if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + } else if (shiftAmt > 0) { + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - shiftAmt)) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = srcElem1 << shiftAmt; + } + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqshl", "VqshlUD", unsignedTypes, 2, vqshlUCode) + threeEqualRegInst("vqshl", "VqshlUQ", unsignedTypes, 4, vqshlUCode) + + vqshlSCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + } else if (shiftAmt > 0) { + bool sat = false; + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) + sat = true; + else + destElem = 0; + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - 1 - shiftAmt) != + ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) { + sat = true; + } else { + destElem = srcElem1 << shiftAmt; + } + } + if (sat) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqshl", "VqshlSD", signedTypes, 2, vqshlSCode) + threeEqualRegInst("vqshl", "VqshlSQ", signedTypes, 4, vqshlSCode) + + vqrshlUCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + Element rBit = 0; + if (shiftAmt <= sizeof(Element) * 8) + rBit = bits(srcElem1, shiftAmt - 1); + if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0) + rBit = 1; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + destElem += rBit; + } else { + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - shiftAmt)) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = srcElem1 << shiftAmt; + } + } + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqrshl", "VqrshlUD", unsignedTypes, 2, vqrshlUCode) + threeEqualRegInst("vqrshl", "VqrshlUQ", unsignedTypes, 4, vqrshlUCode) + + vqrshlSCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR)Fpscr; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + Element rBit = 0; + if (shiftAmt <= sizeof(Element) * 8) + rBit = bits(srcElem1, shiftAmt - 1); + if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0) + rBit = 1; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. 
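+        // The rounding bit captured above is added back after the sign
+        // fix-up below, giving the rounded result VQRSHL calls for.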
+ if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + destElem += rBit; + } else if (shiftAmt > 0) { + bool sat = false; + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) + sat = true; + else + destElem = 0; + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - 1 - shiftAmt) != + ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) { + sat = true; + } else { + destElem = srcElem1 << shiftAmt; + } + } + if (sat) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqrshl", "VqrshlSD", signedTypes, 2, vqrshlSCode) + threeEqualRegInst("vqrshl", "VqrshlSQ", signedTypes, 4, vqrshlSCode) + + vabaCode = ''' + destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) : + (srcElem2 - srcElem1); + ''' + threeEqualRegInst("vaba", "VabaD", allTypes, 2, vabaCode, True) + threeEqualRegInst("vaba", "VabaQ", allTypes, 4, vabaCode, True) + vabalCode = ''' + destElem += (srcElem1 > srcElem2) ? + ((BigElement)srcElem1 - (BigElement)srcElem2) : + ((BigElement)srcElem2 - (BigElement)srcElem1); + ''' + threeRegLongInst("vabal", "Vabal", smallTypes, vabalCode, True) + + vabdCode = ''' + destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) : + (srcElem2 - srcElem1); + ''' + threeEqualRegInst("vabd", "VabdD", allTypes, 2, vabdCode) + threeEqualRegInst("vabd", "VabdQ", allTypes, 4, vabdCode) + vabdlCode = ''' + destElem = (srcElem1 > srcElem2) ? + ((BigElement)srcElem1 - (BigElement)srcElem2) : + ((BigElement)srcElem2 - (BigElement)srcElem1); + ''' + threeRegLongInst("vabdl", "Vabdl", smallTypes, vabdlCode) + + vtstCode = ''' + destElem = (srcElem1 & srcElem2) ? 
(Element)(-1) : 0; + ''' + threeEqualRegInst("vtst", "VtstD", unsignedTypes, 2, vtstCode) + threeEqualRegInst("vtst", "VtstQ", unsignedTypes, 4, vtstCode) + + vmulCode = ''' + destElem = srcElem1 * srcElem2; + ''' + threeEqualRegInst("vmul", "NVmulD", allTypes, 2, vmulCode) + threeEqualRegInst("vmul", "NVmulQ", allTypes, 4, vmulCode) + vmullCode = ''' + destElem = (BigElement)srcElem1 * (BigElement)srcElem2; + ''' + threeRegLongInst("vmull", "Vmull", smallTypes, vmullCode) + + vmlaCode = ''' + destElem = destElem + srcElem1 * srcElem2; + ''' + threeEqualRegInst("vmla", "NVmlaD", allTypes, 2, vmlaCode, True) + threeEqualRegInst("vmla", "NVmlaQ", allTypes, 4, vmlaCode, True) + vmlalCode = ''' + destElem = destElem + (BigElement)srcElem1 * (BigElement)srcElem2; + ''' + threeRegLongInst("vmlal", "Vmlal", smallTypes, vmlalCode, True) + + vqdmlalCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2); + Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1); + Element halfNeg = maxNeg / 2; + if ((srcElem1 == maxNeg && srcElem2 == maxNeg) || + (srcElem1 == halfNeg && srcElem2 == maxNeg) || + (srcElem1 == maxNeg && srcElem2 == halfNeg)) { + midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8)); + fpscr.qc = 1; + } + bool negPreDest = (destElem < 0); + destElem += midElem; + bool negDest = (destElem < 0); + bool negMid = (midElem < 0); + if (negPreDest == negMid && negMid != negDest) { + destElem = mask(sizeof(BigElement) * 8 - 1); + if (negPreDest) + destElem = ~destElem; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeRegLongInst("vqdmlal", "Vqdmlal", smallTypes, vqdmlalCode, True) + + vqdmlslCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2); + Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1); + Element halfNeg = maxNeg / 2; + if ((srcElem1 == maxNeg && srcElem2 == maxNeg) || + (srcElem1 == halfNeg && srcElem2 == maxNeg) || + (srcElem1 == maxNeg && srcElem2 == halfNeg)) { + midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8)); + fpscr.qc = 1; + } + bool negPreDest = (destElem < 0); + destElem -= midElem; + bool negDest = (destElem < 0); + bool posMid = (midElem > 0); + if (negPreDest == posMid && posMid != negDest) { + destElem = mask(sizeof(BigElement) * 8 - 1); + if (negPreDest) + destElem = ~destElem; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeRegLongInst("vqdmlsl", "Vqdmlsl", smallTypes, vqdmlslCode, True) + + vqdmullCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2); + if (srcElem1 == srcElem2 && + srcElem1 == (Element)((Element)1 << + (Element)(sizeof(Element) * 8 - 1))) { + destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8)); + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeRegLongInst("vqdmull", "Vqdmull", smallTypes, vqdmullCode) + + vmlsCode = ''' + destElem = destElem - srcElem1 * srcElem2; + ''' + threeEqualRegInst("vmls", "NVmlsD", allTypes, 2, vmlsCode, True) + threeEqualRegInst("vmls", "NVmlsQ", allTypes, 4, vmlsCode, True) + vmlslCode = ''' + destElem = destElem - (BigElement)srcElem1 * (BigElement)srcElem2; + ''' + threeRegLongInst("vmlsl", "Vmlsl", smallTypes, vmlslCode, True) + + vmulpCode = ''' + destElem = 0; + for (unsigned j = 0; j < sizeof(Element) * 8; j++) { + if (bits(srcElem2, j)) + destElem ^= srcElem1 << j; + } + ''' + threeEqualRegInst("vmul", "NVmulpD", unsignedTypes, 2, vmulpCode) + threeEqualRegInst("vmul", "NVmulpQ", unsignedTypes, 4, vmulpCode) + vmullpCode = ''' + 
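+    // Widening polynomial (carry-less, GF(2)) multiply as used by
+    // VMULL.P8: partial products are combined with XOR rather than
+    // addition, so each set bit j of srcElem2 contributes srcElem1
+    // shifted left by j, widened into BigElement.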
destElem = 0; + for (unsigned j = 0; j < sizeof(Element) * 8; j++) { + if (bits(srcElem2, j)) + destElem ^= (BigElement)srcElem1 << j; + } + ''' + threeRegLongInst("vmull", "Vmullp", smallUnsignedTypes, vmullpCode) + + threeEqualRegInst("vpmax", "VpmaxD", allTypes, 2, vmaxCode, pairwise=True) + threeEqualRegInst("vpmax", "VpmaxQ", allTypes, 4, vmaxCode, pairwise=True) + + threeEqualRegInst("vpmin", "VpminD", allTypes, 2, vminCode, pairwise=True) + threeEqualRegInst("vpmin", "VpminQ", allTypes, 4, vminCode, pairwise=True) + + vqdmulhCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >> + (sizeof(Element) * 8); + if (srcElem1 == srcElem2 && + srcElem1 == (Element)((Element)1 << + (sizeof(Element) * 8 - 1))) { + destElem = ~srcElem1; + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqdmulh", "VqdmulhD", smallSignedTypes, 2, vqdmulhCode) + threeEqualRegInst("vqdmulh", "VqdmulhQ", smallSignedTypes, 4, vqdmulhCode) + + vqrdmulhCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 + + ((int64_t)1 << (sizeof(Element) * 8 - 1))) >> + (sizeof(Element) * 8); + Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1); + Element halfNeg = maxNeg / 2; + if ((srcElem1 == maxNeg && srcElem2 == maxNeg) || + (srcElem1 == halfNeg && srcElem2 == maxNeg) || + (srcElem1 == maxNeg && srcElem2 == halfNeg)) { + if (destElem < 0) { + destElem = mask(sizeof(Element) * 8 - 1); + } else { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + } + fpscr.qc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInst("vqrdmulh", "VqrdmulhD", + smallSignedTypes, 2, vqrdmulhCode) + threeEqualRegInst("vqrdmulh", "VqrdmulhQ", + smallSignedTypes, 4, vqrdmulhCode) + + vmaxfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + bool done; + destReg = processNans(fpscr, done, true, srcReg1, srcReg2); + if (!done) { + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMaxS, + true, true, VfpRoundNearest); + } else if (flushToZero(srcReg1, srcReg2)) { + fpscr.idc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmax", "VmaxDFp", ("float",), 2, vmaxfpCode) + threeEqualRegInstFp("vmax", "VmaxQFp", ("float",), 4, vmaxfpCode) + + vminfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + bool done; + destReg = processNans(fpscr, done, true, srcReg1, srcReg2); + if (!done) { + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMinS, + true, true, VfpRoundNearest); + } else if (flushToZero(srcReg1, srcReg2)) { + fpscr.idc = 1; + } + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmin", "VminDFp", ("float",), 2, vminfpCode) + threeEqualRegInstFp("vmin", "VminQFp", ("float",), 4, vminfpCode) + + threeEqualRegInstFp("vpmax", "VpmaxDFp", ("float",), + 2, vmaxfpCode, pairwise=True) + threeEqualRegInstFp("vpmax", "VpmaxQFp", ("float",), + 4, vmaxfpCode, pairwise=True) + + threeEqualRegInstFp("vpmin", "VpminDFp", ("float",), + 2, vminfpCode, pairwise=True) + threeEqualRegInstFp("vpmin", "VpminQFp", ("float",), + 4, vminfpCode, pairwise=True) + + vaddfpCode = ''' + FPSCR fpscr = Fpscr; + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpAddS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vadd", "VaddDFp", ("float",), 2, vaddfpCode) + threeEqualRegInstFp("vadd", "VaddQFp", ("float",), 4, vaddfpCode) + + threeEqualRegInstFp("vpadd", "VpaddDFp", ("float",), + 2, vaddfpCode, pairwise=True) + threeEqualRegInstFp("vpadd", "VpaddQFp", ("float",), + 4, vaddfpCode, pairwise=True) + + vsubfpCode = ''' + FPSCR fpscr = Fpscr; + destReg = binaryOp(fpscr, 
srcReg1, srcReg2, fpSubS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vsub", "VsubDFp", ("float",), 2, vsubfpCode) + threeEqualRegInstFp("vsub", "VsubQFp", ("float",), 4, vsubfpCode) + + vmulfpCode = ''' + FPSCR fpscr = Fpscr; + destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMulS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmul", "NVmulDFp", ("float",), 2, vmulfpCode) + threeEqualRegInstFp("vmul", "NVmulQFp", ("float",), 4, vmulfpCode) + + vmlafpCode = ''' + FPSCR fpscr = Fpscr; + float mid = binaryOp(fpscr, srcReg1, srcReg2, fpMulS, + true, true, VfpRoundNearest); + destReg = binaryOp(fpscr, mid, destReg, fpAddS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmla", "NVmlaDFp", ("float",), 2, vmlafpCode, True) + threeEqualRegInstFp("vmla", "NVmlaQFp", ("float",), 4, vmlafpCode, True) + + vmlsfpCode = ''' + FPSCR fpscr = Fpscr; + float mid = binaryOp(fpscr, srcReg1, srcReg2, fpMulS, + true, true, VfpRoundNearest); + destReg = binaryOp(fpscr, destReg, mid, fpSubS, + true, true, VfpRoundNearest); + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vmls", "NVmlsDFp", ("float",), 2, vmlsfpCode, True) + threeEqualRegInstFp("vmls", "NVmlsQFp", ("float",), 4, vmlsfpCode, True) + + vcgtfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vcgtFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vcgt", "VcgtDFp", ("float",), + 2, vcgtfpCode, toInt = True) + threeEqualRegInstFp("vcgt", "VcgtQFp", ("float",), + 4, vcgtfpCode, toInt = True) + + vcgefpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vcgeFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vcge", "VcgeDFp", ("float",), + 2, vcgefpCode, toInt = True) + threeEqualRegInstFp("vcge", "VcgeQFp", ("float",), + 4, vcgefpCode, toInt = True) + + vacgtfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vacgtFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vacgt", "VacgtDFp", ("float",), + 2, vacgtfpCode, toInt = True) + threeEqualRegInstFp("vacgt", "VacgtQFp", ("float",), + 4, vacgtfpCode, toInt = True) + + vacgefpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vacgeFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + threeEqualRegInstFp("vacge", "VacgeDFp", ("float",), + 2, vacgefpCode, toInt = True) + threeEqualRegInstFp("vacge", "VacgeQFp", ("float",), + 4, vacgefpCode, toInt = True) + + vceqfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, srcReg2, vceqFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? 
-1 : 0;
+    if (res == 2.0)
+        fpscr.ioc = 1;
+    Fpscr = fpscr;
+    '''
+    threeEqualRegInstFp("vceq", "VceqDFp", ("float",),
+                        2, vceqfpCode, toInt = True)
+    threeEqualRegInstFp("vceq", "VceqQFp", ("float",),
+                        4, vceqfpCode, toInt = True)
+
+    vrecpsCode = '''
+    FPSCR fpscr = Fpscr;
+    destReg = binaryOp(fpscr, srcReg1, srcReg2, fpRecpsS,
+                       true, true, VfpRoundNearest);
+    Fpscr = fpscr;
+    '''
+    threeEqualRegInstFp("vrecps", "VrecpsDFp", ("float",), 2, vrecpsCode)
+    threeEqualRegInstFp("vrecps", "VrecpsQFp", ("float",), 4, vrecpsCode)
+
+    vrsqrtsCode = '''
+    FPSCR fpscr = Fpscr;
+    destReg = binaryOp(fpscr, srcReg1, srcReg2, fpRSqrtsS,
+                       true, true, VfpRoundNearest);
+    Fpscr = fpscr;
+    '''
+    threeEqualRegInstFp("vrsqrts", "VrsqrtsDFp", ("float",), 2, vrsqrtsCode)
+    threeEqualRegInstFp("vrsqrts", "VrsqrtsQFp", ("float",), 4, vrsqrtsCode)
+
+    vabdfpCode = '''
+    FPSCR fpscr = Fpscr;
+    float mid = binaryOp(fpscr, srcReg1, srcReg2, fpSubS,
+                         true, true, VfpRoundNearest);
+    destReg = fabs(mid);
+    Fpscr = fpscr;
+    '''
+    threeEqualRegInstFp("vabd", "VabdDFp", ("float",), 2, vabdfpCode)
+    threeEqualRegInstFp("vabd", "VabdQFp", ("float",), 4, vabdfpCode)
+
+    twoEqualRegInst("vmla", "VmlasD", unsignedTypes, 2, vmlaCode, True)
+    twoEqualRegInst("vmla", "VmlasQ", unsignedTypes, 4, vmlaCode, True)
+    twoEqualRegInstFp("vmla", "VmlasDFp", ("float",), 2, vmlafpCode, True)
+    twoEqualRegInstFp("vmla", "VmlasQFp", ("float",), 4, vmlafpCode, True)
+    twoRegLongInst("vmlal", "Vmlals", smallTypes, vmlalCode, True)
+
+    twoEqualRegInst("vmls", "VmlssD", allTypes, 2, vmlsCode, True)
+    twoEqualRegInst("vmls", "VmlssQ", allTypes, 4, vmlsCode, True)
+    twoEqualRegInstFp("vmls", "VmlssDFp", ("float",), 2, vmlsfpCode, True)
+    twoEqualRegInstFp("vmls", "VmlssQFp", ("float",), 4, vmlsfpCode, True)
+    twoRegLongInst("vmlsl", "Vmlsls", smallTypes, vmlslCode, True)
+
+    twoEqualRegInst("vmul", "VmulsD", allTypes, 2, vmulCode)
+    twoEqualRegInst("vmul", "VmulsQ", allTypes, 4, vmulCode)
+    twoEqualRegInstFp("vmul", "VmulsDFp", ("float",), 2, vmulfpCode)
+    twoEqualRegInstFp("vmul", "VmulsQFp", ("float",), 4, vmulfpCode)
+    twoRegLongInst("vmull", "Vmulls", smallTypes, vmullCode)
+
+    twoRegLongInst("vqdmull", "Vqdmulls", smallTypes, vqdmullCode)
+    twoRegLongInst("vqdmlal", "Vqdmlals", smallTypes, vqdmlalCode, True)
+    twoRegLongInst("vqdmlsl", "Vqdmlsls", smallTypes, vqdmlslCode, True)
+    twoEqualRegInst("vqdmulh", "VqdmulhsD", smallSignedTypes, 2, vqdmulhCode)
+    twoEqualRegInst("vqdmulh", "VqdmulhsQ", smallSignedTypes, 4, vqdmulhCode)
+    twoEqualRegInst("vqrdmulh", "VqrdmulhsD",
+                    smallSignedTypes, 2, vqrdmulhCode)
+    twoEqualRegInst("vqrdmulh", "VqrdmulhsQ",
+                    smallSignedTypes, 4, vqrdmulhCode)
+
+    vshrCode = '''
+    if (imm >= sizeof(srcElem1) * 8) {
+        if (srcElem1 < 0)
+            destElem = -1;
+        else
+            destElem = 0;
+    } else {
+        destElem = srcElem1 >> imm;
+    }
+    '''
+    twoRegShiftInst("vshr", "NVshrD", allTypes, 2, vshrCode)
+    twoRegShiftInst("vshr", "NVshrQ", allTypes, 4, vshrCode)
+
+    vsraCode = '''
+    Element mid;
+    if (imm >= sizeof(srcElem1) * 8) {
+        mid = (srcElem1 < 0) ?
-1 : 0; + } else { + mid = srcElem1 >> imm; + if (srcElem1 < 0 && mid >= 0) { + mid |= -(mid & ((Element)1 << + (sizeof(Element) * 8 - 1 - imm))); + } + } + destElem += mid; + ''' + twoRegShiftInst("vsra", "NVsraD", allTypes, 2, vsraCode, True) + twoRegShiftInst("vsra", "NVsraQ", allTypes, 4, vsraCode, True) + + vrshrCode = ''' + if (imm > sizeof(srcElem1) * 8) { + destElem = 0; + } else if (imm) { + Element rBit = bits(srcElem1, imm - 1); + destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit; + } else { + destElem = srcElem1; + } + ''' + twoRegShiftInst("vrshr", "NVrshrD", allTypes, 2, vrshrCode) + twoRegShiftInst("vrshr", "NVrshrQ", allTypes, 4, vrshrCode) + + vrsraCode = ''' + if (imm > sizeof(srcElem1) * 8) { + destElem += 0; + } else if (imm) { + Element rBit = bits(srcElem1, imm - 1); + destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit; + } else { + destElem += srcElem1; + } + ''' + twoRegShiftInst("vrsra", "NVrsraD", allTypes, 2, vrsraCode, True) + twoRegShiftInst("vrsra", "NVrsraQ", allTypes, 4, vrsraCode, True) + + vsriCode = ''' + if (imm >= sizeof(Element) * 8) + destElem = destElem; + else + destElem = (srcElem1 >> imm) | + (destElem & ~mask(sizeof(Element) * 8 - imm)); + ''' + twoRegShiftInst("vsri", "NVsriD", unsignedTypes, 2, vsriCode, True) + twoRegShiftInst("vsri", "NVsriQ", unsignedTypes, 4, vsriCode, True) + + vshlCode = ''' + if (imm >= sizeof(Element) * 8) + destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1; + else + destElem = srcElem1 << imm; + ''' + twoRegShiftInst("vshl", "NVshlD", unsignedTypes, 2, vshlCode) + twoRegShiftInst("vshl", "NVshlQ", unsignedTypes, 4, vshlCode) + + vsliCode = ''' + if (imm >= sizeof(Element) * 8) + destElem = destElem; + else + destElem = (srcElem1 << imm) | (destElem & mask(imm)); + ''' + twoRegShiftInst("vsli", "NVsliD", unsignedTypes, 2, vsliCode, True) + twoRegShiftInst("vsli", "NVsliQ", unsignedTypes, 4, vsliCode, True) + + vqshlCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (srcElem1 > 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = 0; + } + } else if (imm) { + destElem = (srcElem1 << imm); + uint64_t topBits = bits((uint64_t)srcElem1, + sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - 1 - imm); + if (topBits != 0 && topBits != mask(imm + 1)) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (srcElem1 > 0) + destElem = ~destElem; + fpscr.qc = 1; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegShiftInst("vqshl", "NVqshlD", signedTypes, 2, vqshlCode) + twoRegShiftInst("vqshl", "NVqshlQ", signedTypes, 4, vqshlCode) + + vqshluCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else if (imm) { + destElem = (srcElem1 << imm); + uint64_t topBits = bits((uint64_t)srcElem1, + sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - imm); + if (topBits != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegShiftInst("vqshlu", "NVqshluD", unsignedTypes, 2, vqshluCode) + twoRegShiftInst("vqshlu", "NVqshluQ", unsignedTypes, 4, vqshluCode) + + vqshlusCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm >= sizeof(Element) * 8) { + if (srcElem1 < 0) { + destElem = 0; + fpscr.qc = 1; + } else if (srcElem1 > 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } 
else { + destElem = 0; + } + } else if (imm) { + destElem = (srcElem1 << imm); + uint64_t topBits = bits((uint64_t)srcElem1, + sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - imm); + if (srcElem1 < 0) { + destElem = 0; + fpscr.qc = 1; + } else if (topBits != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } + } else { + if (srcElem1 < 0) { + fpscr.qc = 1; + destElem = 0; + } else { + destElem = srcElem1; + } + } + Fpscr = fpscr; + ''' + twoRegShiftInst("vqshlus", "NVqshlusD", signedTypes, 2, vqshlusCode) + twoRegShiftInst("vqshlus", "NVqshlusQ", signedTypes, 4, vqshlusCode) + + vshrnCode = ''' + if (imm >= sizeof(srcElem1) * 8) { + destElem = 0; + } else { + destElem = srcElem1 >> imm; + } + ''' + twoRegNarrowShiftInst("vshrn", "NVshrn", smallUnsignedTypes, vshrnCode) + + vrshrnCode = ''' + if (imm > sizeof(srcElem1) * 8) { + destElem = 0; + } else if (imm) { + Element rBit = bits(srcElem1, imm - 1); + destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit; + } else { + destElem = srcElem1; + } + ''' + twoRegNarrowShiftInst("vrshrn", "NVrshrn", smallUnsignedTypes, vrshrnCode) + + vqshrnCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0 && srcElem1 != -1) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = ((srcElem1 >> (imm - 1)) >> 1); + mid |= -(mid & ((BigElement)1 << + (sizeof(BigElement) * 8 - 1 - imm))); + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqshrn", "NVqshrn", smallSignedTypes, vqshrnCode) + + vqshrunCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = ((srcElem1 >> (imm - 1)) >> 1); + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqshrun", "NVqshrun", + smallUnsignedTypes, vqshrunCode) + + vqshrunsCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = ((srcElem1 >> (imm - 1)) >> 1); + if (bits(mid, sizeof(BigElement) * 8 - 1, + sizeof(Element) * 8) != 0) { + if (srcElem1 < 0) { + destElem = 0; + } else { + destElem = mask(sizeof(Element) * 8); + } + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqshrun", "NVqshruns", + smallSignedTypes, vqshrunsCode) + + vqrshrnCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0 && srcElem1 != -1) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = (srcElem1 >> (imm - 1)); + uint64_t rBit = mid & 0x1; + mid >>= 1; + mid |= -(mid & ((BigElement)1 << + (sizeof(BigElement) * 8 - 1 - imm))); + mid += rBit; + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + if (srcElem1 != (Element)srcElem1) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = srcElem1; + } + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqrshrn", "NVqrshrn", + 
smallSignedTypes, vqrshrnCode) + + vqrshrunCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = (srcElem1 >> (imm - 1)); + uint64_t rBit = mid & 0x1; + mid >>= 1; + mid += rBit; + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + if (srcElem1 != (Element)srcElem1) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = srcElem1; + } + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqrshrun", "NVqrshrun", + smallUnsignedTypes, vqrshrunCode) + + vqrshrunsCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = (srcElem1 >> (imm - 1)); + uint64_t rBit = mid & 0x1; + mid >>= 1; + mid |= -(mid & ((BigElement)1 << + (sizeof(BigElement) * 8 - 1 - imm))); + mid += rBit; + if (bits(mid, sizeof(BigElement) * 8 - 1, + sizeof(Element) * 8) != 0) { + if (srcElem1 < 0) { + destElem = 0; + } else { + destElem = mask(sizeof(Element) * 8); + } + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + if (srcElem1 < 0) { + fpscr.qc = 1; + destElem = 0; + } else { + destElem = srcElem1; + } + } + Fpscr = fpscr; + ''' + twoRegNarrowShiftInst("vqrshrun", "NVqrshruns", + smallSignedTypes, vqrshrunsCode) + + vshllCode = ''' + if (imm >= sizeof(destElem) * 8) { + destElem = 0; + } else { + destElem = (BigElement)srcElem1 << imm; + } + ''' + twoRegLongShiftInst("vshll", "NVshll", smallTypes, vshllCode) + + vmovlCode = ''' + destElem = srcElem1; + ''' + twoRegLongShiftInst("vmovl", "NVmovl", smallTypes, vmovlCode) + + vcvt2ufxCode = ''' + FPSCR fpscr = Fpscr; + if (flushToZero(srcElem1)) + fpscr.idc = 1; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1)); + destReg = vfpFpSToFixed(srcElem1, false, false, imm); + __asm__ __volatile__("" :: "m" (destReg)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegShiftInst("vcvt", "NVcvt2ufxD", ("float",), + 2, vcvt2ufxCode, toInt = True) + twoRegShiftInst("vcvt", "NVcvt2ufxQ", ("float",), + 4, vcvt2ufxCode, toInt = True) + + vcvt2sfxCode = ''' + FPSCR fpscr = Fpscr; + if (flushToZero(srcElem1)) + fpscr.idc = 1; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1)); + destReg = vfpFpSToFixed(srcElem1, true, false, imm); + __asm__ __volatile__("" :: "m" (destReg)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegShiftInst("vcvt", "NVcvt2sfxD", ("float",), + 2, vcvt2sfxCode, toInt = True) + twoRegShiftInst("vcvt", "NVcvt2sfxQ", ("float",), + 4, vcvt2sfxCode, toInt = True) + + vcvtu2fpCode = ''' + FPSCR fpscr = Fpscr; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcReg1) : "m" (srcReg1)); + destElem = vfpUFixedToFpS(true, true, srcReg1, false, imm); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegShiftInst("vcvt", "NVcvtu2fpD", ("float",), + 2, vcvtu2fpCode, fromInt = True) + twoRegShiftInst("vcvt", "NVcvtu2fpQ", ("float",), + 4, vcvtu2fpCode, fromInt = True) + + vcvts2fpCode = ''' + FPSCR fpscr = Fpscr; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcReg1) : "m" (srcReg1)); + destElem = 
vfpSFixedToFpS(true, true, srcReg1, false, imm); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegShiftInst("vcvt", "NVcvts2fpD", ("float",), + 2, vcvts2fpCode, fromInt = True) + twoRegShiftInst("vcvt", "NVcvts2fpQ", ("float",), + 4, vcvts2fpCode, fromInt = True) + + vcvts2hCode = ''' + FPSCR fpscr = Fpscr; + float srcFp1 = bitsToFp(srcElem1, (float)0.0); + if (flushToZero(srcFp1)) + fpscr.idc = 1; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcFp1), "=m" (destElem) + : "m" (srcFp1), "m" (destElem)); + destElem = vcvtFpSFpH(fpscr, true, true, VfpRoundNearest, + fpscr.ahp, srcFp1); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegNarrowMiscInst("vcvt", "NVcvts2h", ("uint16_t",), vcvts2hCode) + + vcvth2sCode = ''' + FPSCR fpscr = Fpscr; + VfpSavedState state = prepFpState(VfpRoundNearest); + __asm__ __volatile__("" : "=m" (srcElem1), "=m" (destElem) + : "m" (srcElem1), "m" (destElem)); + destElem = fpToBits(vcvtFpHFpS(fpscr, true, fpscr.ahp, srcElem1)); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + Fpscr = fpscr; + ''' + twoRegLongMiscInst("vcvt", "NVcvth2s", ("uint16_t",), vcvth2sCode) + + vrsqrteCode = ''' + destElem = unsignedRSqrtEstimate(srcElem1); + ''' + twoRegMiscInst("vrsqrte", "NVrsqrteD", ("uint32_t",), 2, vrsqrteCode) + twoRegMiscInst("vrsqrte", "NVrsqrteQ", ("uint32_t",), 4, vrsqrteCode) + + vrsqrtefpCode = ''' + FPSCR fpscr = Fpscr; + if (flushToZero(srcReg1)) + fpscr.idc = 1; + destReg = fprSqrtEstimate(fpscr, srcReg1); + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vrsqrte", "NVrsqrteDFp", ("float",), 2, vrsqrtefpCode) + twoRegMiscInstFp("vrsqrte", "NVrsqrteQFp", ("float",), 4, vrsqrtefpCode) + + vrecpeCode = ''' + destElem = unsignedRecipEstimate(srcElem1); + ''' + twoRegMiscInst("vrecpe", "NVrecpeD", ("uint32_t",), 2, vrecpeCode) + twoRegMiscInst("vrecpe", "NVrecpeQ", ("uint32_t",), 4, vrecpeCode) + + vrecpefpCode = ''' + FPSCR fpscr = Fpscr; + if (flushToZero(srcReg1)) + fpscr.idc = 1; + destReg = fpRecipEstimate(fpscr, srcReg1); + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vrecpe", "NVrecpeDFp", ("float",), 2, vrecpefpCode) + twoRegMiscInstFp("vrecpe", "NVrecpeQFp", ("float",), 4, vrecpefpCode) + + vrev16Code = ''' + destElem = srcElem1; + unsigned groupSize = ((1 << 1) / sizeof(Element)); + unsigned reverseMask = (groupSize - 1); + j = i ^ reverseMask; + ''' + twoRegMiscInst("vrev16", "NVrev16D", ("uint8_t",), 2, vrev16Code) + twoRegMiscInst("vrev16", "NVrev16Q", ("uint8_t",), 4, vrev16Code) + vrev32Code = ''' + destElem = srcElem1; + unsigned groupSize = ((1 << 2) / sizeof(Element)); + unsigned reverseMask = (groupSize - 1); + j = i ^ reverseMask; + ''' + twoRegMiscInst("vrev32", "NVrev32D", + ("uint8_t", "uint16_t"), 2, vrev32Code) + twoRegMiscInst("vrev32", "NVrev32Q", + ("uint8_t", "uint16_t"), 4, vrev32Code) + vrev64Code = ''' + destElem = srcElem1; + unsigned groupSize = ((1 << 3) / sizeof(Element)); + unsigned reverseMask = (groupSize - 1); + j = i ^ reverseMask; + ''' + twoRegMiscInst("vrev64", "NVrev64D", smallUnsignedTypes, 2, vrev64Code) + twoRegMiscInst("vrev64", "NVrev64Q", smallUnsignedTypes, 4, vrev64Code) + + vpaddlCode = ''' + destElem = (BigElement)srcElem1 + (BigElement)srcElem2; + ''' + twoRegCondenseInst("vpaddl", "NVpaddlD", smallTypes, 2, vpaddlCode) + twoRegCondenseInst("vpaddl", "NVpaddlQ", smallTypes, 4, vpaddlCode) + + vpadalCode = ''' + destElem += 
(BigElement)srcElem1 + (BigElement)srcElem2; + ''' + twoRegCondenseInst("vpadal", "NVpadalD", smallTypes, 2, vpadalCode, True) + twoRegCondenseInst("vpadal", "NVpadalQ", smallTypes, 4, vpadalCode, True) + + vclsCode = ''' + unsigned count = 0; + if (srcElem1 < 0) { + srcElem1 <<= 1; + while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) { + count++; + srcElem1 <<= 1; + } + } else { + srcElem1 <<= 1; + while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) { + count++; + srcElem1 <<= 1; + } + } + destElem = count; + ''' + twoRegMiscInst("vcls", "NVclsD", signedTypes, 2, vclsCode) + twoRegMiscInst("vcls", "NVclsQ", signedTypes, 4, vclsCode) + + vclzCode = ''' + unsigned count = 0; + while (srcElem1 >= 0 && count < sizeof(Element) * 8) { + count++; + srcElem1 <<= 1; + } + destElem = count; + ''' + twoRegMiscInst("vclz", "NVclzD", signedTypes, 2, vclzCode) + twoRegMiscInst("vclz", "NVclzQ", signedTypes, 4, vclzCode) + + vcntCode = ''' + unsigned count = 0; + while (srcElem1 && count < sizeof(Element) * 8) { + count += srcElem1 & 0x1; + srcElem1 >>= 1; + } + destElem = count; + ''' + twoRegMiscInst("vcnt", "NVcntD", unsignedTypes, 2, vcntCode) + twoRegMiscInst("vcnt", "NVcntQ", unsignedTypes, 4, vcntCode) + + vmvnCode = ''' + destElem = ~srcElem1; + ''' + twoRegMiscInst("vmvn", "NVmvnD", ("uint64_t",), 2, vmvnCode) + twoRegMiscInst("vmvn", "NVmvnQ", ("uint64_t",), 4, vmvnCode) + + vqabsCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) { + fpscr.qc = 1; + destElem = ~srcElem1; + } else if (srcElem1 < 0) { + destElem = -srcElem1; + } else { + destElem = srcElem1; + } + Fpscr = fpscr; + ''' + twoRegMiscInst("vqabs", "NVqabsD", signedTypes, 2, vqabsCode) + twoRegMiscInst("vqabs", "NVqabsQ", signedTypes, 4, vqabsCode) + + vqnegCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) { + fpscr.qc = 1; + destElem = ~srcElem1; + } else { + destElem = -srcElem1; + } + Fpscr = fpscr; + ''' + twoRegMiscInst("vqneg", "NVqnegD", signedTypes, 2, vqnegCode) + twoRegMiscInst("vqneg", "NVqnegQ", signedTypes, 4, vqnegCode) + + vabsCode = ''' + if (srcElem1 < 0) { + destElem = -srcElem1; + } else { + destElem = srcElem1; + } + ''' + twoRegMiscInst("vabs", "NVabsD", signedTypes, 2, vabsCode) + twoRegMiscInst("vabs", "NVabsQ", signedTypes, 4, vabsCode) + vabsfpCode = ''' + union + { + uint32_t i; + float f; + } cStruct; + cStruct.f = srcReg1; + cStruct.i &= mask(sizeof(Element) * 8 - 1); + destReg = cStruct.f; + ''' + twoRegMiscInstFp("vabs", "NVabsDFp", ("float",), 2, vabsfpCode) + twoRegMiscInstFp("vabs", "NVabsQFp", ("float",), 4, vabsfpCode) + + vnegCode = ''' + destElem = -srcElem1; + ''' + twoRegMiscInst("vneg", "NVnegD", signedTypes, 2, vnegCode) + twoRegMiscInst("vneg", "NVnegQ", signedTypes, 4, vnegCode) + vnegfpCode = ''' + destReg = -srcReg1; + ''' + twoRegMiscInstFp("vneg", "NVnegDFp", ("float",), 2, vnegfpCode) + twoRegMiscInstFp("vneg", "NVnegQFp", ("float",), 4, vnegfpCode) + + vcgtCode = 'destElem = (srcElem1 > 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vcgt", "NVcgtD", signedTypes, 2, vcgtCode) + twoRegMiscInst("vcgt", "NVcgtQ", signedTypes, 4, vcgtCode) + vcgtfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vcgtFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? 
-1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vcgt", "NVcgtDFp", ("float",), + 2, vcgtfpCode, toInt = True) + twoRegMiscInstFp("vcgt", "NVcgtQFp", ("float",), + 4, vcgtfpCode, toInt = True) + + vcgeCode = 'destElem = (srcElem1 >= 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vcge", "NVcgeD", signedTypes, 2, vcgeCode) + twoRegMiscInst("vcge", "NVcgeQ", signedTypes, 4, vcgeCode) + vcgefpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vcgeFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vcge", "NVcgeDFp", ("float",), + 2, vcgefpCode, toInt = True) + twoRegMiscInstFp("vcge", "NVcgeQFp", ("float",), + 4, vcgefpCode, toInt = True) + + vceqCode = 'destElem = (srcElem1 == 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vceq", "NVceqD", signedTypes, 2, vceqCode) + twoRegMiscInst("vceq", "NVceqQ", signedTypes, 4, vceqCode) + vceqfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vceqFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vceq", "NVceqDFp", ("float",), + 2, vceqfpCode, toInt = True) + twoRegMiscInstFp("vceq", "NVceqQFp", ("float",), + 4, vceqfpCode, toInt = True) + + vcleCode = 'destElem = (srcElem1 <= 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vcle", "NVcleD", signedTypes, 2, vcleCode) + twoRegMiscInst("vcle", "NVcleQ", signedTypes, 4, vcleCode) + vclefpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vcleFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? -1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vcle", "NVcleDFp", ("float",), + 2, vclefpCode, toInt = True) + twoRegMiscInstFp("vcle", "NVcleQFp", ("float",), + 4, vclefpCode, toInt = True) + + vcltCode = 'destElem = (srcElem1 < 0) ? mask(sizeof(Element) * 8) : 0;' + twoRegMiscInst("vclt", "NVcltD", signedTypes, 2, vcltCode) + twoRegMiscInst("vclt", "NVcltQ", signedTypes, 4, vcltCode) + vcltfpCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + float res = binaryOp(fpscr, srcReg1, (FloatReg)0.0, vcltFunc, + true, true, VfpRoundNearest); + destReg = (res == 0) ? 
-1 : 0; + if (res == 2.0) + fpscr.ioc = 1; + Fpscr = fpscr; + ''' + twoRegMiscInstFp("vclt", "NVcltDFp", ("float",), + 2, vcltfpCode, toInt = True) + twoRegMiscInstFp("vclt", "NVcltQFp", ("float",), + 4, vcltfpCode, toInt = True) + + vswpCode = ''' + FloatRegBits mid; + for (unsigned r = 0; r < rCount; r++) { + mid = srcReg1.regs[r]; + srcReg1.regs[r] = destReg.regs[r]; + destReg.regs[r] = mid; + } + ''' + twoRegMiscScramble("vswp", "NVswpD", ("uint64_t",), 2, vswpCode) + twoRegMiscScramble("vswp", "NVswpQ", ("uint64_t",), 4, vswpCode) + + vtrnCode = ''' + Element mid; + for (unsigned i = 0; i < eCount; i += 2) { + mid = srcReg1.elements[i]; + srcReg1.elements[i] = destReg.elements[i + 1]; + destReg.elements[i + 1] = mid; + } + ''' + twoRegMiscScramble("vtrn", "NVtrnD", unsignedTypes, 2, vtrnCode) + twoRegMiscScramble("vtrn", "NVtrnQ", unsignedTypes, 4, vtrnCode) + + vuzpCode = ''' + Element mid[eCount]; + memcpy(&mid, &srcReg1, sizeof(srcReg1)); + for (unsigned i = 0; i < eCount / 2; i++) { + srcReg1.elements[i] = destReg.elements[2 * i + 1]; + srcReg1.elements[eCount / 2 + i] = mid[2 * i + 1]; + destReg.elements[i] = destReg.elements[2 * i]; + } + for (unsigned i = 0; i < eCount / 2; i++) { + destReg.elements[eCount / 2 + i] = mid[2 * i]; + } + ''' + twoRegMiscScramble("vuzp", "NVuzpD", unsignedTypes, 2, vuzpCode) + twoRegMiscScramble("vuzp", "NVuzpQ", unsignedTypes, 4, vuzpCode) + + vzipCode = ''' + Element mid[eCount]; + memcpy(&mid, &destReg, sizeof(destReg)); + for (unsigned i = 0; i < eCount / 2; i++) { + destReg.elements[2 * i] = mid[i]; + destReg.elements[2 * i + 1] = srcReg1.elements[i]; + } + for (int i = 0; i < eCount / 2; i++) { + srcReg1.elements[2 * i] = mid[eCount / 2 + i]; + srcReg1.elements[2 * i + 1] = srcReg1.elements[eCount / 2 + i]; + } + ''' + twoRegMiscScramble("vzip", "NVzipD", unsignedTypes, 2, vzipCode) + twoRegMiscScramble("vzip", "NVzipQ", unsignedTypes, 4, vzipCode) + + vmovnCode = 'destElem = srcElem1;' + twoRegNarrowMiscInst("vmovn", "NVmovn", smallUnsignedTypes, vmovnCode) + + vdupCode = 'destElem = srcElem1;' + twoRegMiscScInst("vdup", "NVdupD", smallUnsignedTypes, 2, vdupCode) + twoRegMiscScInst("vdup", "NVdupQ", smallUnsignedTypes, 4, vdupCode) + + def vdupGprInst(name, Name, types, rCount): + global header_output, exec_output + eWalkCode = ''' + RegVect destReg; + for (unsigned i = 0; i < eCount; i++) { + destReg.elements[i] = htog((Element)Op1); + } + ''' + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + vdupGprInst("vdup", "NVdupDGpr", smallUnsignedTypes, 2) + vdupGprInst("vdup", "NVdupQGpr", smallUnsignedTypes, 4) + + vmovCode = 'destElem = imm;' + oneRegImmInst("vmov", "NVmoviD", ("uint64_t",), 2, vmovCode) + oneRegImmInst("vmov", "NVmoviQ", ("uint64_t",), 4, vmovCode) + + vorrCode = 'destElem |= imm;' + oneRegImmInst("vorr", "NVorriD", ("uint64_t",), 2, vorrCode, True) + oneRegImmInst("vorr", "NVorriQ", ("uint64_t",), 4, vorrCode, True) + + vmvnCode = 'destElem = ~imm;' + oneRegImmInst("vmvn", "NVmvniD", ("uint64_t",), 2, vmvnCode) + oneRegImmInst("vmvn", "NVmvniQ", ("uint64_t",), 4, vmvnCode) + + vbicCode = 'destElem &= 
~imm;' + oneRegImmInst("vbic", "NVbiciD", ("uint64_t",), 2, vbicCode, True) + oneRegImmInst("vbic", "NVbiciQ", ("uint64_t",), 4, vbicCode, True) + + vqmovnCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = srcElem1; + if ((BigElement)destElem != srcElem1) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + } + Fpscr = fpscr; + ''' + twoRegNarrowMiscInst("vqmovn", "NVqmovn", smallSignedTypes, vqmovnCode) + + vqmovunCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = srcElem1; + if ((BigElement)destElem != srcElem1) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8); + } + Fpscr = fpscr; + ''' + twoRegNarrowMiscInst("vqmovun", "NVqmovun", + smallUnsignedTypes, vqmovunCode) + + vqmovunsCode = ''' + FPSCR fpscr = (FPSCR)Fpscr; + destElem = srcElem1; + if (srcElem1 < 0 || + ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8); + if (srcElem1 < 0) + destElem = ~destElem; + } + Fpscr = fpscr; + ''' + twoRegNarrowMiscInst("vqmovun", "NVqmovuns", + smallSignedTypes, vqmovunsCode) + + def buildVext(name, Name, types, rCount, op): + global header_output, exec_output + eWalkCode = ''' + RegVect srcReg1, srcReg2, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw); + srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw); + ''' % { "reg" : reg } + eWalkCode += op + for reg in range(rCount): + eWalkCode += ''' + FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "RegRegRegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "predicate_test": predicateTest }, []) + header_output += NeonRegRegRegImmOpDeclare.subst(iop) + exec_output += NeonEqualRegExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonExecDeclare.subst(substDict) + + vextCode = ''' + for (unsigned i = 0; i < eCount; i++) { + unsigned index = i + imm; + if (index < eCount) { + destReg.elements[i] = srcReg1.elements[index]; + } else { + index -= eCount; + assert(index < eCount); + destReg.elements[i] = srcReg2.elements[index]; + } + } + ''' + buildVext("vext", "NVextD", ("uint8_t",), 2, vextCode) + buildVext("vext", "NVextQ", ("uint8_t",), 4, vextCode) + + def buildVtbxl(name, Name, length, isVtbl): + global header_output, decoder_output, exec_output + code = ''' + union + { + uint8_t bytes[32]; + FloatRegBits regs[8]; + } table; + + union + { + uint8_t bytes[8]; + FloatRegBits regs[2]; + } destReg, srcReg2; + + const unsigned length = %(length)d; + const bool isVtbl = %(isVtbl)s; + + srcReg2.regs[0] = htog(FpOp2P0.uw); + srcReg2.regs[1] = htog(FpOp2P1.uw); + + destReg.regs[0] = htog(FpDestP0.uw); + destReg.regs[1] = htog(FpDestP1.uw); + ''' % { "length" : length, "isVtbl" : isVtbl } + for reg in range(8): + if reg < length * 2: + code += 'table.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);\n' % \ + { "reg" : reg } + else: + code += 'table.regs[%(reg)d] = 0;\n' % { "reg" : reg } + code += ''' + for (unsigned i = 0; i < sizeof(destReg); i++) { + uint8_t index = srcReg2.bytes[i]; + if (index < 8 * length) { + destReg.bytes[i] = table.bytes[index]; + } else { + if (isVtbl) + destReg.bytes[i] = 0; + // else destReg.bytes[i] unchanged + } + } + + FpDestP0.uw = gtoh(destReg.regs[0]); + FpDestP1.uw = gtoh(destReg.regs[1]); + ''' + iop = InstObjParams(name, Name, + "RegRegRegOp", + { "code": code, + "predicate_test": predicateTest }, []) + header_output 
+    def buildVext(name, Name, types, rCount, op):
+        global header_output, exec_output
+        eWalkCode = '''
+        RegVect srcReg1, srcReg2, destReg;
+        '''
+        for reg in range(rCount):
+            eWalkCode += '''
+            srcReg1.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);
+            srcReg2.regs[%(reg)d] = htog(FpOp2P%(reg)d.uw);
+            ''' % { "reg" : reg }
+        eWalkCode += op
+        for reg in range(rCount):
+            eWalkCode += '''
+            FpDestP%(reg)d.uw = gtoh(destReg.regs[%(reg)d]);
+            ''' % { "reg" : reg }
+        iop = InstObjParams(name, Name,
+                            "RegRegRegImmOp",
+                            { "code": eWalkCode,
+                              "r_count": rCount,
+                              "predicate_test": predicateTest }, [])
+        header_output += NeonRegRegRegImmOpDeclare.subst(iop)
+        exec_output += NeonEqualRegExecute.subst(iop)
+        for type in types:
+            substDict = { "targs" : type,
+                          "class_name" : Name }
+            exec_output += NeonExecDeclare.subst(substDict)
+
+    vextCode = '''
+    for (unsigned i = 0; i < eCount; i++) {
+        unsigned index = i + imm;
+        if (index < eCount) {
+            destReg.elements[i] = srcReg1.elements[index];
+        } else {
+            index -= eCount;
+            assert(index < eCount);
+            destReg.elements[i] = srcReg2.elements[index];
+        }
+    }
+    '''
+    buildVext("vext", "NVextD", ("uint8_t",), 2, vextCode)
+    buildVext("vext", "NVextQ", ("uint8_t",), 4, vextCode)
+
+    def buildVtbxl(name, Name, length, isVtbl):
+        global header_output, decoder_output, exec_output
+        code = '''
+        union
+        {
+            uint8_t bytes[32];
+            FloatRegBits regs[8];
+        } table;
+
+        union
+        {
+            uint8_t bytes[8];
+            FloatRegBits regs[2];
+        } destReg, srcReg2;
+
+        const unsigned length = %(length)d;
+        const bool isVtbl = %(isVtbl)s;
+
+        srcReg2.regs[0] = htog(FpOp2P0.uw);
+        srcReg2.regs[1] = htog(FpOp2P1.uw);
+
+        destReg.regs[0] = htog(FpDestP0.uw);
+        destReg.regs[1] = htog(FpDestP1.uw);
+        ''' % { "length" : length, "isVtbl" : isVtbl }
+        for reg in range(8):
+            if reg < length * 2:
+                code += 'table.regs[%(reg)d] = htog(FpOp1P%(reg)d.uw);\n' % \
+                        { "reg" : reg }
+            else:
+                code += 'table.regs[%(reg)d] = 0;\n' % { "reg" : reg }
+        code += '''
+        for (unsigned i = 0; i < sizeof(destReg); i++) {
+            uint8_t index = srcReg2.bytes[i];
+            if (index < 8 * length) {
+                destReg.bytes[i] = table.bytes[index];
+            } else {
+                if (isVtbl)
+                    destReg.bytes[i] = 0;
+                // else destReg.bytes[i] unchanged
+            }
+        }
+
+        FpDestP0.uw = gtoh(destReg.regs[0]);
+        FpDestP1.uw = gtoh(destReg.regs[1]);
+        '''
+        iop = InstObjParams(name, Name,
+                            "RegRegRegOp",
+                            { "code": code,
+                              "predicate_test": predicateTest }, [])
+        header_output += RegRegRegOpDeclare.subst(iop)
+        decoder_output += RegRegRegOpConstructor.subst(iop)
+        exec_output += PredOpExecute.subst(iop)
+
+    buildVtbxl("vtbl", "NVtbl1", 1, "true")
+    buildVtbxl("vtbl", "NVtbl2", 2, "true")
+    buildVtbxl("vtbl", "NVtbl3", 3, "true")
+    buildVtbxl("vtbl", "NVtbl4", 4, "true")
+
+    buildVtbxl("vtbx", "NVtbx1", 1, "false")
+    buildVtbxl("vtbx", "NVtbx2", 2, "false")
+    buildVtbxl("vtbx", "NVtbx3", 3, "false")
+    buildVtbxl("vtbx", "NVtbx4", 4, "false")
+}};
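buildVtbxl above generates VTBL/VTBX: the first operand's registers form a table of up to 32 bytes, the second operand supplies one index per destination byte, and out-of-range indices either zero the byte (VTBL) or leave it unchanged (VTBX). The lookup rule in isolation, as a plain C++ sketch (function and parameter names are hypothetical):

    #include <cstdint>
    #include <cstddef>

    // One 64-bit result: 'table' holds 8 * length valid bytes and
    // 'indices' selects a table byte for each output position.
    inline void tableLookup(uint8_t dest[8], const uint8_t *table,
                            unsigned length, const uint8_t indices[8],
                            bool isVtbl)
    {
        for (size_t i = 0; i < 8; i++) {
            uint8_t index = indices[i];
            if (index < 8 * length)
                dest[i] = table[index];  // in range: copy the table byte
            else if (isVtbl)
                dest[i] = 0;             // VTBL zeroes out-of-range bytes
            // VTBX leaves dest[i] unchanged
        }
    }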
diff --git a/src/arch/arm/isa/operands.isa b/src/arch/arm/isa/operands.isa
index a086bb03c..5490a28e0 100644
--- a/src/arch/arm/isa/operands.isa
+++ b/src/arch/arm/isa/operands.isa
@@ -47,6 +47,7 @@ def operand_types {{
     'sw' : ('signed int', 32),
     'uw' : ('unsigned int', 32),
     'ud' : ('unsigned int', 64),
+    'tud' : ('twin64 int', 64),
     'sf' : ('float', 32),
     'df' : ('float', 64)
 }};
@@ -96,6 +97,18 @@ def operands {{
     'FpDestP1': ('FloatReg', 'sf', '(dest + 1)', 'IsFloating', 2),
     'FpDestP2': ('FloatReg', 'sf', '(dest + 2)', 'IsFloating', 2),
     'FpDestP3': ('FloatReg', 'sf', '(dest + 3)', 'IsFloating', 2),
+    'FpDestP4': ('FloatReg', 'sf', '(dest + 4)', 'IsFloating', 2),
+    'FpDestP5': ('FloatReg', 'sf', '(dest + 5)', 'IsFloating', 2),
+    'FpDestP6': ('FloatReg', 'sf', '(dest + 6)', 'IsFloating', 2),
+    'FpDestP7': ('FloatReg', 'sf', '(dest + 7)', 'IsFloating', 2),
+    'FpDestS0P0': ('FloatReg', 'sf', '(dest + step * 0 + 0)', 'IsFloating', 2),
+    'FpDestS0P1': ('FloatReg', 'sf', '(dest + step * 0 + 1)', 'IsFloating', 2),
+    'FpDestS1P0': ('FloatReg', 'sf', '(dest + step * 1 + 0)', 'IsFloating', 2),
+    'FpDestS1P1': ('FloatReg', 'sf', '(dest + step * 1 + 1)', 'IsFloating', 2),
+    'FpDestS2P0': ('FloatReg', 'sf', '(dest + step * 2 + 0)', 'IsFloating', 2),
+    'FpDestS2P1': ('FloatReg', 'sf', '(dest + step * 2 + 1)', 'IsFloating', 2),
+    'FpDestS3P0': ('FloatReg', 'sf', '(dest + step * 3 + 0)', 'IsFloating', 2),
+    'FpDestS3P1': ('FloatReg', 'sf', '(dest + step * 3 + 1)', 'IsFloating', 2),
     'Result': ('IntReg', 'uw', 'result', 'IsInteger', 2,
                maybePCRead, maybePCWrite),
     'Dest2': ('IntReg', 'uw', 'dest2', 'IsInteger', 2,
@@ -124,6 +137,18 @@ def operands {{
     'FpOp1P1': ('FloatReg', 'sf', '(op1 + 1)', 'IsFloating', 2),
     'FpOp1P2': ('FloatReg', 'sf', '(op1 + 2)', 'IsFloating', 2),
     'FpOp1P3': ('FloatReg', 'sf', '(op1 + 3)', 'IsFloating', 2),
+    'FpOp1P4': ('FloatReg', 'sf', '(op1 + 4)', 'IsFloating', 2),
+    'FpOp1P5': ('FloatReg', 'sf', '(op1 + 5)', 'IsFloating', 2),
+    'FpOp1P6': ('FloatReg', 'sf', '(op1 + 6)', 'IsFloating', 2),
+    'FpOp1P7': ('FloatReg', 'sf', '(op1 + 7)', 'IsFloating', 2),
+    'FpOp1S0P0': ('FloatReg', 'sf', '(op1 + step * 0 + 0)', 'IsFloating', 2),
+    'FpOp1S0P1': ('FloatReg', 'sf', '(op1 + step * 0 + 1)', 'IsFloating', 2),
+    'FpOp1S1P0': ('FloatReg', 'sf', '(op1 + step * 1 + 0)', 'IsFloating', 2),
+    'FpOp1S1P1': ('FloatReg', 'sf', '(op1 + step * 1 + 1)', 'IsFloating', 2),
+    'FpOp1S2P0': ('FloatReg', 'sf', '(op1 + step * 2 + 0)', 'IsFloating', 2),
+    'FpOp1S2P1': ('FloatReg', 'sf', '(op1 + step * 2 + 1)', 'IsFloating', 2),
+    'FpOp1S3P0': ('FloatReg', 'sf', '(op1 + step * 3 + 0)', 'IsFloating', 2),
+    'FpOp1S3P1': ('FloatReg', 'sf', '(op1 + step * 3 + 1)', 'IsFloating', 2),
     'MiscOp1': ('ControlReg', 'uw', 'op1', (None, None, 'IsControl'), 2),
     'Op2': ('IntReg', 'uw', 'op2', 'IsInteger', 2,
             maybePCRead, maybePCWrite),
@@ -164,6 +189,7 @@ def operands {{
                maybePCRead, maybeIWPCWrite),
     'Fa' : ('FloatReg', 'sf', 'ura', 'IsFloating', 2),
     'Rb' : ('IntReg', 'uw', 'urb', 'IsInteger', 2, maybePCRead, maybePCWrite),
+    'Rc' : ('IntReg', 'uw', 'urc', 'IsInteger', 2, maybePCRead, maybePCWrite),
 
     #General Purpose Floating Point Reg Operands
     'Fd': ('FloatReg', 'df', 'FD', 'IsFloating', 2),
diff --git a/src/arch/arm/isa/templates/macromem.isa b/src/arch/arm/isa/templates/macromem.isa
index 400342a29..5397a2637 100644
--- a/src/arch/arm/isa/templates/macromem.isa
+++ b/src/arch/arm/isa/templates/macromem.isa
@@ -74,7 +74,32 @@ def template MicroMemConstructor {{
 ////////////////////////////////////////////////////////////////////
 //
-// Integer = Integer op Immediate microops
+// Neon load/store microops
+//
+
+def template MicroNeonMemDeclare {{
+    template <class Element>
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst, RegIndex _dest,
+                       RegIndex _ura, uint32_t _imm, unsigned extraMemFlags)
+            : %(base_class)s("%(mnemonic)s", machInst,
+                             %(op_class)s, _dest, _ura, _imm)
+        {
+            memAccessFlags |= extraMemFlags;
+            %(constructor)s;
+        }
+
+        %(BasicExecDeclare)s
+        %(InitiateAccDeclare)s
+        %(CompleteAccDeclare)s
+    };
+}};
+
+////////////////////////////////////////////////////////////////////
+//
+// Integer = Integer op Integer microops
 //
 
 def template MicroIntDeclare {{
@@ -82,13 +107,130 @@ def template MicroIntDeclare {{
     {
       public:
         %(class_name)s(ExtMachInst machInst,
+                       RegIndex _ura, RegIndex _urb, RegIndex _urc);
+        %(BasicExecDeclare)s
+    };
+}};
+
+def template MicroIntConstructor {{
+    %(class_name)s::%(class_name)s(ExtMachInst machInst,
+                                   RegIndex _ura,
+                                   RegIndex _urb,
+                                   RegIndex _urc)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _ura, _urb, _urc)
+    {
+        %(constructor)s;
+    }
+}};
+
+def template MicroNeonMemExecDeclare {{
+    template
+    Fault %(class_name)s<%(targs)s>::execute(
+            %(CPU_exec_context)s *, Trace::InstRecord *) const;
+    template
+    Fault %(class_name)s<%(targs)s>::initiateAcc(
+            %(CPU_exec_context)s *, Trace::InstRecord *) const;
+    template
+    Fault %(class_name)s<%(targs)s>::completeAcc(PacketPtr,
+            %(CPU_exec_context)s *, Trace::InstRecord *) const;
+}};
+
+def template MicroNeonExecDeclare {{
+    template
+    Fault %(class_name)s<%(targs)s>::execute(
+            %(CPU_exec_context)s *, Trace::InstRecord *) const;
+}};
+
+////////////////////////////////////////////////////////////////////
+//
+// Neon (de)interleaving microops
+//
+
+def template MicroNeonMixDeclare {{
+    template <class Element>
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst, RegIndex _dest, RegIndex _op1,
+                       uint8_t _step) :
+            %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                           _dest, _op1, _step)
+        {
+            %(constructor)s;
+        }
+
+        %(BasicExecDeclare)s
+    };
+}};
+
+def template MicroNeonMixExecute {{
+    template <class Element>
+    Fault %(class_name)s<Element>::execute(%(CPU_exec_context)s *xc,
+            Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+        uint64_t resTemp = 0;
+        resTemp = resTemp;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        if (%(predicate_test)s)
+        {
+            %(code)s;
+            if (fault == NoFault)
+            {
+                %(op_wb)s;
+            }
+        }
+
+        if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
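These mix microops are what the NEON structure load/store macro-ops use to shuffle elements between the interleaved memory layout and the per-structure register layout. A sketch of the two-way case such a microop performs, assuming plain arrays rather than the generated RegVect unions:

    #include <cstddef>

    // Split an interleaved stream {a0, b0, a1, b1, ...} into two element
    // vectors, as a VLD2-style deinterleave microop would.
    template <typename Element>
    void deinterleave2(const Element *in, Element *outA, Element *outB,
                       size_t pairs)
    {
        for (size_t i = 0; i < pairs; i++) {
            outA[i] = in[2 * i];        // structure member 0
            outB[i] = in[2 * i + 1];    // structure member 1
        }
    }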
+
+////////////////////////////////////////////////////////////////////
+//
+// Neon (un)packing microops using a particular lane
+//
+
+def template MicroNeonMixLaneDeclare {{
+    template <class Element>
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst, RegIndex _dest, RegIndex _op1,
+                       uint8_t _step, unsigned _lane) :
+            %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                           _dest, _op1, _step, _lane)
+        {
+            %(constructor)s;
+        }
+
+        %(BasicExecDeclare)s
+    };
+}};
+
+////////////////////////////////////////////////////////////////////
+//
+// Integer = Integer op Immediate microops
+//
+
+def template MicroIntImmDeclare {{
+    class %(class_name)s : public %(base_class)s
+    {
+      public:
+        %(class_name)s(ExtMachInst machInst,
+                       RegIndex _ura, RegIndex _urb,
                        uint8_t _imm);
         %(BasicExecDeclare)s
     };
 }};
 
-def template MicroIntConstructor {{
+def template MicroIntImmConstructor {{
 %(class_name)s::%(class_name)s(ExtMachInst machInst,
                                RegIndex _ura,
                                RegIndex _urb,
@@ -132,6 +274,52 @@ def template MacroMemConstructor {{
 
 }};
 
+def template VMemMultDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst, unsigned width,
+                   RegIndex rn, RegIndex vd, unsigned regs, unsigned inc,
+                   uint32_t size, uint32_t align, RegIndex rm);
+    %(BasicExecPanic)s
+};
+}};
+
+def template VMemMultConstructor {{
+%(class_name)s::%(class_name)s(ExtMachInst machInst, unsigned width,
+        RegIndex rn, RegIndex vd, unsigned regs, unsigned inc,
+        uint32_t size, uint32_t align, RegIndex rm)
+    : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, width,
+                     rn, vd, regs, inc, size, align, rm)
+{
+    %(constructor)s;
+}
+}};
+
+def template VMemSingleDeclare {{
+class %(class_name)s : public %(base_class)s
+{
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst, bool all, unsigned width,
+                   RegIndex rn, RegIndex vd, unsigned regs, unsigned inc,
+                   uint32_t size, uint32_t align, RegIndex rm,
+                   unsigned lane = 0);
+    %(BasicExecPanic)s
+};
+}};
+
+def template VMemSingleConstructor {{
+%(class_name)s::%(class_name)s(ExtMachInst machInst, bool all, unsigned width,
+        RegIndex rn, RegIndex vd, unsigned regs, unsigned inc,
+        uint32_t size, uint32_t align, RegIndex rm, unsigned lane)
+    : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, all, width,
+                     rn, vd, regs, inc, size, align, rm, lane)
+{
+    %(constructor)s;
+}
+}};
+
 def template MacroVFPMemDeclare {{
 /**
  * Static instructions class for a store multiple instruction
diff --git a/src/arch/arm/isa/templates/mem.isa b/src/arch/arm/isa/templates/mem.isa
index 84cd1dd8f..686a8b0aa 100644
--- a/src/arch/arm/isa/templates/mem.isa
+++ b/src/arch/arm/isa/templates/mem.isa
@@ -180,6 +180,42 @@ def template LoadExecute {{
     }
 }};
 
+def template NeonLoadExecute {{
+    template <class Element>
+    Fault %(class_name)s<Element>::execute(
+            %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(mem_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemUnion memUnion;
+        uint8_t *dataPtr = memUnion.bytes;
+
+        if (%(predicate_test)s)
+        {
+            if (fault == NoFault) {
+                fault = xc->readBytes(EA, dataPtr, %(size)d, memAccessFlags);
+                %(memacc_code)s;
+            }
+
+            if (fault == NoFault) {
+                %(op_wb)s;
+            }
+        }
+
+        if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
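NeonLoadExecute reads the whole access into the MemUnion byte buffer with xc->readBytes and lets the generated memacc_code view those bytes as elements. A sketch of that bytes-to-elements step (the memcpy sidesteps strict-aliasing trouble; the function is illustrative, not the generated code):

    #include <cstdint>
    #include <cstddef>
    #include <cstring>

    // Pull element i out of a raw byte buffer filled by one wide memory
    // read; a gtoh()-style byte swap would follow on a host whose
    // endianness differs from the guest's.
    template <typename Element>
    Element elementAt(const uint8_t *bytes, size_t i)
    {
        Element e;
        std::memcpy(&e, bytes + i * sizeof(Element), sizeof(Element));
        return e;
    }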
 
 def template StoreExecute {{
     Fault %(class_name)s::execute(%(CPU_exec_context)s *xc,
                                   Trace::InstRecord *traceData) const
@@ -217,6 +253,46 @@ def template StoreExecute {{
     }
 }};
 
+def template NeonStoreExecute {{
+    template <class Element>
+    Fault %(class_name)s<Element>::execute(
+            %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(mem_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        MemUnion memUnion;
+        uint8_t *dataPtr = memUnion.bytes;
+
+        if (%(predicate_test)s)
+        {
+            if (fault == NoFault) {
+                %(memacc_code)s;
+            }
+
+            if (fault == NoFault) {
+                fault = xc->writeBytes(dataPtr, %(size)d, EA,
+                                       memAccessFlags, NULL);
+            }
+
+            if (fault == NoFault) {
+                %(op_wb)s;
+            }
+        }
+
+        if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
+
 def template StoreExExecute {{
     Fault %(class_name)s::execute(%(CPU_exec_context)s *xc,
                                   Trace::InstRecord *traceData) const
@@ -336,6 +412,45 @@ def template StoreInitiateAcc {{
     }
 }};
 
+def template NeonStoreInitiateAcc {{
+    template <class Element>
+    Fault %(class_name)s<Element>::initiateAcc(
+            %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(mem_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (%(predicate_test)s)
+        {
+            MemUnion memUnion;
+            if (fault == NoFault) {
+                %(memacc_code)s;
+            }
+
+            if (fault == NoFault) {
+                fault = xc->writeBytes(memUnion.bytes, %(size)d, EA,
+                                       memAccessFlags, NULL);
+            }
+
+            // Need to write back any potential address register update
+            if (fault == NoFault) {
+                %(op_wb)s;
+            }
+        }
+
+        if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
+
 def template LoadInitiateAcc {{
     Fault %(class_name)s::initiateAcc(%(CPU_exec_context)s *xc,
                                       Trace::InstRecord *traceData) const
@@ -363,6 +478,31 @@ def template LoadInitiateAcc {{
     }
 }};
 
+def template NeonLoadInitiateAcc {{
+    template <class Element>
+    Fault %(class_name)s<Element>::initiateAcc(
+            %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const
+    {
+        Addr EA;
+        Fault fault = NoFault;
+
+        %(op_src_decl)s;
+        %(op_rd)s;
+        %(ea_code)s;
+
+        if (%(predicate_test)s)
+        {
+            if (fault == NoFault) {
+                fault = xc->readBytes(EA, NULL, %(size)d, memAccessFlags);
+            }
+        } else if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
+
 def template LoadCompleteAcc {{
     Fault %(class_name)s::completeAcc(PacketPtr pkt,
                                       %(CPU_exec_context)s *xc,
@@ -395,6 +535,40 @@ def template LoadCompleteAcc {{
     }
 }};
 
+def template NeonLoadCompleteAcc {{
+    template <class Element>
+    Fault %(class_name)s<Element>::completeAcc(
+            PacketPtr pkt, %(CPU_exec_context)s *xc,
+            Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+
+        %(mem_decl)s;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        if (%(predicate_test)s)
+        {
+            // ARM instructions will not have a pkt if the predicate is false
+            MemUnion &memUnion = *(MemUnion *)pkt->getPtr<uint8_t>();
+
+            if (fault == NoFault) {
+                %(memacc_code)s;
+            }
+
+            if (fault == NoFault) {
+                %(op_wb)s;
+            }
+        }
+
+        if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
+
 def template StoreCompleteAcc {{
     Fault %(class_name)s::completeAcc(PacketPtr pkt,
                                       %(CPU_exec_context)s *xc,
@@ -420,6 +594,32 @@ def template StoreCompleteAcc {{
     }
 }};
 
+def template NeonStoreCompleteAcc {{
+    template <class Element>
+    Fault %(class_name)s<Element>::completeAcc(
+            PacketPtr pkt, %(CPU_exec_context)s *xc,
+            Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+
+        %(op_decl)s;
+        %(op_rd)s;
+
+        if (%(predicate_test)s)
+        {
+            if (fault == NoFault) {
+                %(op_wb)s;
+            }
+        }
+
+        if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
+
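The Neon*InitiateAcc/*CompleteAcc pairs above split each access for the timing CPU models: initiateAcc computes the effective address and issues the request without touching destination state, and completeAcc runs when the packet returns, doing element extraction and writeback; the execute templates do both steps at once for atomic-mode CPUs. A condensed sketch of that two-phase shape (types and interfaces simplified and hypothetical):

    #include <cstdint>
    #include <cstring>

    struct Packet { const uint8_t *data; };  // stand-in for gem5's PacketPtr

    struct SplitLoad {
        uint64_t ea = 0;      // remembered between the two phases
        uint32_t dest = 0;    // destination register value

        // Phase 1: address generation and request issue; no data has
        // arrived yet, so nothing can be written back here.
        void initiateAcc(uint64_t base, uint64_t offset) {
            ea = base + offset;
            // ...a read request for 'ea' would be handed to the memory
            // system here...
        }

        // Phase 2: the memory system has responded; only now is it safe
        // to decode the bytes and update architectural state.
        void completeAcc(const Packet &pkt) {
            std::memcpy(&dest, pkt.data, sizeof(dest));
        }
    };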
 def template StoreExCompleteAcc {{
     Fault %(class_name)s::completeAcc(PacketPtr pkt,
                                       %(CPU_exec_context)s *xc,
diff --git a/src/arch/arm/isa/templates/neon.isa b/src/arch/arm/isa/templates/neon.isa
new file mode 100644
index 000000000..e402979dc
--- /dev/null
+++ b/src/arch/arm/isa/templates/neon.isa
@@ -0,0 +1,227 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2010 ARM Limited
+// All rights reserved
+//
+// The license below extends only to copyright in the software and shall
+// not be construed as granting a license to any other intellectual
+// property including but not limited to intellectual property relating
+// to a hardware implementation of the functionality of the software
+// licensed hereunder.  You may use the software subject to the license
+// terms below provided that you ensure that this notice is replicated
+// unmodified and in its entirety in all distributions of the software,
+// modified or unmodified, in source code or in binary form.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+def template NeonRegRegRegOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonRegRegRegImmOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+                   uint64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _op2, _imm)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonRegRegImmOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1, uint64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1, _imm)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonRegImmOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst, IntRegIndex _dest, uint64_t _imm)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s, _dest, _imm)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonRegRegOpDeclare {{
+template <class _Element>
+class %(class_name)s : public %(base_class)s
+{
+  protected:
+    typedef _Element Element;
+  public:
+    // Constructor
+    %(class_name)s(ExtMachInst machInst,
+                   IntRegIndex _dest, IntRegIndex _op1)
+        : %(base_class)s("%(mnemonic)s", machInst, %(op_class)s,
+                         _dest, _op1)
+    {
+        %(constructor)s;
+    }
+
+    %(BasicExecDeclare)s
+};
+}};
+
+def template NeonExecDeclare {{
+    template
+    Fault %(class_name)s<%(targs)s>::execute(
+            %(CPU_exec_context)s *, Trace::InstRecord *) const;
+}};
+
+def template NeonEqualRegExecute {{
+    template <class Element>
+    Fault %(class_name)s<Element>::execute(%(CPU_exec_context)s *xc,
+            Trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        const unsigned rCount = %(r_count)d;
+        const unsigned eCount = rCount * sizeof(FloatRegBits) /
+                                sizeof(Element);
+
+        union RegVect {
+            FloatRegBits regs[rCount];
+            Element elements[eCount];
+        };
+
+        if (%(predicate_test)s)
+        {
+            %(code)s;
+            if (fault == NoFault)
+            {
+                %(op_wb)s;
+            }
+        }
+
+        if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
+
+output header {{
+    uint16_t nextBiggerType(uint8_t);
+    uint32_t nextBiggerType(uint16_t);
+    uint64_t nextBiggerType(uint32_t);
+    int16_t nextBiggerType(int8_t);
+    int32_t nextBiggerType(int16_t);
+    int64_t nextBiggerType(int32_t);
+}};
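The nextBiggerType overloads are never called at run time; they exist so overload resolution can map each element type to its double-width partner at compile time via GCC's typeof, which NeonUnequalRegExecute uses just below as typedef typeof(nextBiggerType((Element)0)) BigElement. The same trick in modern C++ spelling (a sketch; the original predates decltype):

    #include <cstdint>

    // Declarations only: overload resolution picks the return type, and
    // decltype never evaluates the call.
    int16_t  widen(int8_t);
    int32_t  widen(int16_t);
    int64_t  widen(int32_t);
    uint16_t widen(uint8_t);
    uint32_t widen(uint16_t);
    uint64_t widen(uint32_t);

    template <typename Element>
    using BigElement = decltype(widen(Element{}));

    static_assert(sizeof(BigElement<int8_t>) == 2,
                  "int8_t widens to int16_t");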
+
+def template NeonUnequalRegExecute {{
+    template <class Element>
+    Fault %(class_name)s<Element>::execute(%(CPU_exec_context)s *xc,
+            Trace::InstRecord *traceData) const
+    {
+        typedef typeof(nextBiggerType((Element)0)) BigElement;
+        Fault fault = NoFault;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        const unsigned rCount = %(r_count)d;
+        const unsigned eCount = rCount * sizeof(FloatRegBits) /
+                                sizeof(Element);
+
+        union RegVect {
+            FloatRegBits regs[rCount];
+            Element elements[eCount];
+            BigElement bigElements[eCount / 2];
+        };
+
+        union BigRegVect {
+            FloatRegBits regs[2 * rCount];
+            BigElement elements[eCount];
+        };
+
+        if (%(predicate_test)s)
+        {
+            %(code)s;
+            if (fault == NoFault)
+            {
+                %(op_wb)s;
+            }
+        }
+
+        if (fault == NoFault && machInst.itstateMask != 0) {
+            xc->setMiscReg(MISCREG_ITSTATE, machInst.newItstate);
+        }
+
+        return fault;
+    }
+}};
diff --git a/src/arch/arm/isa/templates/templates.isa b/src/arch/arm/isa/templates/templates.isa
index 2584ec1f2..148139225 100644
--- a/src/arch/arm/isa/templates/templates.isa
+++ b/src/arch/arm/isa/templates/templates.isa
@@ -60,3 +60,6 @@
 
 //Templates for VFP instructions
 ##include "vfp.isa"
+
+//Templates for Neon instructions
+##include "neon.isa"
diff --git a/src/arch/arm/tlb.hh b/src/arch/arm/tlb.hh
index 1bddd8497..668984591 100644
--- a/src/arch/arm/tlb.hh
+++ b/src/arch/arm/tlb.hh
@@ -65,20 +65,22 @@ class TLB : public BaseTLB
 {
   public:
     enum ArmFlags {
-        AlignmentMask = 0x7,
+        AlignmentMask = 0x1f,
 
         AlignByte = 0x0,
         AlignHalfWord = 0x1,
         AlignWord = 0x3,
         AlignDoubleWord = 0x7,
+        AlignQuadWord = 0xf,
+        AlignOctWord = 0x1f,
 
-        AllowUnaligned = 0x8,
+        AllowUnaligned = 0x20,
         // Priv code operating as if it wasn't
-        UserMode = 0x10,
+        UserMode = 0x40,
         // Because zero otherwise looks like a valid setting and may be used
        // accidentally, this bit must be non-zero to show it was used on
         // purpose.
-        MustBeOne = 0x20
+        MustBeOne = 0x80
     };
   protected:
     typedef std::multimap<Addr, int> PageTable;
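The tlb.hh hunk widens AlignmentMask from three bits to five so the new quad- and oct-word (16- and 32-byte) NEON alignments fit, shifting AllowUnaligned, UserMode, and MustBeOne up to stay disjoint. Each alignment code is a (2^n - 1) pattern, so the flag doubles as an address mask; a quick illustration of how such flags would be checked (the helper is hypothetical, the constants are from the patch):

    #include <cstdint>

    enum ArmFlags : uint32_t {
        AlignmentMask = 0x1f,
        AlignByte = 0x0,
        AlignHalfWord = 0x1,
        AlignWord = 0x3,
        AlignDoubleWord = 0x7,
        AlignQuadWord = 0xf,
        AlignOctWord = 0x1f,
        AllowUnaligned = 0x20,
        UserMode = 0x40,
        MustBeOne = 0x80
    };

    // An access is aligned iff the address shares no set bits with its
    // alignment pattern.
    inline bool isAligned(uint64_t vaddr, uint32_t flags)
    {
        return (vaddr & (flags & AlignmentMask)) == 0;
    }

    // e.g. a 16-byte NEON access at 0x1008:
    //   isAligned(0x1008, MustBeOne | AlignQuadWord) == false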