From 6368edb281f162e4fbb0a91744992a25134135f4 Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Wed, 25 Aug 2010 19:10:42 -0500 Subject: ARM: Implement all ARM SIMD instructions. --- src/arch/arm/insts/macromem.cc | 684 +++++- src/arch/arm/insts/macromem.hh | 118 +- src/arch/arm/insts/pred_inst.hh | 14 +- src/arch/arm/insts/static_inst.hh | 22 + src/arch/arm/insts/vfp.cc | 330 ++- src/arch/arm/insts/vfp.hh | 123 +- src/arch/arm/isa/decoder/thumb.isa | 2 +- src/arch/arm/isa/formats/fp.isa | 1643 +++++++++++---- src/arch/arm/isa/insts/fp.isa | 176 +- src/arch/arm/isa/insts/insts.isa | 5 +- src/arch/arm/isa/insts/macromem.isa | 499 ++++- src/arch/arm/isa/insts/neon.isa | 3343 ++++++++++++++++++++++++++++++ src/arch/arm/isa/operands.isa | 26 + src/arch/arm/isa/templates/macromem.isa | 192 +- src/arch/arm/isa/templates/mem.isa | 200 ++ src/arch/arm/isa/templates/neon.isa | 227 ++ src/arch/arm/isa/templates/templates.isa | 3 + src/arch/arm/tlb.hh | 10 +- 18 files changed, 7069 insertions(+), 548 deletions(-) create mode 100644 src/arch/arm/isa/insts/neon.isa create mode 100644 src/arch/arm/isa/templates/neon.isa (limited to 'src') diff --git a/src/arch/arm/insts/macromem.cc b/src/arch/arm/insts/macromem.cc index 2a2412912..5602231f9 100644 --- a/src/arch/arm/insts/macromem.cc +++ b/src/arch/arm/insts/macromem.cc @@ -137,6 +137,647 @@ MacroMemOp::MacroMemOp(const char *mnem, ExtMachInst machInst, } } +VldMultOp::VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + numMicroops = (regs > 2) ? 2 : 1; + bool wb = (rm != 15); + bool deinterleave = (elems > 1); + + if (wb) numMicroops++; + if (deinterleave) numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex rMid = deinterleave ? 
NumFloatArchRegs : vd * 2; + + uint32_t noAlign = TLB::MustBeOne; + + unsigned uopIdx = 0; + switch (regs) { + case 4: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 3: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 2: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + break; + case 1: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + break; + default: + panic("Unrecognized number of registers %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, regs * 8); + } + } + if (deinterleave) { + switch (elems) { + case 4: + assert(regs == 4); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2, rMid, inc * 2); + break; + case 3: + assert(regs == 3); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2, rMid, inc * 2); + break; + case 2: + assert(regs == 4 || regs == 2); + if (regs == 4) { + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2, rMid, inc * 2); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2 + 2, rMid + 4, inc * 2); + } else { + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2, rMid, inc * 2); + } + break; + default: + panic("Bad number of elements to deinterleave %d.\n", elems); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VldSingleOp::VldSingleOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, bool all, unsigned elems, + 
RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, + RegIndex rm, unsigned lane) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + unsigned eBytes = (1 << size); + unsigned loadSize = eBytes * elems; + unsigned loadRegs M5_VAR_USED = (loadSize + sizeof(FloatRegBits) - 1) / + sizeof(FloatRegBits); + + assert(loadRegs > 0 && loadRegs <= 4); + + numMicroops = 1; + bool wb = (rm != 15); + + if (wb) numMicroops++; + numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex ufp0 = NumFloatArchRegs; + + unsigned uopIdx = 0; + switch (loadSize) { + case 1: + microOps[uopIdx++] = new MicroLdrNeon1Uop( + machInst, ufp0, rn, 0, align); + break; + case 2: + if (eBytes == 2) { + microOps[uopIdx++] = new MicroLdrNeon2Uop( + machInst, ufp0, rn, 0, align); + } else { + microOps[uopIdx++] = new MicroLdrNeon2Uop( + machInst, ufp0, rn, 0, align); + } + break; + case 3: + microOps[uopIdx++] = new MicroLdrNeon3Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + switch (eBytes) { + case 1: + microOps[uopIdx++] = new MicroLdrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + case 2: + microOps[uopIdx++] = new MicroLdrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroLdrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 6: + microOps[uopIdx++] = new MicroLdrNeon6Uop( + machInst, ufp0, rn, 0, align); + break; + case 8: + switch (eBytes) { + case 2: + microOps[uopIdx++] = new MicroLdrNeon8Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroLdrNeon8Uop( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 12: + microOps[uopIdx++] = new MicroLdrNeon12Uop( + machInst, ufp0, rn, 0, align); + break; + case 16: + microOps[uopIdx++] = new MicroLdrNeon16Uop( + machInst, ufp0, rn, 0, align); + break; + default: + panic("Unrecognized load size %d.\n", 
regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, loadSize); + } + } + switch (elems) { + case 4: + assert(regs == 4); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to8Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to8Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to8Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to8Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon4to8Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon4to8Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 3: + assert(regs == 3); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to6Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to6Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to6Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to6Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon4to6Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon4to6Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 2: + assert(regs == 2); + assert(loadRegs <= 2); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop( + machInst, 
vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 1: + assert(regs == 1 || (all && regs == 2)); + assert(loadRegs <= 2); + for (unsigned offset = 0; offset < regs; offset++) { + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + } + break; + default: + panic("Bad number of elements to unpack %d.\n", elems); + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + 
+VstMultOp::VstMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + numMicroops = (regs > 2) ? 2 : 1; + bool wb = (rm != 15); + bool interleave = (elems > 1); + + if (wb) numMicroops++; + if (interleave) numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + uint32_t noAlign = TLB::MustBeOne; + + RegIndex rMid = interleave ? NumFloatArchRegs : vd * 2; + + unsigned uopIdx = 0; + if (interleave) { + switch (elems) { + case 4: + assert(regs == 4); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid, vd * 2, inc * 2); + break; + case 3: + assert(regs == 3); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid, vd * 2, inc * 2); + break; + case 2: + assert(regs == 4 || regs == 2); + if (regs == 4) { + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid, vd * 2, inc * 2); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid + 4, vd * 2 + 2, inc * 2); + } else { + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid, vd * 2, inc * 2); + } + break; + default: + panic("Bad number of elements to interleave %d.\n", elems); + } + } + switch (regs) { + case 4: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 3: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 2: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + break; + case 1: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + break; + default: + panic("Unrecognized number of registers %d.\n", regs); + } + 
if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, regs * 8); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VstSingleOp::VstSingleOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, bool all, unsigned elems, + RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, + RegIndex rm, unsigned lane) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(!all); + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + unsigned eBytes = (1 << size); + unsigned storeSize = eBytes * elems; + unsigned storeRegs M5_VAR_USED = (storeSize + sizeof(FloatRegBits) - 1) / + sizeof(FloatRegBits); + + assert(storeRegs > 0 && storeRegs <= 4); + + numMicroops = 1; + bool wb = (rm != 15); + + if (wb) numMicroops++; + numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex ufp0 = NumFloatArchRegs; + + unsigned uopIdx = 0; + switch (elems) { + case 4: + assert(regs == 4); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon8to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon8to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon8to4Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 3: + assert(regs == 3); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon6to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon6to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new 
MicroPackNeon6to4Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 2: + assert(regs == 2); + assert(storeRegs <= 2); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon4to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon4to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon4to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 1: + assert(regs == 1 || (all && regs == 2)); + assert(storeRegs <= 2); + for (unsigned offset = 0; offset < regs; offset++) { + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon2to2Uop( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon2to2Uop( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon2to2Uop( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + } + break; + default: + panic("Bad number of elements to pack %d.\n", elems); + } + switch (storeSize) { + case 1: + microOps[uopIdx++] = new MicroStrNeon1Uop( + machInst, ufp0, rn, 0, align); + break; + case 2: + if (eBytes == 2) { + microOps[uopIdx++] = new MicroStrNeon2Uop( + machInst, ufp0, rn, 0, align); + } else { + microOps[uopIdx++] = new MicroStrNeon2Uop( + machInst, ufp0, rn, 0, align); + } + break; + case 3: + microOps[uopIdx++] = new MicroStrNeon3Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + switch (eBytes) { + case 1: + microOps[uopIdx++] = new MicroStrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + case 2: + microOps[uopIdx++] = new MicroStrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroStrNeon4Uop( + machInst, ufp0, rn, 0, 
align); + break; + } + break; + case 6: + microOps[uopIdx++] = new MicroStrNeon6Uop( + machInst, ufp0, rn, 0, align); + break; + case 8: + switch (eBytes) { + case 2: + microOps[uopIdx++] = new MicroStrNeon8Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroStrNeon8Uop( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 12: + microOps[uopIdx++] = new MicroStrNeon12Uop( + machInst, ufp0, rn, 0, align); + break; + case 16: + microOps[uopIdx++] = new MicroStrNeon16Uop( + machInst, ufp0, rn, 0, align); + break; + default: + panic("Unrecognized store size %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, storeSize); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, IntRegIndex rn, RegIndex vd, bool single, bool up, @@ -169,17 +810,25 @@ MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, bool tempUp = up; for (int j = 0; j < count; j++) { if (load) { - microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, - tempUp, addr); - if (!single) - microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, tempUp, - addr + (up ? 4 : -4)); + if (single) { + microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, + tempUp, addr); + } else { + microOps[i++] = new MicroLdrDBFpUop(machInst, vd++, rn, + tempUp, addr); + microOps[i++] = new MicroLdrDTFpUop(machInst, vd++, rn, tempUp, + addr + (up ? 4 : -4)); + } } else { - microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, - tempUp, addr); - if (!single) - microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, tempUp, - addr + (up ? 
4 : -4)); + if (single) { + microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, + tempUp, addr); + } else { + microOps[i++] = new MicroStrDBFpUop(machInst, vd++, rn, + tempUp, addr); + microOps[i++] = new MicroStrDTFpUop(machInst, vd++, rn, tempUp, + addr + (up ? 4 : -4)); + } } if (!tempUp) { addr -= (single ? 4 : 8); @@ -216,7 +865,7 @@ MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, } std::string -MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +MicroIntImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); @@ -228,6 +877,19 @@ MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const return ss.str(); } +std::string +MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss); + printReg(ss, ura); + ss << ", "; + printReg(ss, urb); + ss << ", "; + printReg(ss, urc); + return ss.str(); +} + std::string MicroMemOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { diff --git a/src/arch/arm/insts/macromem.hh b/src/arch/arm/insts/macromem.hh index 003f5a3fd..923e9c0a1 100644 --- a/src/arch/arm/insts/macromem.hh +++ b/src/arch/arm/insts/macromem.hh @@ -79,17 +79,67 @@ class MicroOp : public PredOp } }; +/** + * Microops for Neon loads/stores + */ +class MicroNeonMemOp : public MicroOp +{ + protected: + RegIndex dest, ura; + uint32_t imm; + unsigned memAccessFlags; + + MicroNeonMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + RegIndex _dest, RegIndex _ura, uint32_t _imm) + : MicroOp(mnem, machInst, __opClass), + dest(_dest), ura(_ura), imm(_imm), + memAccessFlags(TLB::MustBeOne) + { + } +}; + +/** + * Microops for Neon load/store (de)interleaving + */ +class MicroNeonMixOp : public MicroOp +{ + protected: + RegIndex dest, op1; + uint32_t step; + + MicroNeonMixOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + RegIndex _dest, RegIndex _op1, 
uint32_t _step) + : MicroOp(mnem, machInst, __opClass), + dest(_dest), op1(_op1), step(_step) + { + } +}; + +class MicroNeonMixLaneOp : public MicroNeonMixOp +{ + protected: + unsigned lane; + + MicroNeonMixLaneOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, RegIndex _dest, RegIndex _op1, + uint32_t _step, unsigned _lane) + : MicroNeonMixOp(mnem, machInst, __opClass, _dest, _op1, _step), + lane(_lane) + { + } +}; + /** * Microops of the form IntRegA = IntRegB op Imm */ -class MicroIntOp : public MicroOp +class MicroIntImmOp : public MicroOp { protected: RegIndex ura, urb; uint8_t imm; - MicroIntOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, - RegIndex _ura, RegIndex _urb, uint8_t _imm) + MicroIntImmOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + RegIndex _ura, RegIndex _urb, uint8_t _imm) : MicroOp(mnem, machInst, __opClass), ura(_ura), urb(_urb), imm(_imm) { @@ -98,10 +148,28 @@ class MicroIntOp : public MicroOp std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const; }; +/** + * Microops of the form IntRegA = IntRegB op IntRegC + */ +class MicroIntOp : public MicroOp +{ + protected: + RegIndex ura, urb, urc; + + MicroIntOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + RegIndex _ura, RegIndex _urb, RegIndex _urc) + : MicroOp(mnem, machInst, __opClass), + ura(_ura), urb(_urb), urc(_urc) + { + } + + std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const; +}; + /** * Memory microops which use IntReg + Imm addressing */ -class MicroMemOp : public MicroIntOp +class MicroMemOp : public MicroIntImmOp { protected: bool up; @@ -109,7 +177,7 @@ class MicroMemOp : public MicroIntOp MicroMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, RegIndex _ura, RegIndex _urb, bool _up, uint8_t _imm) - : MicroIntOp(mnem, machInst, __opClass, _ura, _urb, _imm), + : MicroIntImmOp(mnem, machInst, __opClass, _ura, _urb, _imm), up(_up), memAccessFlags(TLB::MustBeOne | 
TLB::AlignWord) { } @@ -128,6 +196,46 @@ class MacroMemOp : public PredMacroOp bool writeback, bool load, uint32_t reglist); }; +/** + * Base classes for microcoded integer memory instructions. + */ +class VldMultOp : public PredMacroOp +{ + protected: + VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm); +}; + +class VldSingleOp : public PredMacroOp +{ + protected: + VldSingleOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + bool all, unsigned elems, RegIndex rn, RegIndex vd, + unsigned regs, unsigned inc, uint32_t size, + uint32_t align, RegIndex rm, unsigned lane); +}; + +/** + * Base class for microcoded integer memory instructions. + */ +class VstMultOp : public PredMacroOp +{ + protected: + VstMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned width, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm); +}; + +class VstSingleOp : public PredMacroOp +{ + protected: + VstSingleOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + bool all, unsigned elems, RegIndex rn, RegIndex vd, + unsigned regs, unsigned inc, uint32_t size, + uint32_t align, RegIndex rm, unsigned lane); +}; + /** * Base class for microcoded floating point memory instructions. 
*/ diff --git a/src/arch/arm/insts/pred_inst.hh b/src/arch/arm/insts/pred_inst.hh index 2cb383ad3..b7d4c4709 100644 --- a/src/arch/arm/insts/pred_inst.hh +++ b/src/arch/arm/insts/pred_inst.hh @@ -118,24 +118,26 @@ simd_modified_imm(bool op, uint8_t cmode, uint8_t data) break; case 0xe: if (op) { - bigData = (bigData << 0) | (bigData << 8) | - (bigData << 16) | (bigData << 24) | - (bigData << 32) | (bigData << 40) | - (bigData << 48) | (bigData << 56); - } else { bigData = 0; for (int i = 7; i >= 0; i--) { if (bits(data, i)) { - bigData |= (0xFF << (i * 8)); + bigData |= (ULL(0xFF) << (i * 8)); } } + } else { + bigData = (bigData << 0) | (bigData << 8) | + (bigData << 16) | (bigData << 24) | + (bigData << 32) | (bigData << 40) | + (bigData << 48) | (bigData << 56); } + break; case 0xf: if (!op) { uint64_t bVal = bits(bigData, 6) ? (0x1F) : (0x20); bigData = (bits(bigData, 5, 0) << 19) | (bVal << 25) | (bits(bigData, 7) << 31); bigData |= (bigData << 32); + break; } // Fall through default: diff --git a/src/arch/arm/insts/static_inst.hh b/src/arch/arm/insts/static_inst.hh index 5a1993b86..e98f85a3b 100644 --- a/src/arch/arm/insts/static_inst.hh +++ b/src/arch/arm/insts/static_inst.hh @@ -251,6 +251,28 @@ class ArmStaticInst : public StaticInst } } + template + static inline T + cSwap(T val, bool big) + { + const unsigned count = sizeof(T) / sizeof(E); + union { + T tVal; + E eVals[count]; + } conv; + conv.tVal = htog(val); + if (big) { + for (unsigned i = 0; i < count; i++) { + conv.eVals[i] = gtobe(conv.eVals[i]); + } + } else { + for (unsigned i = 0; i < count; i++) { + conv.eVals[i] = gtole(conv.eVals[i]); + } + } + return gtoh(conv.tVal); + } + // Perform an interworking branch. 
template static inline void diff --git a/src/arch/arm/insts/vfp.cc b/src/arch/arm/insts/vfp.cc index 1968a59a9..f689204d9 100644 --- a/src/arch/arm/insts/vfp.cc +++ b/src/arch/arm/insts/vfp.cc @@ -91,6 +91,20 @@ FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const return ss.str(); } +std::string +FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss); + printReg(ss, dest + FP_Base_DepTag); + ss << ", "; + printReg(ss, op1 + FP_Base_DepTag); + ss << ", "; + printReg(ss, op2 + FP_Base_DepTag); + ccprintf(ss, ", #%d", imm); + return ss.str(); +} + namespace ArmISA { @@ -117,7 +131,7 @@ prepFpState(uint32_t rMode) } void -finishVfp(FPSCR &fpscr, VfpSavedState state) +finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush) { int exceptions = fetestexcept(FeAllExceptions); bool underflow = false; @@ -134,7 +148,7 @@ finishVfp(FPSCR &fpscr, VfpSavedState state) underflow = true; fpscr.ufc = 1; } - if ((exceptions & FeInexact) && !(underflow && fpscr.fz)) { + if ((exceptions & FeInexact) && !(underflow && flush)) { fpscr.ixc = 1; } fesetround(state); @@ -142,7 +156,7 @@ finishVfp(FPSCR &fpscr, VfpSavedState state) template fpType -fixDest(FPSCR fpscr, fpType val, fpType op1) +fixDest(bool flush, bool defaultNan, fpType val, fpType op1) { int fpClass = std::fpclassify(val); fpType junk = 0.0; @@ -150,12 +164,12 @@ fixDest(FPSCR fpscr, fpType val, fpType op1) const bool single = (sizeof(val) == sizeof(float)); const uint64_t qnan = single ? 
0x7fc00000 : ULL(0x7ff8000000000000); const bool nan = std::isnan(op1); - if (!nan || (fpscr.dn == 1)) { + if (!nan || defaultNan) { val = bitsToFp(qnan, junk); } else if (nan) { val = bitsToFp(fpToBits(op1) | qnan, junk); } - } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { + } else if (fpClass == FP_SUBNORMAL && flush == 1) { // Turn val into a zero with the correct sign; uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); val = bitsToFp(fpToBits(val) & bitMask, junk); @@ -166,13 +180,13 @@ fixDest(FPSCR fpscr, fpType val, fpType op1) } template -float fixDest(FPSCR fpscr, float val, float op1); +float fixDest(bool flush, bool defaultNan, float val, float op1); template -double fixDest(FPSCR fpscr, double val, double op1); +double fixDest(bool flush, bool defaultNan, double val, double op1); template fpType -fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) +fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) { int fpClass = std::fpclassify(val); fpType junk = 0.0; @@ -183,7 +197,7 @@ fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) const bool nan2 = std::isnan(op2); const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); - if ((!nan1 && !nan2) || (fpscr.dn == 1)) { + if ((!nan1 && !nan2) || defaultNan) { val = bitsToFp(qnan, junk); } else if (signal1) { val = bitsToFp(fpToBits(op1) | qnan, junk); @@ -194,7 +208,7 @@ fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) } else if (nan2) { val = op2; } - } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { + } else if (fpClass == FP_SUBNORMAL && flush) { // Turn val into a zero with the correct sign; uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); val = bitsToFp(fpToBits(val) & bitMask, junk); @@ -205,15 +219,17 @@ fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) } template -float fixDest(FPSCR fpscr, float val, float op1, float op2); +float fixDest(bool flush, bool defaultNan, 
+ float val, float op1, float op2); template -double fixDest(FPSCR fpscr, double val, double op1, double op2); +double fixDest(bool flush, bool defaultNan, + double val, double op1, double op2); template fpType -fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) +fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) { - fpType mid = fixDest(fpscr, val, op1, op2); + fpType mid = fixDest(flush, defaultNan, val, op1, op2); const bool single = (sizeof(fpType) == sizeof(float)); const fpType junk = 0.0; if ((single && (val == bitsToFp(0x00800000, junk) || @@ -228,7 +244,7 @@ fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) temp = op1 / op2; if (flushToZero(temp)) { feraiseexcept(FeUnderflow); - if (fpscr.fz) { + if (flush) { feclearexcept(FeInexact); mid = temp; } @@ -239,9 +255,11 @@ fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) } template -float fixDivDest(FPSCR fpscr, float val, float op1, float op2); +float fixDivDest(bool flush, bool defaultNan, + float val, float op1, float op2); template -double fixDivDest(FPSCR fpscr, double val, double op1, double op2); +double fixDivDest(bool flush, bool defaultNan, + double val, double op1, double op2); float fixFpDFpSDest(FPSCR fpscr, double val) @@ -255,7 +273,7 @@ fixFpDFpSDest(FPSCR fpscr, double val) (bits(valBits, 63) << 31); op1 = bitsToFp(op1Bits, junk); } - float mid = fixDest(fpscr, (float)val, op1); + float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1); if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) == (FeUnderflow | FeInexact)) { feclearexcept(FeInexact); @@ -291,7 +309,7 @@ fixFpSFpDDest(FPSCR fpscr, float val) ((uint64_t)bits(valBits, 31) << 63); op1 = bitsToFp(op1Bits, junk); } - double mid = fixDest(fpscr, (double)val, op1); + double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1); if (mid == bitsToFp(ULL(0x0010000000000000), junk) || mid == bitsToFp(ULL(0x8010000000000000), junk)) { __asm__ __volatile__("" : "=m" (val) : "m" (val)); @@ 
-311,11 +329,10 @@ fixFpSFpDDest(FPSCR fpscr, float val) return mid; } -float -vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) +uint16_t +vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, + uint32_t rMode, bool ahp, float op) { - float junk = 0.0; - uint32_t destBits = fpToBits(dest); uint32_t opBits = fpToBits(op); // Extract the operand. bool neg = bits(opBits, 31); @@ -331,11 +348,11 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) // Signalling nan. fpscr.ioc = 1; } - if (fpscr.ahp) { + if (ahp) { mantissa = 0; exponent = 0; fpscr.ioc = 1; - } else if (fpscr.dn) { + } else if (defaultNan) { mantissa = (1 << 9); exponent = 0x1f; neg = false; @@ -346,7 +363,7 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) } else { // Infinities. exponent = 0x1F; - if (fpscr.ahp) { + if (ahp) { fpscr.ioc = 1; mantissa = 0x3ff; } else { @@ -364,14 +381,14 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) // Denormalized. // If flush to zero is on, this shouldn't happen. - assert(fpscr.fz == 0); + assert(!flush); // Check for underflow if (inexact || fpscr.ufe) fpscr.ufc = 1; // Handle rounding. - unsigned mode = fpscr.rMode; + unsigned mode = rMode; if ((mode == VfpRoundUpward && !neg && extra) || (mode == VfpRoundDown && neg && extra) || (mode == VfpRoundNearest && @@ -416,7 +433,7 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) } // Handle rounding. 
- unsigned mode = fpscr.rMode; + unsigned mode = rMode; bool nonZero = topOne || !restZeros; if ((mode == VfpRoundUpward && !neg && nonZero) || (mode == VfpRoundDown && neg && nonZero) || @@ -432,7 +449,7 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) } // Deal with overflow - if (fpscr.ahp) { + if (ahp) { if (exponent >= 0x20) { exponent = 0x1f; mantissa = 0x3ff; @@ -468,27 +485,17 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) replaceBits(result, 14, 10, exponent); if (neg) result |= (1 << 15); - if (top) - replaceBits(destBits, 31, 16, result); - else - replaceBits(destBits, 15, 0, result); - return bitsToFp(destBits, junk); + return result; } float -vcvtFpHFpS(FPSCR &fpscr, float op, bool top) +vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) { float junk = 0.0; - uint32_t opBits = fpToBits(op); - // Extract the operand. - if (top) - opBits = bits(opBits, 31, 16); - else - opBits = bits(opBits, 15, 0); // Extract the bitfields. - bool neg = bits(opBits, 15); - uint32_t exponent = bits(opBits, 14, 10); - uint32_t mantissa = bits(opBits, 9, 0); + bool neg = bits(op, 15); + uint32_t exponent = bits(op, 14, 10); + uint32_t mantissa = bits(op, 9, 0); // Do the conversion. if (exponent == 0) { if (mantissa != 0) { @@ -500,7 +507,7 @@ vcvtFpHFpS(FPSCR &fpscr, float op, bool top) } } mantissa = mantissa << (23 - 10); - } else if (exponent == 0x1f && !fpscr.ahp) { + } else if (exponent == 0x1f && !ahp) { // Infinities and nans. 
exponent = 0xff; if (mantissa != 0) { @@ -511,7 +518,7 @@ vcvtFpHFpS(FPSCR &fpscr, float op, bool top) fpscr.ioc = 1; mantissa |= (1 << 22); } - if (fpscr.dn) { + if (defaultNan) { mantissa &= ~mask(22); neg = false; } @@ -624,7 +631,8 @@ vfpFpSToFixed(float val, bool isSigned, bool half, } float -vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) +vfpUFixedToFpS(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -633,11 +641,12 @@ vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (float)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); } float -vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm) +vfpSFixedToFpS(bool flush, bool defaultNan, + int32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -646,7 +655,7 @@ vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (float)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); } uint64_t @@ -743,7 +752,8 @@ vfpFpDToFixed(double val, bool isSigned, bool half, } double -vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) +vfpUFixedToFpD(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -752,11 +762,12 @@ vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (double)val, scale); + return 
fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
 }
 
 double
-vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm)
+vfpSFixedToFpD(bool flush, bool defaultNan,
+               int32_t val, bool half, uint8_t imm)
 {
     fesetround(FeRoundNearest);
     if (half)
@@ -765,14 +776,211 @@ vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm)
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
     feclearexcept(FeAllExceptions);
     __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
-    return fixDivDest(fpscr, val / scale, (double)val, scale);
+    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
+}
+
+// Reciprocal square root estimate: a magic formula taken from the
+// architecture reference manual, where it is called recip_sqrt_estimate.
+static double
+recipSqrtEstimate(double a)
+{
+    int64_t q0, q1, s;
+    double r;
+    if (a < 0.5) {
+        q0 = (int64_t)(a * 512.0);
+        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
+    } else {
+        q1 = (int64_t)(a * 256.0);
+        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
+    }
+    s = (int64_t)(256.0 * r + 0.5);
+    return (double)s / 256.0;  // always a multiple of 1/256
 }
 
+// Single precision reciprocal square root estimate. Only intended for use
+// in Neon instructions because it ignores certain bits in the FPSCR.
+float
+fprSqrtEstimate(FPSCR &fpscr, float op)
+{
+    const uint32_t qnan = 0x7fc00000;
+    float junk = 0.0;
+    int fpClass = std::fpclassify(op);
+    if (fpClass == FP_NAN) {
+        if ((fpToBits(op) & qnan) != qnan)  // NaN without the quiet bit set
+            fpscr.ioc = 1;
+        return bitsToFp(qnan, junk);
+    } else if (fpClass == FP_ZERO) {
+        fpscr.dzc = 1;
+        // Return infinity with the same sign as the operand.
+        return bitsToFp((std::signbit(op) << 31) |
+                        (0xFF << 23) | (0 << 0), junk);
+    } else if (std::signbit(op)) {
+        // Negative non-zero operand: set the invalid op bit.
+        fpscr.ioc = 1;
+        return bitsToFp(qnan, junk);
+    } else if (fpClass == FP_INFINITE) {
+        return 0.0;  // only +infinity reaches here
+    } else {
+        uint64_t opBits = fpToBits(op);
+        double scaled;
+        if (bits(opBits, 23)) {
+            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
+                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
+                              (double)0.0);
+        } else {
+            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
+                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
+                              (double)0.0);
+        }
+        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;
+
+        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));
+
+        return bitsToFp((bits(estimate, 63) << 31) |
+                        (bits(resultExp, 7, 0) << 23) |
+                        (bits(estimate, 51, 29) << 0), junk);
+    }
+}
+
+uint32_t
+unsignedRSqrtEstimate(uint32_t op)
+{
+    if (bits(op, 31, 30) == 0) {
+        return -1;  // all ones once converted to uint32_t
+    } else {
+        double dpOp;
+        if (bits(op, 31)) {
+            dpOp = bitsToFp((ULL(0) << 63) |
+                            (ULL(0x3fe) << 52) |
+                            (bits((uint64_t)op, 30, 0) << 21) |
+                            (0 << 0), (double)0.0);
+        } else {
+            dpOp = bitsToFp((ULL(0) << 63) |
+                            (ULL(0x3fd) << 52) |
+                            (bits((uint64_t)op, 29, 0) << 22) |
+                            (0 << 0), (double)0.0);
+        }
+        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
+        return (1 << 31) | bits(estimate, 51, 21);
+    }
+}
+
+// Reciprocal estimate: a magic formula taken from the architecture
+// reference manual, where it is called recip_estimate.
+
+static double
+recipEstimate(double a)
+{
+    int64_t q, s;
+    double r;
+    q = (int64_t)(a * 512.0);
+    r = 1.0 / (((double)q + 0.5) / 512.0);
+    s = (int64_t)(256.0 * r + 0.5);
+    return (double)s / 256.0;  // always a multiple of 1/256
+}
+
+// This function is only intended for use in Neon instructions because
+// it ignores certain bits in the FPSCR.
+float
+fpRecipEstimate(FPSCR &fpscr, float op)
+{
+    const uint32_t qnan = 0x7fc00000;
+    float junk = 0.0;
+    int fpClass = std::fpclassify(op);
+    if (fpClass == FP_NAN) {
+        if ((fpToBits(op) & qnan) != qnan)  // NaN without the quiet bit set
+            fpscr.ioc = 1;
+        return bitsToFp(qnan, junk);
+    } else if (fpClass == FP_INFINITE) {
+        return bitsToFp(std::signbit(op) << 31, junk);  // zero, operand's sign
+    } else if (fpClass == FP_ZERO) {
+        fpscr.dzc = 1;
+        // Return infinity with the same sign as the operand.
+        return bitsToFp((std::signbit(op) << 31) |
+                        (0xFF << 23) | (0 << 0), junk);
+    } else if (fabs(op) >= pow(2.0, 126)) {
+        fpscr.ufc = 1;  // flag underflow and return a zero with op's sign
+        return bitsToFp(std::signbit(op) << 31, junk);
+    } else {
+        uint64_t opBits = fpToBits(op);
+        double scaled;
+        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
+                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
+                          (double)0.0);
+        uint64_t resultExp = 253 - bits(opBits, 30, 23);
+
+        uint64_t estimate = fpToBits(recipEstimate(scaled));
+
+        return bitsToFp((bits(opBits, 31) << 31) |
+                        (bits(resultExp, 7, 0) << 23) |
+                        (bits(estimate, 51, 29) << 0), junk);
+    }
+}
+
+uint32_t
+unsignedRecipEstimate(uint32_t op)
+{
+    if (bits(op, 31) == 0) {
+        return -1;  // all ones once converted to uint32_t
+    } else {
+        double dpOp;
+        dpOp = bitsToFp((ULL(0) << 63) |
+                        (ULL(0x3fe) << 52) |
+                        (bits((uint64_t)op, 30, 0) << 21) |
+                        (0 << 0), (double)0.0);
+        uint64_t estimate = fpToBits(recipEstimate(dpOp));
+        return (1 << 31) | bits(estimate, 51, 21);
+    }
+}
+
+template <class fpType>
+fpType
+FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
+                  fpType op1, fpType op2) const
+{
+    done = true;  // reset to false below if neither operand is a NaN
+    fpType junk = 0.0;
+    fpType dest = 0.0;
+    const bool single = (sizeof(fpType) == sizeof(float));
+    const uint64_t qnan =
+        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
+    const bool nan1 = std::isnan(op1);
+    const bool nan2 = std::isnan(op2);
+    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
+    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
+    if (nan1 || nan2) {
+        if (defaultNan) {
+            dest = bitsToFp(qnan, junk);
+        } else if (signal1) {
+            dest = bitsToFp(fpToBits(op1) | qnan, junk);  // op1 with qnan bits set
+        } else if (signal2) {
+            dest = bitsToFp(fpToBits(op2) | qnan, junk);  // op2 with qnan bits set
+        } else if (nan1) {
+            dest = op1;
+        } else if (nan2) {
+            dest = op2;
+        }
+        if (signal1 || signal2) {
+            fpscr.ioc = 1;
+        }
+    } else {
+        done = false;
+    }
+    return dest;
+}
+
+template
+float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
+                        float op1, float op2) const;
+template
+double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
+                         double op1, double op2) const;
+
 template <class fpType>
 fpType
 FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
                fpType (*func)(fpType, fpType),
-               bool flush, uint32_t rMode) const
+               bool flush, bool defaultNan, uint32_t rMode) const
 {
     const bool single = (sizeof(fpType) == sizeof(float));
     fpType junk = 0.0;
@@ -795,7 +1003,7 @@ FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
         const bool nan2 = std::isnan(op2);
         const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
         const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
-        if ((!nan1 && !nan2) || (fpscr.dn == 1)) {
+        if ((!nan1 && !nan2) || (defaultNan == 1)) {
             dest = bitsToFp(qnan, junk);
         } else if (signal1) {
             dest = bitsToFp(fpToBits(op1) | qnan, junk);
@@ -828,18 +1036,18 @@ FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
             dest = temp;
         }
     }
-    finishVfp(fpscr, state);
+    finishVfp(fpscr, state, flush);
     return dest;
 }
 
 template
 float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                      float (*func)(float, float),
-                     bool flush, uint32_t rMode) const;
+                     bool flush, bool defaultNan, uint32_t rMode) const;
 template
 double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                       double (*func)(double, double),
-                      bool flush, uint32_t rMode) const;
+                      bool flush, bool defaultNan, uint32_t rMode) const;
 
 template <class fpType>
 fpType
@@ -890,7 +1098,7 @@ FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
             dest = temp;
         }
     }
-    finishVfp(fpscr, state);
+    finishVfp(fpscr, state, flush);
     return dest;
 }
 
diff --git a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh
index 57636bbfc..964b62673 100644
--- a/src/arch/arm/insts/vfp.hh
+++ b/src/arch/arm/insts/vfp.hh
@@ -192,10 +192,20 @@ bitsToFp(uint64_t bits, double junk)
     return val.fp;
 }
 
+template <class fpType>
+static bool
+isSnan(fpType val)  // true if val is a NaN lacking the full qnan bit pattern
+{
+    const bool single = (sizeof(fpType) == sizeof(float));
+    const uint64_t qnan =
+        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
+    return std::isnan(val) && ((fpToBits(val) & qnan) != qnan);
+}
+
 typedef int VfpSavedState;
 
 VfpSavedState prepFpState(uint32_t rMode);
-void finishVfp(FPSCR &fpscr, VfpSavedState state);
+void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush);
 
 template <class fpType>
 fpType fixDest(FPSCR fpscr, fpType val, fpType op1);
@@ -209,8 +219,9 @@ fpType fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2);
 float fixFpDFpSDest(FPSCR fpscr, double val);
 double fixFpSFpDDest(FPSCR fpscr, float val);
 
-float vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top);
-float vcvtFpHFpS(FPSCR &fpscr, float op, bool top);
+uint16_t vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
+                    uint32_t rMode, bool ahp, float op);
+float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
 
 static inline double
 makeDouble(uint32_t low, uint32_t high)
@@ -233,13 +244,23 @@ highFromDouble(double val)
 
 uint64_t vfpFpSToFixed(float val, bool isSigned, bool half,
                        uint8_t imm, bool rzero = true);
-float vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm);
-float vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm);
+float vfpUFixedToFpS(bool flush, bool defaultNan,
+                     uint32_t val, bool half, uint8_t imm);
+float vfpSFixedToFpS(bool flush, bool defaultNan,
+                     int32_t val,
bool half, uint8_t imm);
 
 uint64_t vfpFpDToFixed(double val, bool isSigned, bool half,
                        uint8_t imm, bool rzero = true);
-double vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm);
-double vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm);
+double vfpUFixedToFpD(bool flush, bool defaultNan,
+                      uint32_t val, bool half, uint8_t imm);
+double vfpSFixedToFpD(bool flush, bool defaultNan,
+                      int32_t val, bool half, uint8_t imm);
+
+float fprSqrtEstimate(FPSCR &fpscr, float op);
+uint32_t unsignedRSqrtEstimate(uint32_t op);
+
+float fpRecipEstimate(FPSCR &fpscr, float op);
+uint32_t unsignedRecipEstimate(uint32_t op);
 
 class VfpMacroOp : public PredMacroOp
 {
@@ -312,6 +333,66 @@ fpMulD(double a, double b)
     return a * b;
 }
 
+static inline float
+fpMaxS(float a, float b)
+{
+    // Comparing +0 and -0: a non-negative a wins over a negative b.
+    if (!std::signbit(a) && std::signbit(b))
+        return a;
+    return fmaxf(a, b);
+}
+
+static inline float
+fpMinS(float a, float b)
+{
+    // Comparing +0 and -0: a negative a wins over a non-negative b.
+    if (std::signbit(a) && !std::signbit(b))
+        return a;
+    return fminf(a, b);
+}
+
+static inline float
+fpRSqrtsS(float a, float b)
+{
+    int fpClassA = std::fpclassify(a);
+    int fpClassB = std::fpclassify(b);
+    float aXb;
+    int fpClassAxB;
+
+    if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) ||
+        (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) {
+        return 1.5;  // one operand zero, the other infinite
+    }
+    aXb = a*b;
+    fpClassAxB = std::fpclassify(aXb);
+    if(fpClassAxB == FP_SUBNORMAL) {
+       feraiseexcept(FeUnderflow);  // subnormal product
+       return 1.5;
+    }
+    return (3.0 - (a * b)) / 2.0;
+}
+
+static inline float
+fpRecpsS(float a, float b)
+{
+    int fpClassA = std::fpclassify(a);
+    int fpClassB = std::fpclassify(b);
+    float aXb;
+    int fpClassAxB;
+
+    if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) ||
+        (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) {
+        return 2.0;  // one operand zero, the other infinite
+    }
+    aXb = a*b;
+    fpClassAxB = std::fpclassify(aXb);
+    if(fpClassAxB == FP_SUBNORMAL) {
+       feraiseexcept(FeUnderflow);  // subnormal product
+       return 2.0;
+    }
+    return 2.0 - (a * b);
+}
+
 class FpOp : public PredOp
 {
   protected:
@@ -362,11 +443,16 @@ class FpOp : public PredOp
         return fpToBits(val) >> 32;
     }
 
+    template <class fpType>
+    fpType
+    processNans(FPSCR &fpscr, bool &done, bool defaultNan,
+                fpType op1, fpType op2) const;
+
     template <class fpType>
     fpType
     binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
              fpType (*func)(fpType, fpType),
-             bool flush, uint32_t rMode) const;
+             bool flush, bool defaultNan, uint32_t rMode) const;
 
     template <class fpType>
     fpType
@@ -445,6 +531,27 @@ class FpRegRegRegOp : public FpOp
     std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
 };
 
+class FpRegRegRegImmOp : public FpOp
+{
+  protected:
+    IntRegIndex dest;
+    IntRegIndex op1;
+    IntRegIndex op2;
+    uint64_t imm;
+
+    FpRegRegRegImmOp(const char *mnem, ExtMachInst _machInst,
+                     OpClass __opClass, IntRegIndex _dest,
+                     IntRegIndex _op1, IntRegIndex _op2,
+                     uint64_t _imm, VfpMicroMode mode = VfpNotAMicroop) :
+        FpOp(mnem, _machInst, __opClass),
+        dest(_dest), op1(_op1), op2(_op2), imm(_imm)
+    {
+        setVfpMicroFlags(mode, flags);
+    }
+
+    std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
 }
 
 #endif //__ARCH_ARM_INSTS_VFP_HH__
diff --git a/src/arch/arm/isa/decoder/thumb.isa b/src/arch/arm/isa/decoder/thumb.isa
index 65ea7e30c..d0f5b8646 100644
--- a/src/arch/arm/isa/decoder/thumb.isa
+++ b/src/arch/arm/isa/decoder/thumb.isa
@@ -88,7 +88,7 @@ decode BIGTHUMB {
                         0xf: McrMrc15::mcrMrc15();
                     }
                 }
-                0x3: WarnUnimpl::Advanced_SIMD();
+                0x3: ThumbNeonData::ThumbNeonData();
                 default: decode LTCOPROC {
                     0xa, 0xb: ExtensionRegLoadStore::extensionRegLoadStre();
                     0xf: decode HTOPCODE_9_4 {
diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa
index 0a5f77e6e..1482c2119 100644
--- a/src/arch/arm/isa/formats/fp.isa
+++ b/src/arch/arm/isa/formats/fp.isa
@@ -45,6 +45,52 @@
 // Floating Point operate instructions
 //
 
+output header {{
+
+    template