From 6368edb281f162e4fbb0a91744992a25134135f4 Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Wed, 25 Aug 2010 19:10:42 -0500 Subject: ARM: Implement all ARM SIMD instructions. --- src/arch/arm/insts/macromem.cc | 684 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 673 insertions(+), 11 deletions(-) (limited to 'src/arch/arm/insts/macromem.cc') diff --git a/src/arch/arm/insts/macromem.cc b/src/arch/arm/insts/macromem.cc index 2a2412912..5602231f9 100644 --- a/src/arch/arm/insts/macromem.cc +++ b/src/arch/arm/insts/macromem.cc @@ -137,6 +137,647 @@ MacroMemOp::MacroMemOp(const char *mnem, ExtMachInst machInst, } } +VldMultOp::VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + numMicroops = (regs > 2) ? 2 : 1; + bool wb = (rm != 15); + bool deinterleave = (elems > 1); + + if (wb) numMicroops++; + if (deinterleave) numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex rMid = deinterleave ? NumFloatArchRegs : vd * 2; + + uint32_t noAlign = TLB::MustBeOne; + + unsigned uopIdx = 0; + switch (regs) { + case 4: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 3: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 2: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + break; + case 1: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + break; + default: + panic("Unrecognized number of registers %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, regs * 8); + } + } + if (deinterleave) { + switch (elems) { + case 4: + assert(regs == 4); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2, rMid, inc * 2); + break; + case 3: + assert(regs == 3); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2, rMid, inc * 2); + break; + case 2: + assert(regs == 4 || regs == 2); + if (regs == 4) { + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2, rMid, inc * 2); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2 + 2, rMid + 4, inc * 2); + } else { + microOps[uopIdx++] = newNeonMixInst( + size, machInst, vd * 2, rMid, inc * 2); + } + break; + default: + panic("Bad number of elements to deinterleave %d.\n", elems); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VldSingleOp::VldSingleOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, bool all, unsigned elems, + RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, + RegIndex rm, unsigned lane) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + unsigned eBytes = (1 << size); + unsigned loadSize = eBytes * elems; + unsigned loadRegs M5_VAR_USED = (loadSize + sizeof(FloatRegBits) - 1) / + sizeof(FloatRegBits); + + assert(loadRegs > 0 && loadRegs <= 4); + + numMicroops = 1; + bool wb = (rm != 15); + + if (wb) numMicroops++; + numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex ufp0 = NumFloatArchRegs; + + unsigned uopIdx = 0; + switch (loadSize) { + case 1: + microOps[uopIdx++] = new MicroLdrNeon1Uop( + machInst, ufp0, rn, 0, align); + break; + case 2: + if (eBytes == 2) { + microOps[uopIdx++] = new MicroLdrNeon2Uop( + machInst, ufp0, rn, 0, align); + } else { + microOps[uopIdx++] = new MicroLdrNeon2Uop( + machInst, ufp0, rn, 0, align); + } + break; + case 3: + microOps[uopIdx++] = new MicroLdrNeon3Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + switch (eBytes) { + case 1: + microOps[uopIdx++] = new MicroLdrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + case 2: + microOps[uopIdx++] = new MicroLdrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroLdrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 6: + microOps[uopIdx++] = new MicroLdrNeon6Uop( + machInst, ufp0, rn, 0, align); + break; + case 8: + switch (eBytes) { + case 2: + microOps[uopIdx++] = new MicroLdrNeon8Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroLdrNeon8Uop( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 12: + microOps[uopIdx++] = new MicroLdrNeon12Uop( + machInst, ufp0, rn, 0, align); + break; + case 16: + microOps[uopIdx++] = new MicroLdrNeon16Uop( + machInst, ufp0, rn, 0, align); + break; + default: + panic("Unrecognized load size %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, loadSize); + } + } + switch (elems) { + case 4: + assert(regs == 4); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to8Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to8Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to8Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to8Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon4to8Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon4to8Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 3: + assert(regs == 3); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to6Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to6Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to6Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to6Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon4to6Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon4to6Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 2: + assert(regs == 2); + assert(loadRegs <= 2); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 1: + assert(regs == 1 || (all && regs == 2)); + assert(loadRegs <= 2); + for (unsigned offset = 0; offset < regs; offset++) { + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + } + break; + default: + panic("Bad number of elements to unpack %d.\n", elems); + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VstMultOp::VstMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + numMicroops = (regs > 2) ? 2 : 1; + bool wb = (rm != 15); + bool interleave = (elems > 1); + + if (wb) numMicroops++; + if (interleave) numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + uint32_t noAlign = TLB::MustBeOne; + + RegIndex rMid = interleave ? NumFloatArchRegs : vd * 2; + + unsigned uopIdx = 0; + if (interleave) { + switch (elems) { + case 4: + assert(regs == 4); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid, vd * 2, inc * 2); + break; + case 3: + assert(regs == 3); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid, vd * 2, inc * 2); + break; + case 2: + assert(regs == 4 || regs == 2); + if (regs == 4) { + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid, vd * 2, inc * 2); + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid + 4, vd * 2 + 2, inc * 2); + } else { + microOps[uopIdx++] = newNeonMixInst( + size, machInst, rMid, vd * 2, inc * 2); + } + break; + default: + panic("Bad number of elements to interleave %d.\n", elems); + } + } + switch (regs) { + case 4: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 3: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 2: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + break; + case 1: + microOps[uopIdx++] = newNeonMemInst( + size, machInst, rMid, rn, 0, align); + break; + default: + panic("Unrecognized number of registers %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, regs * 8); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VstSingleOp::VstSingleOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, bool all, unsigned elems, + RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, + RegIndex rm, unsigned lane) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(!all); + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + unsigned eBytes = (1 << size); + unsigned storeSize = eBytes * elems; + unsigned storeRegs M5_VAR_USED = (storeSize + sizeof(FloatRegBits) - 1) / + sizeof(FloatRegBits); + + assert(storeRegs > 0 && storeRegs <= 4); + + numMicroops = 1; + bool wb = (rm != 15); + + if (wb) numMicroops++; + numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex ufp0 = NumFloatArchRegs; + + unsigned uopIdx = 0; + switch (elems) { + case 4: + assert(regs == 4); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon8to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon8to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon8to4Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 3: + assert(regs == 3); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon6to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon6to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon6to4Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 2: + assert(regs == 2); + assert(storeRegs <= 2); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon4to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon4to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon4to2Uop( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 1: + assert(regs == 1 || (all && regs == 2)); + assert(storeRegs <= 2); + for (unsigned offset = 0; offset < regs; offset++) { + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon2to2Uop( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon2to2Uop( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon2to2Uop( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + } + break; + default: + panic("Bad number of elements to pack %d.\n", elems); + } + switch (storeSize) { + case 1: + microOps[uopIdx++] = new MicroStrNeon1Uop( + machInst, ufp0, rn, 0, align); + break; + case 2: + if (eBytes == 2) { + microOps[uopIdx++] = new MicroStrNeon2Uop( + machInst, ufp0, rn, 0, align); + } else { + microOps[uopIdx++] = new MicroStrNeon2Uop( + machInst, ufp0, rn, 0, align); + } + break; + case 3: + microOps[uopIdx++] = new MicroStrNeon3Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + switch (eBytes) { + case 1: + microOps[uopIdx++] = new MicroStrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + case 2: + microOps[uopIdx++] = new MicroStrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroStrNeon4Uop( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 6: + microOps[uopIdx++] = new MicroStrNeon6Uop( + machInst, ufp0, rn, 0, align); + break; + case 8: + switch (eBytes) { + case 2: + microOps[uopIdx++] = new MicroStrNeon8Uop( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroStrNeon8Uop( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 12: + microOps[uopIdx++] = new MicroStrNeon12Uop( + machInst, ufp0, rn, 0, align); + break; + case 16: + microOps[uopIdx++] = new MicroStrNeon16Uop( + machInst, ufp0, rn, 0, align); + break; + default: + panic("Unrecognized store size %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, storeSize); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, IntRegIndex rn, RegIndex vd, bool single, bool up, @@ -169,17 +810,25 @@ MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, bool tempUp = up; for (int j = 0; j < count; j++) { if (load) { - microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, - tempUp, addr); - if (!single) - microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, tempUp, - addr + (up ? 4 : -4)); + if (single) { + microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, + tempUp, addr); + } else { + microOps[i++] = new MicroLdrDBFpUop(machInst, vd++, rn, + tempUp, addr); + microOps[i++] = new MicroLdrDTFpUop(machInst, vd++, rn, tempUp, + addr + (up ? 4 : -4)); + } } else { - microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, - tempUp, addr); - if (!single) - microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, tempUp, - addr + (up ? 4 : -4)); + if (single) { + microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, + tempUp, addr); + } else { + microOps[i++] = new MicroStrDBFpUop(machInst, vd++, rn, + tempUp, addr); + microOps[i++] = new MicroStrDTFpUop(machInst, vd++, rn, tempUp, + addr + (up ? 4 : -4)); + } } if (!tempUp) { addr -= (single ? 4 : 8); @@ -216,7 +865,7 @@ MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, } std::string -MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +MicroIntImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); @@ -228,6 +877,19 @@ MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const return ss.str(); } +std::string +MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss); + printReg(ss, ura); + ss << ", "; + printReg(ss, urb); + ss << ", "; + printReg(ss, urc); + return ss.str(); +} + std::string MicroMemOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { -- cgit v1.2.3