author | Gabe Black <gblack@eecs.umich.edu> | 2010-08-25 19:10:42 -0500
---|---|---
committer | Gabe Black <gblack@eecs.umich.edu> | 2010-08-25 19:10:42 -0500
commit | 6368edb281f162e4fbb0a91744992a25134135f4 (patch) |
tree | e84dfa7d10903e6c7a56e01cc6ca23f4b0d41908 /src/arch/arm/insts |
parent | f4f6b31df1a8787a12d71108eac24543bdf541e3 (diff) |
download | gem5-6368edb281f162e4fbb0a91744992a25134135f4.tar.xz |
ARM: Implement all ARM SIMD instructions.
Diffstat (limited to 'src/arch/arm/insts')
-rw-r--r-- | src/arch/arm/insts/macromem.cc | 684
-rw-r--r-- | src/arch/arm/insts/macromem.hh | 118
-rw-r--r-- | src/arch/arm/insts/pred_inst.hh | 14
-rw-r--r-- | src/arch/arm/insts/static_inst.hh | 22
-rw-r--r-- | src/arch/arm/insts/vfp.cc | 330
-rw-r--r-- | src/arch/arm/insts/vfp.hh | 123
6 files changed, 1200 insertions, 91 deletions
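The bulk of the macromem.cc changes below decompose NEON element/structure loads and stores (VLDn/VSTn) into micro-op sequences: one or two wide memory micro-ops, an optional base-register writeback, and interleave/deinterleave micro-ops when each structure holds more than one element. As a rough standalone sketch of the micro-op count arithmetic used by the new `VldMultOp`/`VstMultOp` constructors (illustrative names, not gem5 API):

```cpp
// Hypothetical helper mirroring the counting logic in the new
// VldMultOp / VstMultOp constructors; the name is illustrative only.
#include <cassert>
#include <cstdio>

unsigned neonMultMicroopCount(unsigned regs, unsigned elems, bool writeback)
{
    assert(regs > 0 && regs <= 4);
    assert(elems > 0 && regs % elems == 0);
    // Each memory micro-op moves up to two D registers (16 bytes), so more
    // than two registers need a second memory micro-op.
    unsigned count = (regs > 2) ? 2 : 1;
    if (writeback)
        count++;                 // MicroAddUop or MicroAddiUop updating rn
    if (elems > 1)
        count += regs / elems;   // interleave/deinterleave micro-ops
    return count;
}

int main()
{
    // VLD2 of four registers with writeback: 2 loads + 1 add + 2 deinterleaves.
    std::printf("%u\n", neonMultMicroopCount(4, 2, true));
    return 0;
}
```

For example, a VLD2 touching four registers with writeback (regs = 4, elems = 2) yields 2 + 1 + 2 = 5 micro-ops, matching the two MicroLdrNeon16Uop, one MicroAddUop/MicroAddiUop, and two MicroDeintNeon4Uop emitted by the constructor below.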
diff --git a/src/arch/arm/insts/macromem.cc b/src/arch/arm/insts/macromem.cc index 2a2412912..5602231f9 100644 --- a/src/arch/arm/insts/macromem.cc +++ b/src/arch/arm/insts/macromem.cc @@ -137,6 +137,647 @@ MacroMemOp::MacroMemOp(const char *mnem, ExtMachInst machInst, } } +VldMultOp::VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + numMicroops = (regs > 2) ? 2 : 1; + bool wb = (rm != 15); + bool deinterleave = (elems > 1); + + if (wb) numMicroops++; + if (deinterleave) numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex rMid = deinterleave ? NumFloatArchRegs : vd * 2; + + uint32_t noAlign = TLB::MustBeOne; + + unsigned uopIdx = 0; + switch (regs) { + case 4: + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon16Uop>( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 3: + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon8Uop>( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 2: + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + break; + case 1: + microOps[uopIdx++] = newNeonMemInst<MicroLdrNeon8Uop>( + size, machInst, rMid, rn, 0, align); + break; + default: + panic("Unrecognized number of registers %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, regs * 8); + } + } + if (deinterleave) { + switch (elems) { + case 4: + assert(regs == 4); + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon8Uop>( + size, machInst, vd * 2, rMid, inc * 2); + break; + case 3: + assert(regs == 3); + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon6Uop>( + size, machInst, vd * 2, rMid, inc * 2); + break; + case 2: + assert(regs == 4 || regs == 2); + if (regs == 4) { + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon4Uop>( + size, machInst, vd * 2, rMid, inc * 2); + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon4Uop>( + size, machInst, vd * 2 + 2, rMid + 4, inc * 2); + } else { + microOps[uopIdx++] = newNeonMixInst<MicroDeintNeon4Uop>( + size, machInst, vd * 2, rMid, inc * 2); + } + break; + default: + panic("Bad number of elements to deinterleave %d.\n", elems); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast<MicroOp *>(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VldSingleOp::VldSingleOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, bool all, unsigned elems, + RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, + RegIndex rm, unsigned lane) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + unsigned eBytes = (1 << size); + unsigned loadSize = eBytes * elems; + unsigned loadRegs M5_VAR_USED = (loadSize + sizeof(FloatRegBits) - 1) / + sizeof(FloatRegBits); + + assert(loadRegs > 0 && loadRegs <= 4); + + numMicroops = 1; + bool wb = (rm != 15); + + if (wb) 
numMicroops++; + numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex ufp0 = NumFloatArchRegs; + + unsigned uopIdx = 0; + switch (loadSize) { + case 1: + microOps[uopIdx++] = new MicroLdrNeon1Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 2: + if (eBytes == 2) { + microOps[uopIdx++] = new MicroLdrNeon2Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + } else { + microOps[uopIdx++] = new MicroLdrNeon2Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + } + break; + case 3: + microOps[uopIdx++] = new MicroLdrNeon3Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + switch (eBytes) { + case 1: + microOps[uopIdx++] = new MicroLdrNeon4Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 2: + microOps[uopIdx++] = new MicroLdrNeon4Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroLdrNeon4Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 6: + microOps[uopIdx++] = new MicroLdrNeon6Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 8: + switch (eBytes) { + case 2: + microOps[uopIdx++] = new MicroLdrNeon8Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroLdrNeon8Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 12: + microOps[uopIdx++] = new MicroLdrNeon12Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + case 16: + microOps[uopIdx++] = new MicroLdrNeon16Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + default: + panic("Unrecognized load size %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, loadSize); + } + } + switch (elems) { + case 4: + assert(regs == 4); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to8Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to8Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to8Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to8Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon4to8Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon4to8Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 3: + assert(regs == 3); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to6Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to6Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to6Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to6Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon4to6Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon4to6Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; 
+ } + break; + case 2: + assert(regs == 2); + assert(loadRegs <= 2); + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop<uint8_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop<uint16_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = new MicroUnpackAllNeon2to4Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = new MicroUnpackNeon2to4Uop<uint32_t>( + machInst, vd * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 1: + assert(regs == 1 || (all && regs == 2)); + assert(loadRegs <= 2); + for (unsigned offset = 0; offset < regs; offset++) { + switch (size) { + case 0: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop<uint8_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop<uint8_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + case 1: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop<uint16_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop<uint16_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + case 2: + if (all) { + microOps[uopIdx++] = + new MicroUnpackAllNeon2to2Uop<uint32_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2); + } else { + microOps[uopIdx++] = + new MicroUnpackNeon2to2Uop<uint32_t>( + machInst, (vd + offset) * 2, ufp0, inc * 2, lane); + } + break; + default: + panic("Bad size %d.\n", size); + break; + } + } + break; + default: + panic("Bad number of elements to unpack %d.\n", elems); + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast<MicroOp *>(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VstMultOp::VstMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + numMicroops = (regs > 2) ? 2 : 1; + bool wb = (rm != 15); + bool interleave = (elems > 1); + + if (wb) numMicroops++; + if (interleave) numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + uint32_t noAlign = TLB::MustBeOne; + + RegIndex rMid = interleave ? 
NumFloatArchRegs : vd * 2; + + unsigned uopIdx = 0; + if (interleave) { + switch (elems) { + case 4: + assert(regs == 4); + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon8Uop>( + size, machInst, rMid, vd * 2, inc * 2); + break; + case 3: + assert(regs == 3); + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon6Uop>( + size, machInst, rMid, vd * 2, inc * 2); + break; + case 2: + assert(regs == 4 || regs == 2); + if (regs == 4) { + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon4Uop>( + size, machInst, rMid, vd * 2, inc * 2); + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon4Uop>( + size, machInst, rMid + 4, vd * 2 + 2, inc * 2); + } else { + microOps[uopIdx++] = newNeonMixInst<MicroInterNeon4Uop>( + size, machInst, rMid, vd * 2, inc * 2); + } + break; + default: + panic("Bad number of elements to interleave %d.\n", elems); + } + } + switch (regs) { + case 4: + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon16Uop>( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 3: + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon8Uop>( + size, machInst, rMid + 4, rn, 16, noAlign); + break; + case 2: + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon16Uop>( + size, machInst, rMid, rn, 0, align); + break; + case 1: + microOps[uopIdx++] = newNeonMemInst<MicroStrNeon8Uop>( + size, machInst, rMid, rn, 0, align); + break; + default: + panic("Unrecognized number of registers %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, regs * 8); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast<MicroOp *>(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + +VstSingleOp::VstSingleOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, bool all, unsigned elems, + RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, + RegIndex rm, unsigned lane) : + PredMacroOp(mnem, machInst, __opClass) +{ + assert(!all); + assert(regs > 0 && regs <= 4); + assert(regs % elems == 0); + + unsigned eBytes = (1 << size); + unsigned storeSize = eBytes * elems; + unsigned storeRegs M5_VAR_USED = (storeSize + sizeof(FloatRegBits) - 1) / + sizeof(FloatRegBits); + + assert(storeRegs > 0 && storeRegs <= 4); + + numMicroops = 1; + bool wb = (rm != 15); + + if (wb) numMicroops++; + numMicroops += (regs / elems); + microOps = new StaticInstPtr[numMicroops]; + + RegIndex ufp0 = NumFloatArchRegs; + + unsigned uopIdx = 0; + switch (elems) { + case 4: + assert(regs == 4); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon8to2Uop<uint8_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon8to2Uop<uint16_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon8to4Uop<uint32_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 3: + assert(regs == 3); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon6to2Uop<uint8_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new 
MicroPackNeon6to2Uop<uint16_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon6to4Uop<uint32_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 2: + assert(regs == 2); + assert(storeRegs <= 2); + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon4to2Uop<uint8_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon4to2Uop<uint16_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon4to2Uop<uint32_t>( + machInst, ufp0, vd * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + break; + case 1: + assert(regs == 1 || (all && regs == 2)); + assert(storeRegs <= 2); + for (unsigned offset = 0; offset < regs; offset++) { + switch (size) { + case 0: + microOps[uopIdx++] = new MicroPackNeon2to2Uop<uint8_t>( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + case 1: + microOps[uopIdx++] = new MicroPackNeon2to2Uop<uint16_t>( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + case 2: + microOps[uopIdx++] = new MicroPackNeon2to2Uop<uint32_t>( + machInst, ufp0, (vd + offset) * 2, inc * 2, lane); + break; + default: + panic("Bad size %d.\n", size); + break; + } + } + break; + default: + panic("Bad number of elements to pack %d.\n", elems); + } + switch (storeSize) { + case 1: + microOps[uopIdx++] = new MicroStrNeon1Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 2: + if (eBytes == 2) { + microOps[uopIdx++] = new MicroStrNeon2Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + } else { + microOps[uopIdx++] = new MicroStrNeon2Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + } + break; + case 3: + microOps[uopIdx++] = new MicroStrNeon3Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + switch (eBytes) { + case 1: + microOps[uopIdx++] = new MicroStrNeon4Uop<uint8_t>( + machInst, ufp0, rn, 0, align); + break; + case 2: + microOps[uopIdx++] = new MicroStrNeon4Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroStrNeon4Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 6: + microOps[uopIdx++] = new MicroStrNeon6Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 8: + switch (eBytes) { + case 2: + microOps[uopIdx++] = new MicroStrNeon8Uop<uint16_t>( + machInst, ufp0, rn, 0, align); + break; + case 4: + microOps[uopIdx++] = new MicroStrNeon8Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + } + break; + case 12: + microOps[uopIdx++] = new MicroStrNeon12Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + case 16: + microOps[uopIdx++] = new MicroStrNeon16Uop<uint32_t>( + machInst, ufp0, rn, 0, align); + break; + default: + panic("Unrecognized store size %d.\n", regs); + } + if (wb) { + if (rm != 15 && rm != 13) { + microOps[uopIdx++] = + new MicroAddUop(machInst, rn, rn, rm); + } else { + microOps[uopIdx++] = + new MicroAddiUop(machInst, rn, rn, storeSize); + } + } + assert(uopIdx == numMicroops); + + for (unsigned i = 0; i < numMicroops - 1; i++) { + MicroOp * uopPtr = dynamic_cast<MicroOp *>(microOps[i].get()); + assert(uopPtr); + uopPtr->setDelayedCommit(); + } + microOps[numMicroops - 1]->setLastMicroop(); +} + MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, IntRegIndex rn, RegIndex vd, bool single, bool up, @@ -169,17 +810,25 @@ 
MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, bool tempUp = up; for (int j = 0; j < count; j++) { if (load) { - microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, - tempUp, addr); - if (!single) - microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, tempUp, - addr + (up ? 4 : -4)); + if (single) { + microOps[i++] = new MicroLdrFpUop(machInst, vd++, rn, + tempUp, addr); + } else { + microOps[i++] = new MicroLdrDBFpUop(machInst, vd++, rn, + tempUp, addr); + microOps[i++] = new MicroLdrDTFpUop(machInst, vd++, rn, tempUp, + addr + (up ? 4 : -4)); + } } else { - microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, - tempUp, addr); - if (!single) - microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, tempUp, - addr + (up ? 4 : -4)); + if (single) { + microOps[i++] = new MicroStrFpUop(machInst, vd++, rn, + tempUp, addr); + } else { + microOps[i++] = new MicroStrDBFpUop(machInst, vd++, rn, + tempUp, addr); + microOps[i++] = new MicroStrDTFpUop(machInst, vd++, rn, tempUp, + addr + (up ? 4 : -4)); + } } if (!tempUp) { addr -= (single ? 4 : 8); @@ -216,7 +865,7 @@ MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst, } std::string -MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +MicroIntImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); @@ -229,6 +878,19 @@ MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const } std::string +MicroIntOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss); + printReg(ss, ura); + ss << ", "; + printReg(ss, urb); + ss << ", "; + printReg(ss, urc); + return ss.str(); +} + +std::string MicroMemOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; diff --git a/src/arch/arm/insts/macromem.hh b/src/arch/arm/insts/macromem.hh index 003f5a3fd..923e9c0a1 100644 --- a/src/arch/arm/insts/macromem.hh +++ b/src/arch/arm/insts/macromem.hh @@ -80,16 +80,66 @@ class MicroOp : public PredOp }; /** + * Microops for Neon loads/stores + */ +class MicroNeonMemOp : public MicroOp +{ + protected: + RegIndex dest, ura; + uint32_t imm; + unsigned memAccessFlags; + + MicroNeonMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + RegIndex _dest, RegIndex _ura, uint32_t _imm) + : MicroOp(mnem, machInst, __opClass), + dest(_dest), ura(_ura), imm(_imm), + memAccessFlags(TLB::MustBeOne) + { + } +}; + +/** + * Microops for Neon load/store (de)interleaving + */ +class MicroNeonMixOp : public MicroOp +{ + protected: + RegIndex dest, op1; + uint32_t step; + + MicroNeonMixOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + RegIndex _dest, RegIndex _op1, uint32_t _step) + : MicroOp(mnem, machInst, __opClass), + dest(_dest), op1(_op1), step(_step) + { + } +}; + +class MicroNeonMixLaneOp : public MicroNeonMixOp +{ + protected: + unsigned lane; + + MicroNeonMixLaneOp(const char *mnem, ExtMachInst machInst, + OpClass __opClass, RegIndex _dest, RegIndex _op1, + uint32_t _step, unsigned _lane) + : MicroNeonMixOp(mnem, machInst, __opClass, _dest, _op1, _step), + lane(_lane) + { + } +}; + +/** * Microops of the form IntRegA = IntRegB op Imm */ -class MicroIntOp : public MicroOp +class MicroIntImmOp : public MicroOp { protected: RegIndex ura, urb; uint8_t imm; - MicroIntOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, - RegIndex _ura, RegIndex _urb, uint8_t _imm) + MicroIntImmOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, 
+ RegIndex _ura, RegIndex _urb, uint8_t _imm) : MicroOp(mnem, machInst, __opClass), ura(_ura), urb(_urb), imm(_imm) { @@ -99,9 +149,27 @@ class MicroIntOp : public MicroOp }; /** + * Microops of the form IntRegA = IntRegB op IntRegC + */ +class MicroIntOp : public MicroOp +{ + protected: + RegIndex ura, urb, urc; + + MicroIntOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + RegIndex _ura, RegIndex _urb, RegIndex _urc) + : MicroOp(mnem, machInst, __opClass), + ura(_ura), urb(_urb), urc(_urc) + { + } + + std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const; +}; + +/** * Memory microops which use IntReg + Imm addressing */ -class MicroMemOp : public MicroIntOp +class MicroMemOp : public MicroIntImmOp { protected: bool up; @@ -109,7 +177,7 @@ class MicroMemOp : public MicroIntOp MicroMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, RegIndex _ura, RegIndex _urb, bool _up, uint8_t _imm) - : MicroIntOp(mnem, machInst, __opClass, _ura, _urb, _imm), + : MicroIntImmOp(mnem, machInst, __opClass, _ura, _urb, _imm), up(_up), memAccessFlags(TLB::MustBeOne | TLB::AlignWord) { } @@ -129,6 +197,46 @@ class MacroMemOp : public PredMacroOp }; /** + * Base classes for microcoded integer memory instructions. + */ +class VldMultOp : public PredMacroOp +{ + protected: + VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned elems, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm); +}; + +class VldSingleOp : public PredMacroOp +{ + protected: + VldSingleOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + bool all, unsigned elems, RegIndex rn, RegIndex vd, + unsigned regs, unsigned inc, uint32_t size, + uint32_t align, RegIndex rm, unsigned lane); +}; + +/** + * Base class for microcoded integer memory instructions. + */ +class VstMultOp : public PredMacroOp +{ + protected: + VstMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + unsigned width, RegIndex rn, RegIndex vd, unsigned regs, + unsigned inc, uint32_t size, uint32_t align, RegIndex rm); +}; + +class VstSingleOp : public PredMacroOp +{ + protected: + VstSingleOp(const char *mnem, ExtMachInst machInst, OpClass __opClass, + bool all, unsigned elems, RegIndex rn, RegIndex vd, + unsigned regs, unsigned inc, uint32_t size, + uint32_t align, RegIndex rm, unsigned lane); +}; + +/** * Base class for microcoded floating point memory instructions. */ class MacroVFPMemOp : public PredMacroOp diff --git a/src/arch/arm/insts/pred_inst.hh b/src/arch/arm/insts/pred_inst.hh index 2cb383ad3..b7d4c4709 100644 --- a/src/arch/arm/insts/pred_inst.hh +++ b/src/arch/arm/insts/pred_inst.hh @@ -118,24 +118,26 @@ simd_modified_imm(bool op, uint8_t cmode, uint8_t data) break; case 0xe: if (op) { - bigData = (bigData << 0) | (bigData << 8) | - (bigData << 16) | (bigData << 24) | - (bigData << 32) | (bigData << 40) | - (bigData << 48) | (bigData << 56); - } else { bigData = 0; for (int i = 7; i >= 0; i--) { if (bits(data, i)) { - bigData |= (0xFF << (i * 8)); + bigData |= (ULL(0xFF) << (i * 8)); } } + } else { + bigData = (bigData << 0) | (bigData << 8) | + (bigData << 16) | (bigData << 24) | + (bigData << 32) | (bigData << 40) | + (bigData << 48) | (bigData << 56); } + break; case 0xf: if (!op) { uint64_t bVal = bits(bigData, 6) ? 
(0x1F) : (0x20); bigData = (bits(bigData, 5, 0) << 19) | (bVal << 25) | (bits(bigData, 7) << 31); bigData |= (bigData << 32); + break; } // Fall through default: diff --git a/src/arch/arm/insts/static_inst.hh b/src/arch/arm/insts/static_inst.hh index 5a1993b86..e98f85a3b 100644 --- a/src/arch/arm/insts/static_inst.hh +++ b/src/arch/arm/insts/static_inst.hh @@ -251,6 +251,28 @@ class ArmStaticInst : public StaticInst } } + template<class T, class E> + static inline T + cSwap(T val, bool big) + { + const unsigned count = sizeof(T) / sizeof(E); + union { + T tVal; + E eVals[count]; + } conv; + conv.tVal = htog(val); + if (big) { + for (unsigned i = 0; i < count; i++) { + conv.eVals[i] = gtobe(conv.eVals[i]); + } + } else { + for (unsigned i = 0; i < count; i++) { + conv.eVals[i] = gtole(conv.eVals[i]); + } + } + return gtoh(conv.tVal); + } + // Perform an interworking branch. template<class XC> static inline void diff --git a/src/arch/arm/insts/vfp.cc b/src/arch/arm/insts/vfp.cc index 1968a59a9..f689204d9 100644 --- a/src/arch/arm/insts/vfp.cc +++ b/src/arch/arm/insts/vfp.cc @@ -91,6 +91,20 @@ FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const return ss.str(); } +std::string +FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss); + printReg(ss, dest + FP_Base_DepTag); + ss << ", "; + printReg(ss, op1 + FP_Base_DepTag); + ss << ", "; + printReg(ss, op2 + FP_Base_DepTag); + ccprintf(ss, ", #%d", imm); + return ss.str(); +} + namespace ArmISA { @@ -117,7 +131,7 @@ prepFpState(uint32_t rMode) } void -finishVfp(FPSCR &fpscr, VfpSavedState state) +finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush) { int exceptions = fetestexcept(FeAllExceptions); bool underflow = false; @@ -134,7 +148,7 @@ finishVfp(FPSCR &fpscr, VfpSavedState state) underflow = true; fpscr.ufc = 1; } - if ((exceptions & FeInexact) && !(underflow && fpscr.fz)) { + if ((exceptions & FeInexact) && !(underflow && flush)) { fpscr.ixc = 1; } fesetround(state); @@ -142,7 +156,7 @@ finishVfp(FPSCR &fpscr, VfpSavedState state) template <class fpType> fpType -fixDest(FPSCR fpscr, fpType val, fpType op1) +fixDest(bool flush, bool defaultNan, fpType val, fpType op1) { int fpClass = std::fpclassify(val); fpType junk = 0.0; @@ -150,12 +164,12 @@ fixDest(FPSCR fpscr, fpType val, fpType op1) const bool single = (sizeof(val) == sizeof(float)); const uint64_t qnan = single ? 
0x7fc00000 : ULL(0x7ff8000000000000); const bool nan = std::isnan(op1); - if (!nan || (fpscr.dn == 1)) { + if (!nan || defaultNan) { val = bitsToFp(qnan, junk); } else if (nan) { val = bitsToFp(fpToBits(op1) | qnan, junk); } - } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { + } else if (fpClass == FP_SUBNORMAL && flush == 1) { // Turn val into a zero with the correct sign; uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); val = bitsToFp(fpToBits(val) & bitMask, junk); @@ -166,13 +180,13 @@ fixDest(FPSCR fpscr, fpType val, fpType op1) } template -float fixDest<float>(FPSCR fpscr, float val, float op1); +float fixDest<float>(bool flush, bool defaultNan, float val, float op1); template -double fixDest<double>(FPSCR fpscr, double val, double op1); +double fixDest<double>(bool flush, bool defaultNan, double val, double op1); template <class fpType> fpType -fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) +fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) { int fpClass = std::fpclassify(val); fpType junk = 0.0; @@ -183,7 +197,7 @@ fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) const bool nan2 = std::isnan(op2); const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); - if ((!nan1 && !nan2) || (fpscr.dn == 1)) { + if ((!nan1 && !nan2) || defaultNan) { val = bitsToFp(qnan, junk); } else if (signal1) { val = bitsToFp(fpToBits(op1) | qnan, junk); @@ -194,7 +208,7 @@ fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) } else if (nan2) { val = op2; } - } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { + } else if (fpClass == FP_SUBNORMAL && flush) { // Turn val into a zero with the correct sign; uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); val = bitsToFp(fpToBits(val) & bitMask, junk); @@ -205,15 +219,17 @@ fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) } template -float fixDest<float>(FPSCR fpscr, float val, float op1, float op2); +float fixDest<float>(bool flush, bool defaultNan, + float val, float op1, float op2); template -double fixDest<double>(FPSCR fpscr, double val, double op1, double op2); +double fixDest<double>(bool flush, bool defaultNan, + double val, double op1, double op2); template <class fpType> fpType -fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) +fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) { - fpType mid = fixDest(fpscr, val, op1, op2); + fpType mid = fixDest(flush, defaultNan, val, op1, op2); const bool single = (sizeof(fpType) == sizeof(float)); const fpType junk = 0.0; if ((single && (val == bitsToFp(0x00800000, junk) || @@ -228,7 +244,7 @@ fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) temp = op1 / op2; if (flushToZero(temp)) { feraiseexcept(FeUnderflow); - if (fpscr.fz) { + if (flush) { feclearexcept(FeInexact); mid = temp; } @@ -239,9 +255,11 @@ fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) } template -float fixDivDest<float>(FPSCR fpscr, float val, float op1, float op2); +float fixDivDest<float>(bool flush, bool defaultNan, + float val, float op1, float op2); template -double fixDivDest<double>(FPSCR fpscr, double val, double op1, double op2); +double fixDivDest<double>(bool flush, bool defaultNan, + double val, double op1, double op2); float fixFpDFpSDest(FPSCR fpscr, double val) @@ -255,7 +273,7 @@ fixFpDFpSDest(FPSCR fpscr, double val) (bits(valBits, 63) << 31); op1 = bitsToFp(op1Bits, junk); } - float mid = fixDest(fpscr, (float)val, op1); + 
float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1); if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) == (FeUnderflow | FeInexact)) { feclearexcept(FeInexact); @@ -291,7 +309,7 @@ fixFpSFpDDest(FPSCR fpscr, float val) ((uint64_t)bits(valBits, 31) << 63); op1 = bitsToFp(op1Bits, junk); } - double mid = fixDest(fpscr, (double)val, op1); + double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1); if (mid == bitsToFp(ULL(0x0010000000000000), junk) || mid == bitsToFp(ULL(0x8010000000000000), junk)) { __asm__ __volatile__("" : "=m" (val) : "m" (val)); @@ -311,11 +329,10 @@ fixFpSFpDDest(FPSCR fpscr, float val) return mid; } -float -vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) +uint16_t +vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, + uint32_t rMode, bool ahp, float op) { - float junk = 0.0; - uint32_t destBits = fpToBits(dest); uint32_t opBits = fpToBits(op); // Extract the operand. bool neg = bits(opBits, 31); @@ -331,11 +348,11 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) // Signalling nan. fpscr.ioc = 1; } - if (fpscr.ahp) { + if (ahp) { mantissa = 0; exponent = 0; fpscr.ioc = 1; - } else if (fpscr.dn) { + } else if (defaultNan) { mantissa = (1 << 9); exponent = 0x1f; neg = false; @@ -346,7 +363,7 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) } else { // Infinities. exponent = 0x1F; - if (fpscr.ahp) { + if (ahp) { fpscr.ioc = 1; mantissa = 0x3ff; } else { @@ -364,14 +381,14 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) // Denormalized. // If flush to zero is on, this shouldn't happen. - assert(fpscr.fz == 0); + assert(!flush); // Check for underflow if (inexact || fpscr.ufe) fpscr.ufc = 1; // Handle rounding. - unsigned mode = fpscr.rMode; + unsigned mode = rMode; if ((mode == VfpRoundUpward && !neg && extra) || (mode == VfpRoundDown && neg && extra) || (mode == VfpRoundNearest && @@ -416,7 +433,7 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) } // Handle rounding. - unsigned mode = fpscr.rMode; + unsigned mode = rMode; bool nonZero = topOne || !restZeros; if ((mode == VfpRoundUpward && !neg && nonZero) || (mode == VfpRoundDown && neg && nonZero) || @@ -432,7 +449,7 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) } // Deal with overflow - if (fpscr.ahp) { + if (ahp) { if (exponent >= 0x20) { exponent = 0x1f; mantissa = 0x3ff; @@ -468,27 +485,17 @@ vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) replaceBits(result, 14, 10, exponent); if (neg) result |= (1 << 15); - if (top) - replaceBits(destBits, 31, 16, result); - else - replaceBits(destBits, 15, 0, result); - return bitsToFp(destBits, junk); + return result; } float -vcvtFpHFpS(FPSCR &fpscr, float op, bool top) +vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) { float junk = 0.0; - uint32_t opBits = fpToBits(op); - // Extract the operand. - if (top) - opBits = bits(opBits, 31, 16); - else - opBits = bits(opBits, 15, 0); // Extract the bitfields. - bool neg = bits(opBits, 15); - uint32_t exponent = bits(opBits, 14, 10); - uint32_t mantissa = bits(opBits, 9, 0); + bool neg = bits(op, 15); + uint32_t exponent = bits(op, 14, 10); + uint32_t mantissa = bits(op, 9, 0); // Do the conversion. if (exponent == 0) { if (mantissa != 0) { @@ -500,7 +507,7 @@ vcvtFpHFpS(FPSCR &fpscr, float op, bool top) } } mantissa = mantissa << (23 - 10); - } else if (exponent == 0x1f && !fpscr.ahp) { + } else if (exponent == 0x1f && !ahp) { // Infinities and nans. 
exponent = 0xff; if (mantissa != 0) { @@ -511,7 +518,7 @@ vcvtFpHFpS(FPSCR &fpscr, float op, bool top) fpscr.ioc = 1; mantissa |= (1 << 22); } - if (fpscr.dn) { + if (defaultNan) { mantissa &= ~mask(22); neg = false; } @@ -624,7 +631,8 @@ vfpFpSToFixed(float val, bool isSigned, bool half, } float -vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) +vfpUFixedToFpS(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -633,11 +641,12 @@ vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (float)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); } float -vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm) +vfpSFixedToFpS(bool flush, bool defaultNan, + int32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -646,7 +655,7 @@ vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (float)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); } uint64_t @@ -743,7 +752,8 @@ vfpFpDToFixed(double val, bool isSigned, bool half, } double -vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) +vfpUFixedToFpD(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -752,11 +762,12 @@ vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (double)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); } double -vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm) +vfpSFixedToFpD(bool flush, bool defaultNan, + int32_t val, bool half, uint8_t imm) { fesetround(FeRoundNearest); if (half) @@ -765,14 +776,211 @@ vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm) __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (double)val, scale); + return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); +} + +// This function implements a magic formula taken from the architecture +// reference manual. It was originally called recip_sqrt_estimate. +static double +recipSqrtEstimate(double a) +{ + int64_t q0, q1, s; + double r; + if (a < 0.5) { + q0 = (int64_t)(a * 512.0); + r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0); + } else { + q1 = (int64_t)(a * 256.0); + r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0); + } + s = (int64_t)(256.0 * r + 0.5); + return (double)s / 256.0; } +// This function is only intended for use in Neon instructions because +// it ignores certain bits in the FPSCR. 
+float +fprSqrtEstimate(FPSCR &fpscr, float op) +{ + const uint32_t qnan = 0x7fc00000; + float junk = 0.0; + int fpClass = std::fpclassify(op); + if (fpClass == FP_NAN) { + if ((fpToBits(op) & qnan) != qnan) + fpscr.ioc = 1; + return bitsToFp(qnan, junk); + } else if (fpClass == FP_ZERO) { + fpscr.dzc = 1; + // Return infinity with the same sign as the operand. + return bitsToFp((std::signbit(op) << 31) | + (0xFF << 23) | (0 << 0), junk); + } else if (std::signbit(op)) { + // Set invalid op bit. + fpscr.ioc = 1; + return bitsToFp(qnan, junk); + } else if (fpClass == FP_INFINITE) { + return 0.0; + } else { + uint64_t opBits = fpToBits(op); + double scaled; + if (bits(opBits, 23)) { + scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | + (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63), + (double)0.0); + } else { + scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | + (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63), + (double)0.0); + } + uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2; + + uint64_t estimate = fpToBits(recipSqrtEstimate(scaled)); + + return bitsToFp((bits(estimate, 63) << 31) | + (bits(resultExp, 7, 0) << 23) | + (bits(estimate, 51, 29) << 0), junk); + } +} + +uint32_t +unsignedRSqrtEstimate(uint32_t op) +{ + if (bits(op, 31, 30) == 0) { + return -1; + } else { + double dpOp; + if (bits(op, 31)) { + dpOp = bitsToFp((ULL(0) << 63) | + (ULL(0x3fe) << 52) | + (bits((uint64_t)op, 30, 0) << 21) | + (0 << 0), (double)0.0); + } else { + dpOp = bitsToFp((ULL(0) << 63) | + (ULL(0x3fd) << 52) | + (bits((uint64_t)op, 29, 0) << 22) | + (0 << 0), (double)0.0); + } + uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp)); + return (1 << 31) | bits(estimate, 51, 21); + } +} + +// This function implements a magic formula taken from the architecture +// reference manual. It was originally called recip_estimate. + +static double +recipEstimate(double a) +{ + int64_t q, s; + double r; + q = (int64_t)(a * 512.0); + r = 1.0 / (((double)q + 0.5) / 512.0); + s = (int64_t)(256.0 * r + 0.5); + return (double)s / 256.0; +} + +// This function is only intended for use in Neon instructions because +// it ignores certain bits in the FPSCR. +float +fpRecipEstimate(FPSCR &fpscr, float op) +{ + const uint32_t qnan = 0x7fc00000; + float junk = 0.0; + int fpClass = std::fpclassify(op); + if (fpClass == FP_NAN) { + if ((fpToBits(op) & qnan) != qnan) + fpscr.ioc = 1; + return bitsToFp(qnan, junk); + } else if (fpClass == FP_INFINITE) { + return bitsToFp(std::signbit(op) << 31, junk); + } else if (fpClass == FP_ZERO) { + fpscr.dzc = 1; + // Return infinity with the same sign as the operand. 
+ return bitsToFp((std::signbit(op) << 31) | + (0xFF << 23) | (0 << 0), junk); + } else if (fabs(op) >= pow(2.0, 126)) { + fpscr.ufc = 1; + return bitsToFp(std::signbit(op) << 31, junk); + } else { + uint64_t opBits = fpToBits(op); + double scaled; + scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | + (ULL(0x3fe) << 52) | (ULL(0) << 63), + (double)0.0); + uint64_t resultExp = 253 - bits(opBits, 30, 23); + + uint64_t estimate = fpToBits(recipEstimate(scaled)); + + return bitsToFp((bits(opBits, 31) << 31) | + (bits(resultExp, 7, 0) << 23) | + (bits(estimate, 51, 29) << 0), junk); + } +} + +uint32_t +unsignedRecipEstimate(uint32_t op) +{ + if (bits(op, 31) == 0) { + return -1; + } else { + double dpOp; + dpOp = bitsToFp((ULL(0) << 63) | + (ULL(0x3fe) << 52) | + (bits((uint64_t)op, 30, 0) << 21) | + (0 << 0), (double)0.0); + uint64_t estimate = fpToBits(recipEstimate(dpOp)); + return (1 << 31) | bits(estimate, 51, 21); + } +} + +template <class fpType> +fpType +FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, + fpType op1, fpType op2) const +{ + done = true; + fpType junk = 0.0; + fpType dest = 0.0; + const bool single = (sizeof(fpType) == sizeof(float)); + const uint64_t qnan = + single ? 0x7fc00000 : ULL(0x7ff8000000000000); + const bool nan1 = std::isnan(op1); + const bool nan2 = std::isnan(op2); + const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); + const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); + if (nan1 || nan2) { + if (defaultNan) { + dest = bitsToFp(qnan, junk); + } else if (signal1) { + dest = bitsToFp(fpToBits(op1) | qnan, junk); + } else if (signal2) { + dest = bitsToFp(fpToBits(op2) | qnan, junk); + } else if (nan1) { + dest = op1; + } else if (nan2) { + dest = op2; + } + if (signal1 || signal2) { + fpscr.ioc = 1; + } + } else { + done = false; + } + return dest; +} + +template +float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, + float op1, float op2) const; +template +double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, + double op1, double op2) const; + template <class fpType> fpType FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType (*func)(fpType, fpType), - bool flush, uint32_t rMode) const + bool flush, bool defaultNan, uint32_t rMode) const { const bool single = (sizeof(fpType) == sizeof(float)); fpType junk = 0.0; @@ -795,7 +1003,7 @@ FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, const bool nan2 = std::isnan(op2); const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); - if ((!nan1 && !nan2) || (fpscr.dn == 1)) { + if ((!nan1 && !nan2) || (defaultNan == 1)) { dest = bitsToFp(qnan, junk); } else if (signal1) { dest = bitsToFp(fpToBits(op1) | qnan, junk); @@ -828,18 +1036,18 @@ FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, dest = temp; } } - finishVfp(fpscr, state); + finishVfp(fpscr, state, flush); return dest; } template float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2, float (*func)(float, float), - bool flush, uint32_t rMode) const; + bool flush, bool defaultNan, uint32_t rMode) const; template double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2, double (*func)(double, double), - bool flush, uint32_t rMode) const; + bool flush, bool defaultNan, uint32_t rMode) const; template <class fpType> fpType @@ -890,7 +1098,7 @@ FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType), dest = temp; } } - finishVfp(fpscr, state); + finishVfp(fpscr, state, flush); return dest; } diff --git 
a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh index 57636bbfc..964b62673 100644 --- a/src/arch/arm/insts/vfp.hh +++ b/src/arch/arm/insts/vfp.hh @@ -192,10 +192,20 @@ bitsToFp(uint64_t bits, double junk) return val.fp; } +template <class fpType> +static bool +isSnan(fpType val) +{ + const bool single = (sizeof(fpType) == sizeof(float)); + const uint64_t qnan = + single ? 0x7fc00000 : ULL(0x7ff8000000000000); + return std::isnan(val) && ((fpToBits(val) & qnan) != qnan); +} + typedef int VfpSavedState; VfpSavedState prepFpState(uint32_t rMode); -void finishVfp(FPSCR &fpscr, VfpSavedState state); +void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush); template <class fpType> fpType fixDest(FPSCR fpscr, fpType val, fpType op1); @@ -209,8 +219,9 @@ fpType fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2); float fixFpDFpSDest(FPSCR fpscr, double val); double fixFpSFpDDest(FPSCR fpscr, float val); -float vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top); -float vcvtFpHFpS(FPSCR &fpscr, float op, bool top); +uint16_t vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, + uint32_t rMode, bool ahp, float op); +float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op); static inline double makeDouble(uint32_t low, uint32_t high) @@ -233,13 +244,23 @@ highFromDouble(double val) uint64_t vfpFpSToFixed(float val, bool isSigned, bool half, uint8_t imm, bool rzero = true); -float vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm); -float vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm); +float vfpUFixedToFpS(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm); +float vfpSFixedToFpS(bool flush, bool defaultNan, + int32_t val, bool half, uint8_t imm); uint64_t vfpFpDToFixed(double val, bool isSigned, bool half, uint8_t imm, bool rzero = true); -double vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm); -double vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm); +double vfpUFixedToFpD(bool flush, bool defaultNan, + uint32_t val, bool half, uint8_t imm); +double vfpSFixedToFpD(bool flush, bool defaultNan, + int32_t val, bool half, uint8_t imm); + +float fprSqrtEstimate(FPSCR &fpscr, float op); +uint32_t unsignedRSqrtEstimate(uint32_t op); + +float fpRecipEstimate(FPSCR &fpscr, float op); +uint32_t unsignedRecipEstimate(uint32_t op); class VfpMacroOp : public PredMacroOp { @@ -312,6 +333,66 @@ fpMulD(double a, double b) return a * b; } +static inline float +fpMaxS(float a, float b) +{ + // Handle comparisons of +0 and -0. + if (!std::signbit(a) && std::signbit(b)) + return a; + return fmaxf(a, b); +} + +static inline float +fpMinS(float a, float b) +{ + // Handle comparisons of +0 and -0. 
+ if (std::signbit(a) && !std::signbit(b)) + return a; + return fminf(a, b); +} + +static inline float +fpRSqrtsS(float a, float b) +{ + int fpClassA = std::fpclassify(a); + int fpClassB = std::fpclassify(b); + float aXb; + int fpClassAxB; + + if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) || + (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) { + return 1.5; + } + aXb = a*b; + fpClassAxB = std::fpclassify(aXb); + if(fpClassAxB == FP_SUBNORMAL) { + feraiseexcept(FeUnderflow); + return 1.5; + } + return (3.0 - (a * b)) / 2.0; +} + +static inline float +fpRecpsS(float a, float b) +{ + int fpClassA = std::fpclassify(a); + int fpClassB = std::fpclassify(b); + float aXb; + int fpClassAxB; + + if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) || + (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) { + return 2.0; + } + aXb = a*b; + fpClassAxB = std::fpclassify(aXb); + if(fpClassAxB == FP_SUBNORMAL) { + feraiseexcept(FeUnderflow); + return 2.0; + } + return 2.0 - (a * b); +} + class FpOp : public PredOp { protected: @@ -364,9 +445,14 @@ class FpOp : public PredOp template <class fpType> fpType + processNans(FPSCR &fpscr, bool &done, bool defaultNan, + fpType op1, fpType op2) const; + + template <class fpType> + fpType binaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType (*func)(fpType, fpType), - bool flush, uint32_t rMode) const; + bool flush, bool defaultNan, uint32_t rMode) const; template <class fpType> fpType @@ -445,6 +531,27 @@ class FpRegRegRegOp : public FpOp std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const; }; +class FpRegRegRegImmOp : public FpOp +{ + protected: + IntRegIndex dest; + IntRegIndex op1; + IntRegIndex op2; + uint64_t imm; + + FpRegRegRegImmOp(const char *mnem, ExtMachInst _machInst, + OpClass __opClass, IntRegIndex _dest, + IntRegIndex _op1, IntRegIndex _op2, + uint64_t _imm, VfpMicroMode mode = VfpNotAMicroop) : + FpOp(mnem, _machInst, __opClass), + dest(_dest), op1(_op1), op2(_op2), imm(_imm) + { + setVfpMicroFlags(mode, flags); + } + + std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const; +}; + } #endif //__ARCH_ARM_INSTS_VFP_HH__ |
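The vfp.cc hunk above also adds the table-free estimate helpers recipSqrtEstimate() and recipEstimate(), the "magic formula" routines from the architecture reference manual used by the NEON reciprocal and reciprocal-square-root estimate instructions. For experimenting with the formula outside of gem5, the same arithmetic compiles as a self-contained program; the main() driver and its sample inputs are illustrative additions, not part of the commit:

```cpp
// Standalone copy of the estimate helpers added in vfp.cc above, so the
// recip_estimate / recip_sqrt_estimate formula can be tried in isolation.
#include <cmath>
#include <cstdint>
#include <cstdio>

static double recipSqrtEstimate(double a)
{
    int64_t q0, q1, s;
    double r;
    if (a < 0.5) {
        q0 = (int64_t)(a * 512.0);
        r = 1.0 / std::sqrt(((double)q0 + 0.5) / 512.0);
    } else {
        q1 = (int64_t)(a * 256.0);
        r = 1.0 / std::sqrt(((double)q1 + 0.5) / 256.0);
    }
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

static double recipEstimate(double a)
{
    int64_t q = (int64_t)(a * 512.0);
    double r = 1.0 / (((double)q + 0.5) / 512.0);
    int64_t s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

int main()
{
    // Sample inputs chosen inside the ranges the callers produce.
    std::printf("recipEstimate(0.75)     = %f\n", recipEstimate(0.75));
    std::printf("recipSqrtEstimate(0.30) = %f\n", recipSqrtEstimate(0.30));
    return 0;
}
```

Both helpers assume the operand has already been scaled into the interval the manual expects (roughly [0.25, 1.0) for the square-root estimate and [0.5, 1.0) for the reciprocal estimate); fprSqrtEstimate() and fpRecipEstimate() in the diff perform that scaling on the raw bit pattern before calling them.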