summaryrefslogtreecommitdiff
path: root/src/arch/arm/insts
diff options
context:
space:
mode:
Diffstat (limited to 'src/arch/arm/insts')
-rw-r--r--src/arch/arm/insts/branch64.cc146
-rw-r--r--src/arch/arm/insts/branch64.hh166
-rw-r--r--src/arch/arm/insts/data64.cc203
-rw-r--r--src/arch/arm/insts/data64.hh256
-rw-r--r--src/arch/arm/insts/fplib.cc3086
-rw-r--r--src/arch/arm/insts/fplib.hh283
-rw-r--r--src/arch/arm/insts/macromem.cc528
-rw-r--r--src/arch/arm/insts/macromem.hh207
-rw-r--r--src/arch/arm/insts/mem.cc5
-rw-r--r--src/arch/arm/insts/mem64.cc193
-rw-r--r--src/arch/arm/insts/mem64.hh253
-rw-r--r--src/arch/arm/insts/misc.cc38
-rw-r--r--src/arch/arm/insts/misc.hh55
-rw-r--r--src/arch/arm/insts/misc64.cc73
-rw-r--r--src/arch/arm/insts/misc64.hh92
-rw-r--r--src/arch/arm/insts/neon64_mem.hh128
-rw-r--r--src/arch/arm/insts/pred_inst.hh36
-rw-r--r--src/arch/arm/insts/static_inst.cc312
-rw-r--r--src/arch/arm/insts/static_inst.hh99
-rw-r--r--src/arch/arm/insts/vfp.cc484
-rw-r--r--src/arch/arm/insts/vfp.hh489
21 files changed, 6761 insertions, 371 deletions
diff --git a/src/arch/arm/insts/branch64.cc b/src/arch/arm/insts/branch64.cc
new file mode 100644
index 000000000..49ba3402a
--- /dev/null
+++ b/src/arch/arm/insts/branch64.cc
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2011-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gabe Black
+ */
+
+#include "arch/arm/insts/branch64.hh"
+
+namespace ArmISA
+{
+
+ArmISA::PCState
+BranchImm64::branchTarget(const ArmISA::PCState &branchPC) const
+{
+ ArmISA::PCState pcs = branchPC;
+ pcs.instNPC(pcs.pc() + imm);
+ pcs.advance();
+ return pcs;
+}
+
+ArmISA::PCState
+BranchImmReg64::branchTarget(const ArmISA::PCState &branchPC) const
+{
+ ArmISA::PCState pcs = branchPC;
+ pcs.instNPC(pcs.pc() + imm);
+ pcs.advance();
+ return pcs;
+}
+
+ArmISA::PCState
+BranchImmImmReg64::branchTarget(const ArmISA::PCState &branchPC) const
+{
+ ArmISA::PCState pcs = branchPC;
+ pcs.instNPC(pcs.pc() + imm2);
+ pcs.advance();
+ return pcs;
+}
+
+std::string
+BranchImmCond64::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false, true, condCode);
+ printTarget(ss, pc + imm, symtab);
+ return ss.str();
+}
+
+std::string
+BranchImm64::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printTarget(ss, pc + imm, symtab);
+ return ss.str();
+}
+
+std::string
+BranchReg64::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, op1);
+ return ss.str();
+}
+
+std::string
+BranchRet64::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ if (op1 != INTREG_X30)
+ printReg(ss, op1);
+ return ss.str();
+}
+
+std::string
+BranchEret64::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ return ss.str();
+}
+
+std::string
+BranchImmReg64::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, op1);
+ ccprintf(ss, ", ");
+ printTarget(ss, pc + imm, symtab);
+ return ss.str();
+}
+
+std::string
+BranchImmImmReg64::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, op1);
+ ccprintf(ss, ", #%#x, ", imm1);
+ printTarget(ss, pc + imm2, symtab);
+ return ss.str();
+}
+
+} // namespace ArmISA
diff --git a/src/arch/arm/insts/branch64.hh b/src/arch/arm/insts/branch64.hh
new file mode 100644
index 000000000..48881e0c2
--- /dev/null
+++ b/src/arch/arm/insts/branch64.hh
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2011-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gabe Black
+ */
+#ifndef __ARCH_ARM_INSTS_BRANCH64_HH__
+#define __ARCH_ARM_INSTS_BRANCH64_HH__
+
+#include "arch/arm/insts/static_inst.hh"
+
+namespace ArmISA
+{
+// Branch to a target computed with an immediate
+class BranchImm64 : public ArmStaticInst
+{
+ protected:
+ int64_t imm;
+
+ public:
+ BranchImm64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ int64_t _imm) :
+ ArmStaticInst(mnem, _machInst, __opClass), imm(_imm)
+ {}
+
+ ArmISA::PCState branchTarget(const ArmISA::PCState &branchPC) const;
+
+ /// Explicitly import the otherwise hidden branchTarget
+ using StaticInst::branchTarget;
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+// Conditionally Branch to a target computed with an immediate
+class BranchImmCond64 : public BranchImm64
+{
+ protected:
+ ConditionCode condCode;
+
+ public:
+ BranchImmCond64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ int64_t _imm, ConditionCode _condCode) :
+ BranchImm64(mnem, _machInst, __opClass, _imm), condCode(_condCode)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+// Branch to a target computed with a register
+class BranchReg64 : public ArmStaticInst
+{
+ protected:
+ IntRegIndex op1;
+
+ public:
+ BranchReg64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _op1) :
+ ArmStaticInst(mnem, _machInst, __opClass), op1(_op1)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+// Ret instruction
+class BranchRet64 : public BranchReg64
+{
+ public:
+ BranchRet64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _op1) :
+ BranchReg64(mnem, _machInst, __opClass, _op1)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+// Eret instruction
+class BranchEret64 : public ArmStaticInst
+{
+ public:
+ BranchEret64(const char *mnem, ExtMachInst _machInst, OpClass __opClass) :
+ ArmStaticInst(mnem, _machInst, __opClass)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+// Branch to a target computed with an immediate and a register
+class BranchImmReg64 : public ArmStaticInst
+{
+ protected:
+ int64_t imm;
+ IntRegIndex op1;
+
+ public:
+ BranchImmReg64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ int64_t _imm, IntRegIndex _op1) :
+ ArmStaticInst(mnem, _machInst, __opClass), imm(_imm), op1(_op1)
+ {}
+
+ ArmISA::PCState branchTarget(const ArmISA::PCState &branchPC) const;
+
+ /// Explicitly import the otherwise hidden branchTarget
+ using StaticInst::branchTarget;
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+// Branch to a target computed with two immediates and a register
+class BranchImmImmReg64 : public ArmStaticInst
+{
+ protected:
+ int64_t imm1;
+ int64_t imm2;
+ IntRegIndex op1;
+
+ public:
+ BranchImmImmReg64(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, int64_t _imm1, int64_t _imm2,
+ IntRegIndex _op1) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ imm1(_imm1), imm2(_imm2), op1(_op1)
+ {}
+
+ ArmISA::PCState branchTarget(const ArmISA::PCState &branchPC) const;
+
+ /// Explicitly import the otherwise hidden branchTarget
+ using StaticInst::branchTarget;
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+}
+
+#endif //__ARCH_ARM_INSTS_BRANCH64_HH__
diff --git a/src/arch/arm/insts/data64.cc b/src/arch/arm/insts/data64.cc
new file mode 100644
index 000000000..f65219870
--- /dev/null
+++ b/src/arch/arm/insts/data64.cc
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2011-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gabe Black
+ */
+
+#include "arch/arm/insts/data64.hh"
+
+namespace ArmISA
+{
+
+std::string
+DataXImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printDataInst(ss, true, false, /*XXX not really s*/ false, dest, op1,
+ INTREG_ZERO, INTREG_ZERO, 0, LSL, imm);
+ return ss.str();
+}
+
+std::string
+DataXImmOnlyOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", #%d", imm);
+ return ss.str();
+}
+
+std::string
+DataXSRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printDataInst(ss, false, true, /*XXX not really s*/ false, dest, op1,
+ op2, INTREG_ZERO, shiftAmt, shiftType, 0);
+ return ss.str();
+}
+
+std::string
+DataXERegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printDataInst(ss, false, true, /*XXX not really s*/ false, dest, op1,
+ op2, INTREG_ZERO, shiftAmt, LSL, 0);
+ return ss.str();
+}
+
+std::string
+DataX1RegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, op1);
+ return ss.str();
+}
+
+std::string
+DataX1RegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, op1);
+ ccprintf(ss, ", #%d", imm);
+ return ss.str();
+}
+
+std::string
+DataX1Reg2ImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, op1);
+ ccprintf(ss, ", #%d, #%d", imm1, imm2);
+ return ss.str();
+}
+
+std::string
+DataX2RegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, op1);
+ ccprintf(ss, ", ");
+ printReg(ss, op2);
+ return ss.str();
+}
+
+std::string
+DataX2RegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, op1);
+ ccprintf(ss, ", ");
+ printReg(ss, op2);
+ ccprintf(ss, ", #%d", imm);
+ return ss.str();
+}
+
+std::string
+DataX3RegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, op1);
+ ccprintf(ss, ", ");
+ printReg(ss, op2);
+ ccprintf(ss, ", ");
+ printReg(ss, op3);
+ return ss.str();
+}
+
+std::string
+DataXCondCompImmOp::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, op1);
+ ccprintf(ss, ", #%d, #%d", imm, defCc);
+ ccprintf(ss, ", ");
+ printCondition(ss, condCode, true);
+ return ss.str();
+}
+
+std::string
+DataXCondCompRegOp::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, op1);
+ ccprintf(ss, ", ");
+ printReg(ss, op2);
+ ccprintf(ss, ", #%d", defCc);
+ ccprintf(ss, ", ");
+ printCondition(ss, condCode, true);
+ return ss.str();
+}
+
+std::string
+DataXCondSelOp::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, op1);
+ ccprintf(ss, ", ");
+ printReg(ss, op2);
+ ccprintf(ss, ", ");
+ printCondition(ss, condCode, true);
+ return ss.str();
+}
+
+}
diff --git a/src/arch/arm/insts/data64.hh b/src/arch/arm/insts/data64.hh
new file mode 100644
index 000000000..8c0677b3d
--- /dev/null
+++ b/src/arch/arm/insts/data64.hh
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2011-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gabe Black
+ */
+#ifndef __ARCH_ARM_INSTS_DATA64_HH__
+#define __ARCH_ARM_INSTS_DATA64_HH__
+
+#include "arch/arm/insts/static_inst.hh"
+#include "base/trace.hh"
+
+namespace ArmISA
+{
+
+class DataXImmOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1;
+ uint64_t imm;
+
+ DataXImmOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, uint64_t _imm) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), imm(_imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataXImmOnlyOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest;
+ uint64_t imm;
+
+ DataXImmOnlyOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, uint64_t _imm) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), imm(_imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataXSRegOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1, op2;
+ int32_t shiftAmt;
+ ArmShiftType shiftType;
+
+ DataXSRegOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+ int32_t _shiftAmt, ArmShiftType _shiftType) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), op2(_op2),
+ shiftAmt(_shiftAmt), shiftType(_shiftType)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataXERegOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1, op2;
+ ArmExtendType extendType;
+ int32_t shiftAmt;
+
+ DataXERegOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+ ArmExtendType _extendType, int32_t _shiftAmt) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), op2(_op2),
+ extendType(_extendType), shiftAmt(_shiftAmt)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataX1RegOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1;
+
+ DataX1RegOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1) :
+ ArmStaticInst(mnem, _machInst, __opClass), dest(_dest), op1(_op1)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataX1RegImmOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1;
+ uint64_t imm;
+
+ DataX1RegImmOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, uint64_t _imm) :
+ ArmStaticInst(mnem, _machInst, __opClass), dest(_dest), op1(_op1),
+ imm(_imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataX1Reg2ImmOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1;
+ uint64_t imm1, imm2;
+
+ DataX1Reg2ImmOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, uint64_t _imm1,
+ uint64_t _imm2) :
+ ArmStaticInst(mnem, _machInst, __opClass), dest(_dest), op1(_op1),
+ imm1(_imm1), imm2(_imm2)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataX2RegOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1, op2;
+
+ DataX2RegOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), op2(_op2)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataX2RegImmOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1, op2;
+ uint64_t imm;
+
+ DataX2RegImmOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+ uint64_t _imm) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), op2(_op2), imm(_imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataX3RegOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1, op2, op3;
+
+ DataX3RegOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+ IntRegIndex _op3) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), op2(_op2), op3(_op3)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataXCondCompImmOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex op1;
+ uint64_t imm;
+ ConditionCode condCode;
+ uint8_t defCc;
+
+ DataXCondCompImmOp(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _op1, uint64_t _imm,
+ ConditionCode _condCode, uint8_t _defCc) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ op1(_op1), imm(_imm), condCode(_condCode), defCc(_defCc)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataXCondCompRegOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex op1, op2;
+ ConditionCode condCode;
+ uint8_t defCc;
+
+ DataXCondCompRegOp(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _op1, IntRegIndex _op2,
+ ConditionCode _condCode, uint8_t _defCc) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ op1(_op1), op2(_op2), condCode(_condCode), defCc(_defCc)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class DataXCondSelOp : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest, op1, op2;
+ ConditionCode condCode;
+
+ DataXCondSelOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+ ConditionCode _condCode) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), op2(_op2), condCode(_condCode)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+}
+
+#endif //__ARCH_ARM_INSTS_DATA64_HH__
diff --git a/src/arch/arm/insts/fplib.cc b/src/arch/arm/insts/fplib.cc
new file mode 100644
index 000000000..1f44eed09
--- /dev/null
+++ b/src/arch/arm/insts/fplib.cc
@@ -0,0 +1,3086 @@
+/*
+* Copyright (c) 2012-2013 ARM Limited
+* All rights reserved
+*
+* The license below extends only to copyright in the software and shall
+* not be construed as granting a license to any other intellectual
+* property including but not limited to intellectual property relating
+* to a hardware implementation of the functionality of the software
+* licensed hereunder. You may use the software subject to the license
+* terms below provided that you ensure that this notice is replicated
+* unmodified and in its entirety in all distributions of the software,
+* modified or unmodified, in source code or in binary form.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met: redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer;
+* redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the distribution;
+* neither the name of the copyright holders nor the names of its
+* contributors may be used to endorse or promote products derived from
+* this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+* Authors: Edmund Grimley Evans
+* Thomas Grocutt
+*/
+
+#include <stdint.h>
+
+#include <cassert>
+
+#include "fplib.hh"
+
+namespace ArmISA
+{
+
// Rounding modes, stored in the low two bits of the mode word.
// RN rounds to nearest (ties to even, see fp*_round_), RP towards
// +infinity, RM towards -infinity, RZ towards zero.
#define FPLIB_RN 0
#define FPLIB_RP 1
#define FPLIB_RM 2
#define FPLIB_RZ 3
// Additional mode bits: FZ = flush subnormals to zero, DN = replace any
// NaN result with the default NaN, AHP = alternative half-precision
// format (no fp16 infinities/NaNs, see fp16_round_).
#define FPLIB_FZ 4
#define FPLIB_DN 8
#define FPLIB_AHP 16

// Cumulative exception flags returned via the *flags out-parameters:
#define FPLIB_IDC 128 // Input Denormal
#define FPLIB_IXC 16 // Inexact
#define FPLIB_UFC 8 // Underflow
#define FPLIB_OFC 4 // Overflow
#define FPLIB_DZC 2 // Division by Zero
#define FPLIB_IOC 1 // Invalid Operation
+
// Logical shifts that, unlike the raw C operators, are well defined for
// any shift amount: shifting by the operand width or more yields zero.

static inline uint16_t
lsl16(uint16_t x, uint32_t shift)
{
    if (shift >= 16)
        return 0;
    return x << shift;
}

static inline uint16_t
lsr16(uint16_t x, uint32_t shift)
{
    if (shift >= 16)
        return 0;
    return x >> shift;
}

static inline uint32_t
lsl32(uint32_t x, uint32_t shift)
{
    if (shift >= 32)
        return 0;
    return x << shift;
}

static inline uint32_t
lsr32(uint32_t x, uint32_t shift)
{
    if (shift >= 32)
        return 0;
    return x >> shift;
}

static inline uint64_t
lsl64(uint64_t x, uint32_t shift)
{
    if (shift >= 64)
        return 0;
    return x << shift;
}

static inline uint64_t
lsr64(uint64_t x, uint32_t shift)
{
    if (shift >= 64)
        return 0;
    return x >> shift;
}
+
// 128-bit logical shift left of (x1:x0) by an arbitrary amount; the
// result is written through r0 (low word) and r1 (high word).  Shifts of
// 128 or more produce zero.
//
// Bug fix: the original folded shift == 0 into the shift < 64 branch,
// which evaluated x0 >> (64 - 0) — a shift by the full operand width,
// undefined behaviour in C++.  A zero shift is now handled explicitly.
static inline void
lsl128(uint64_t *r0, uint64_t *r1, uint64_t x0, uint64_t x1, uint32_t shift)
{
    if (shift == 0) {
        *r1 = x1;
        *r0 = x0;
    } else if (shift < 64) {
        *r1 = x1 << shift | x0 >> (64 - shift);
        *r0 = x0 << shift;
    } else if (shift < 128) {
        *r1 = x0 << (shift - 64);
        *r0 = 0;
    } else {
        *r1 = 0;
        *r0 = 0;
    }
}
+
// 128-bit logical shift right of (x1:x0) by an arbitrary amount; the
// result is written through r0 (low word) and r1 (high word).  Shifts of
// 128 or more produce zero.
//
// Bug fix: the original folded shift == 0 into the shift < 64 branch,
// which evaluated x1 << (64 - 0) — a shift by the full operand width,
// undefined behaviour in C++.  A zero shift is now handled explicitly.
static inline void
lsr128(uint64_t *r0, uint64_t *r1, uint64_t x0, uint64_t x1, uint32_t shift)
{
    if (shift == 0) {
        *r0 = x0;
        *r1 = x1;
    } else if (shift < 64) {
        *r0 = x0 >> shift | x1 << (64 - shift);
        *r1 = x1 >> shift;
    } else if (shift < 128) {
        *r0 = x1 >> (shift - 64);
        *r1 = 0;
    } else {
        *r0 = 0;
        *r1 = 0;
    }
}
+
// Exact multiply of two operands of up to 62 bits each, producing a
// 124-bit product in (x1:x0).  Karatsuba-style on 31-bit digits so that
// every partial product fits comfortably in 64 bits.
static inline void
mul62x62(uint64_t *x0, uint64_t *x1, uint64_t a, uint64_t b)
{
    uint32_t mask = ((uint32_t)1 << 31) - 1;
    uint64_t a_lo = a & mask;
    uint64_t a_hi = a >> 31 & mask;
    uint64_t b_lo = b & mask;
    uint64_t b_hi = b >> 31 & mask;

    // Three multiplications give the digit-aligned partial products:
    uint64_t lo = a_lo * b_lo;
    uint64_t hi = a_hi * b_hi;
    uint64_t mid = (a_lo + a_hi) * (b_lo + b_hi) - lo - hi;

    // Propagate carries between the 31-bit digit positions:
    uint64_t d0 = lo;
    uint64_t d1 = (d0 >> 31) + mid;
    uint64_t d2 = (d1 >> 31) + hi;

    *x0 = (d0 & mask) | (d1 & mask) << 31 | d2 << 62;
    *x1 = d2 >> 2;
}
+
// Exact 64x32 -> 96-bit multiply; the low 64 bits of the product go to
// *x0 and the high 32 bits to *x1.
static inline void
mul64x32(uint64_t *x0, uint64_t *x1, uint64_t a, uint32_t b)
{
    uint64_t lo_prod = (uint64_t)(uint32_t)a * b;       // a<31:0> * b
    uint64_t hi_prod = (lo_prod >> 32) + (a >> 32) * b; // carry + a<63:32> * b
    *x0 = hi_prod << 32 | (uint32_t)lo_prod;
    *x1 = hi_prod >> 32;
}
+
// Exact 64x64 -> 128-bit multiply; the low half of the product goes to
// *x0 and the high half to *x1.
static inline void
mul64x64(uint64_t *x0, uint64_t *x1, uint64_t a, uint64_t b)
{
    uint64_t a_lo = (uint32_t)a;
    uint64_t a_hi = a >> 32;
    uint64_t b_lo = (uint32_t)b;
    uint64_t b_hi = b >> 32;

    // Cross products of the 32-bit halves, aligned at bit 32:
    uint64_t t1 = (a_lo * b_lo >> 32) + a_hi * b_lo;
    uint64_t t2 = a_lo * b_hi;

    // Accumulate every contribution to the high 64 bits, including the
    // carry out of bit 63 of the full product:
    uint64_t hi = ((uint64_t)(uint32_t)t1 + (uint32_t)t2) >> 32;
    hi += t1 >> 32;
    hi += t2 >> 32;
    hi += a_hi * b_hi;

    *x0 = a * b; // the low half comes straight from the wrapping multiply
    *x1 = hi;
}
+
// 128-bit add: (x1:x0) = (a1:a0) + (b1:b0), propagating the carry from
// the low half into the high half.
static inline void
add128(uint64_t *x0, uint64_t *x1, uint64_t a0, uint64_t a1, uint64_t b0,
       uint64_t b1)
{
    uint64_t low = a0 + b0;
    *x1 = a1 + b1 + (low < a0); // unsigned wrap of the low half => carry
    *x0 = low;
}

// 128-bit subtract: (x1:x0) = (a1:a0) - (b1:b0), propagating the borrow
// from the low half into the high half.
static inline void
sub128(uint64_t *x0, uint64_t *x1, uint64_t a0, uint64_t a1, uint64_t b0,
       uint64_t b1)
{
    uint64_t low = a0 - b0;
    *x1 = a1 - b1 - (low > a0); // wrap-around of the low half => borrow
    *x0 = low;
}

// Unsigned 128-bit compare: returns -1, 0 or 1 as (a1:a0) is less than,
// equal to or greater than (b1:b0).
static inline int
cmp128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    if (a1 != b1)
        return a1 < b1 ? -1 : 1;
    if (a0 != b0)
        return a0 < b0 ? -1 : 1;
    return 0;
}
+
// Shift a non-zero 16-bit mantissa left until its top bit is set,
// decrementing *exp by the distance moved.  Zero is returned unchanged.
static inline uint16_t
fp16_normalise(uint16_t mnt, int *exp)
{
    if (!mnt)
        return 0;

    // Locate the leading one by binary search with steps 8, 4, 2, 1:
    for (int step = 8; step; step >>= 1) {
        if (!(mnt >> (16 - step))) {
            mnt <<= step;
            *exp -= step;
        }
    }
    return mnt;
}

// 32-bit counterpart of fp16_normalise.
static inline uint32_t
fp32_normalise(uint32_t mnt, int *exp)
{
    if (!mnt)
        return 0;

    // Binary search with steps 16, 8, 4, 2, 1:
    for (int step = 16; step; step >>= 1) {
        if (!(mnt >> (32 - step))) {
            mnt <<= step;
            *exp -= step;
        }
    }
    return mnt;
}

// 64-bit counterpart of fp16_normalise.
static inline uint64_t
fp64_normalise(uint64_t mnt, int *exp)
{
    if (!mnt)
        return 0;

    // Binary search with steps 32, 16, 8, 4, 2, 1:
    for (int step = 32; step; step >>= 1) {
        if (!(mnt >> (64 - step))) {
            mnt <<= step;
            *exp -= step;
        }
    }
    return mnt;
}
+
// Normalise the 128-bit mantissa (mnt1:mnt0) so that bit 127 is set,
// decrementing *exp by the distance moved.  (0:0) is left untouched.
static inline void
fp128_normalise(uint64_t *mnt0, uint64_t *mnt1, int *exp)
{
    uint64_t lo = *mnt0;
    uint64_t hi = *mnt1;

    if (!lo && !hi)
        return;

    // If the high word is empty, promote the low word wholesale:
    if (!hi) {
        hi = lo;
        lo = 0;
        *exp -= 64;
    }

    // Binary-search the leading one within the high word:
    for (int step = 32; step; step >>= 1) {
        if (!(hi >> (64 - step))) {
            hi = hi << step | lo >> (64 - step);
            lo <<= step;
            *exp -= step;
        }
    }

    *mnt0 = lo;
    *mnt1 = hi;
}
+
// Assemble an IEEE half-precision bit pattern from sign, biased exponent
// and mantissa; the mantissa is masked to its 10-bit field.
static inline uint16_t
fp16_pack(uint16_t sgn, uint16_t exp, uint16_t mnt)
{
    return sgn << 15 | exp << 10 | (mnt & 0x3ff);
}

// Single-precision counterpart: 1-bit sign, 8-bit exponent, 23-bit mantissa.
static inline uint32_t
fp32_pack(uint32_t sgn, uint32_t exp, uint32_t mnt)
{
    return sgn << 31 | exp << 23 | (mnt & 0x7fffff);
}

// Double-precision counterpart: 1-bit sign, 11-bit exponent, 52-bit mantissa.
static inline uint64_t
fp64_pack(uint64_t sgn, uint64_t exp, uint64_t mnt)
{
    return (uint64_t)sgn << 63 | exp << 52 | (mnt & 0xfffffffffffffULL);
}

// Signed zeroes:
static inline uint16_t
fp16_zero(int sgn)
{
    return fp16_pack(sgn, 0, 0);
}

static inline uint32_t
fp32_zero(int sgn)
{
    return fp32_pack(sgn, 0, 0);
}

static inline uint64_t
fp64_zero(int sgn)
{
    return fp64_pack(sgn, 0, 0);
}

// Largest finite values: maximum normal exponent, all-ones mantissa.
static inline uint16_t
fp16_max_normal(int sgn)
{
    return fp16_pack(sgn, 30, 0x3ff);
}

static inline uint32_t
fp32_max_normal(int sgn)
{
    return fp32_pack(sgn, 254, 0x7fffff);
}

static inline uint64_t
fp64_max_normal(int sgn)
{
    return fp64_pack(sgn, 2046, 0xfffffffffffffULL);
}

// Infinities: all-ones exponent, zero mantissa.
static inline uint16_t
fp16_infinity(int sgn)
{
    return fp16_pack(sgn, 31, 0);
}

static inline uint32_t
fp32_infinity(int sgn)
{
    return fp32_pack(sgn, 255, 0);
}

static inline uint64_t
fp64_infinity(int sgn)
{
    return fp64_pack(sgn, 2047, 0);
}

// Default (quiet) NaNs: positive sign, only the top mantissa bit set.
static inline uint16_t
fp16_defaultNaN()
{
    return fp16_pack(0, 31, 0x200);
}

static inline uint32_t
fp32_defaultNaN()
{
    return fp32_pack(0, 255, 0x400000);
}

static inline uint64_t
fp64_defaultNaN()
{
    return fp64_pack(0, 2047, 0x8000000000000ULL);
}
+
// Split a half-precision value into sign, biased exponent and mantissa,
// giving subnormals an implicit exponent of 1.  Unlike the 32/64-bit
// variants, half precision is never flushed to zero on input.
static inline void
fp16_unpack(int *sgn, int *exp, uint16_t *mnt, uint16_t x, int mode,
            int *flags)
{
    *sgn = x >> 15;
    *exp = x >> 10 & 31;
    *mnt = x & 0x3ff;

    if (*exp) {
        // Normal number: make the leading significand bit explicit.
        *mnt |= (uint16_t)1 << 10;
    } else {
        // Subnormal or zero; there is no flush to zero in this case!
        ++*exp;
    }
}
+
+static inline void
+fp32_unpack(int *sgn, int *exp, uint32_t *mnt, uint32_t x, int mode,
+ int *flags)
+{
+ *sgn = x >> 31;
+ *exp = x >> 23 & 255;
+ *mnt = x & (((uint32_t)1 << 23) - 1);
+
+ // Handle subnormals:
+ if (*exp) {
+ *mnt |= (uint32_t)1 << 23;
+ } else {
+ ++*exp;
+ if ((mode & FPLIB_FZ) && *mnt) {
+ *flags |= FPLIB_IDC;
+ *mnt = 0;
+ }
+ }
+}
+
+static inline void
+fp64_unpack(int *sgn, int *exp, uint64_t *mnt, uint64_t x, int mode,
+ int *flags)
+{
+ *sgn = x >> 63;
+ *exp = x >> 52 & 2047;
+ *mnt = x & (((uint64_t)1 << 52) - 1);
+
+ // Handle subnormals:
+ if (*exp) {
+ *mnt |= (uint64_t)1 << 52;
+ } else {
+ ++*exp;
+ if ((mode & FPLIB_FZ) && *mnt) {
+ *flags |= FPLIB_IDC;
+ *mnt = 0;
+ }
+ }
+}
+
+static inline uint32_t
+fp32_process_NaN(uint32_t a, int mode, int *flags)
+{
+ if (!(a >> 22 & 1)) {
+ *flags |= FPLIB_IOC;
+ a |= (uint32_t)1 << 22;
+ }
+ return mode & FPLIB_DN ? fp32_defaultNaN() : a;
+}
+
+static inline uint64_t
+fp64_process_NaN(uint64_t a, int mode, int *flags)
+{
+ if (!(a >> 51 & 1)) {
+ *flags |= FPLIB_IOC;
+ a |= (uint64_t)1 << 51;
+ }
+ return mode & FPLIB_DN ? fp64_defaultNaN() : a;
+}
+
+static uint32_t
+fp32_process_NaNs(uint32_t a, uint32_t b, int mode, int *flags)
+{
+ int a_exp = a >> 23 & 255;
+ uint32_t a_mnt = a & (((uint32_t)1 << 23) - 1);
+ int b_exp = b >> 23 & 255;
+ uint32_t b_mnt = b & (((uint32_t)1 << 23) - 1);
+
+ // Handle signalling NaNs:
+ if (a_exp == 255 && a_mnt && !(a_mnt >> 22 & 1))
+ return fp32_process_NaN(a, mode, flags);
+ if (b_exp == 255 && b_mnt && !(b_mnt >> 22 & 1))
+ return fp32_process_NaN(b, mode, flags);
+
+ // Handle quiet NaNs:
+ if (a_exp == 255 && a_mnt)
+ return fp32_process_NaN(a, mode, flags);
+ if (b_exp == 255 && b_mnt)
+ return fp32_process_NaN(b, mode, flags);
+
+ return 0;
+}
+
+static uint64_t
+fp64_process_NaNs(uint64_t a, uint64_t b, int mode, int *flags)
+{
+ int a_exp = a >> 52 & 2047;
+ uint64_t a_mnt = a & (((uint64_t)1 << 52) - 1);
+ int b_exp = b >> 52 & 2047;
+ uint64_t b_mnt = b & (((uint64_t)1 << 52) - 1);
+
+ // Handle signalling NaNs:
+ if (a_exp == 2047 && a_mnt && !(a_mnt >> 51 & 1))
+ return fp64_process_NaN(a, mode, flags);
+ if (b_exp == 2047 && b_mnt && !(b_mnt >> 51 & 1))
+ return fp64_process_NaN(b, mode, flags);
+
+ // Handle quiet NaNs:
+ if (a_exp == 2047 && a_mnt)
+ return fp64_process_NaN(a, mode, flags);
+ if (b_exp == 2047 && b_mnt)
+ return fp64_process_NaN(b, mode, flags);
+
+ return 0;
+}
+
+static uint32_t
+fp32_process_NaNs3(uint32_t a, uint32_t b, uint32_t c, int mode, int *flags)
+{
+ int a_exp = a >> 23 & 255;
+ uint32_t a_mnt = a & (((uint32_t)1 << 23) - 1);
+ int b_exp = b >> 23 & 255;
+ uint32_t b_mnt = b & (((uint32_t)1 << 23) - 1);
+ int c_exp = c >> 23 & 255;
+ uint32_t c_mnt = c & (((uint32_t)1 << 23) - 1);
+
+ // Handle signalling NaNs:
+ if (a_exp == 255 && a_mnt && !(a_mnt >> 22 & 1))
+ return fp32_process_NaN(a, mode, flags);
+ if (b_exp == 255 && b_mnt && !(b_mnt >> 22 & 1))
+ return fp32_process_NaN(b, mode, flags);
+ if (c_exp == 255 && c_mnt && !(c_mnt >> 22 & 1))
+ return fp32_process_NaN(c, mode, flags);
+
+ // Handle quiet NaNs:
+ if (a_exp == 255 && a_mnt)
+ return fp32_process_NaN(a, mode, flags);
+ if (b_exp == 255 && b_mnt)
+ return fp32_process_NaN(b, mode, flags);
+ if (c_exp == 255 && c_mnt)
+ return fp32_process_NaN(c, mode, flags);
+
+ return 0;
+}
+
+static uint64_t
+fp64_process_NaNs3(uint64_t a, uint64_t b, uint64_t c, int mode, int *flags)
+{
+ int a_exp = a >> 52 & 2047;
+ uint64_t a_mnt = a & (((uint64_t)1 << 52) - 1);
+ int b_exp = b >> 52 & 2047;
+ uint64_t b_mnt = b & (((uint64_t)1 << 52) - 1);
+ int c_exp = c >> 52 & 2047;
+ uint64_t c_mnt = c & (((uint64_t)1 << 52) - 1);
+
+ // Handle signalling NaNs:
+ if (a_exp == 2047 && a_mnt && !(a_mnt >> 51 & 1))
+ return fp64_process_NaN(a, mode, flags);
+ if (b_exp == 2047 && b_mnt && !(b_mnt >> 51 & 1))
+ return fp64_process_NaN(b, mode, flags);
+ if (c_exp == 2047 && c_mnt && !(c_mnt >> 51 & 1))
+ return fp64_process_NaN(c, mode, flags);
+
+ // Handle quiet NaNs:
+ if (a_exp == 2047 && a_mnt)
+ return fp64_process_NaN(a, mode, flags);
+ if (b_exp == 2047 && b_mnt)
+ return fp64_process_NaN(b, mode, flags);
+ if (c_exp == 2047 && c_mnt)
+ return fp64_process_NaN(c, mode, flags);
+
+ return 0;
+}
+
// Round and pack a half-precision result from an unpacked sign, exponent
// and significand.  mnt carries guard/sticky information in its low
// bits; rm is the rounding mode (FPLIB_RN/RP/RM/RZ or FPRounding_ODD).
// In AHP mode the top exponent encodes finite values, so overflow
// saturates to maximum magnitude and raises Invalid Operation instead of
// producing an infinity.
static uint16_t
fp16_round_(int sgn, int exp, uint16_t mnt, int rm, int mode, int *flags)
{
    int biased_exp; // non-negative exponent value for result
    uint16_t int_mant; // mantissa for result, less than (1 << 11)
    int error; // 0, 1, 2 or 3, where 2 means int_mant is wrong by exactly 0.5

    assert(rm != FPRounding_TIEAWAY);

    // There is no flush to zero in this case!

    // The bottom 5 bits of mnt are orred together:
    mnt = (uint16_t)1 << 12 | mnt >> 4 | ((mnt & 31) != 0);

    if (exp > 0) {
        biased_exp = exp;
        int_mant = mnt >> 2;
        error = mnt & 3;
    } else {
        // Subnormal result: shift further right, folding every discarded
        // bit into the sticky part of 'error'.
        biased_exp = 0;
        int_mant = lsr16(mnt, 3 - exp);
        error = (lsr16(mnt, 1 - exp) & 3) | !!(mnt & (lsl16(1, 1 - exp) - 1));
    }

    if (!biased_exp && error) { // xx should also check fpscr_val<11>
        *flags |= FPLIB_UFC;
    }

    // Round up: RN rounds to nearest with ties to the even mantissa; RP
    // and RM round away from zero only on their matching sign.
    if ((rm == FPLIB_RN && (error == 3 ||
                            (error == 2 && (int_mant & 1)))) ||
        (((rm == FPLIB_RP && !sgn) || (rm == FPLIB_RM && sgn)) && error)) {
        ++int_mant;
        if (int_mant == (uint32_t)1 << 10) {
            // Rounded up from denormalized to normalized
            biased_exp = 1;
        }
        if (int_mant == (uint32_t)1 << 11) {
            // Rounded up to next exponent
            ++biased_exp;
            int_mant >>= 1;
        }
    }

    // Handle rounding to odd aka Von Neumann rounding:
    if (error && rm == FPRounding_ODD)
        int_mant |= 1;

    // Handle overflow:
    if (!(mode & FPLIB_AHP)) {
        if (biased_exp >= 31) {
            *flags |= FPLIB_OFC | FPLIB_IXC;
            if (rm == FPLIB_RN || (rm == FPLIB_RP && !sgn) ||
                (rm == FPLIB_RM && sgn)) {
                return fp16_infinity(sgn);
            } else {
                return fp16_max_normal(sgn);
            }
        }
    } else {
        // Alternative half-precision: exponent 31 is finite, so saturate
        // to all-ones and raise Invalid Operation on overflow.
        if (biased_exp >= 32) {
            *flags |= FPLIB_IOC;
            return fp16_pack(sgn, 31, -1);
        }
    }

    if (error) {
        *flags |= FPLIB_IXC;
    }

    return fp16_pack(sgn, biased_exp, int_mant);
}
+
// Round and pack a single-precision result from an unpacked sign,
// exponent and significand.  mnt carries guard/sticky information in its
// low bits; rm is the rounding mode (FPLIB_RN/RP/RM/RZ or
// FPRounding_ODD).  In flush-to-zero mode any result that would be
// subnormal is replaced by zero with Underflow raised.
static uint32_t
fp32_round_(int sgn, int exp, uint32_t mnt, int rm, int mode, int *flags)
{
    int biased_exp; // non-negative exponent value for result
    uint32_t int_mant; // mantissa for result, less than (1 << 24)
    int error; // 0, 1, 2 or 3, where 2 means int_mant is wrong by exactly 0.5

    assert(rm != FPRounding_TIEAWAY);

    // Flush to zero:
    if ((mode & FPLIB_FZ) && exp < 1) {
        *flags |= FPLIB_UFC;
        return fp32_zero(sgn);
    }

    // The bottom 8 bits of mnt are orred together:
    mnt = (uint32_t)1 << 25 | mnt >> 7 | ((mnt & 255) != 0);

    if (exp > 0) {
        biased_exp = exp;
        int_mant = mnt >> 2;
        error = mnt & 3;
    } else {
        // Subnormal result: shift further right, folding every discarded
        // bit into the sticky part of 'error'.
        biased_exp = 0;
        int_mant = lsr32(mnt, 3 - exp);
        error = (lsr32(mnt, 1 - exp) & 3) | !!(mnt & (lsl32(1, 1 - exp) - 1));
    }

    if (!biased_exp && error) { // xx should also check fpscr_val<11>
        *flags |= FPLIB_UFC;
    }

    // Round up: RN rounds to nearest with ties to the even mantissa; RP
    // and RM round away from zero only on their matching sign.
    if ((rm == FPLIB_RN && (error == 3 ||
                            (error == 2 && (int_mant & 1)))) ||
        (((rm == FPLIB_RP && !sgn) || (rm == FPLIB_RM && sgn)) && error)) {
        ++int_mant;
        if (int_mant == (uint32_t)1 << 23) {
            // Rounded up from denormalized to normalized
            biased_exp = 1;
        }
        if (int_mant == (uint32_t)1 << 24) {
            // Rounded up to next exponent
            ++biased_exp;
            int_mant >>= 1;
        }
    }

    // Handle rounding to odd aka Von Neumann rounding:
    if (error && rm == FPRounding_ODD)
        int_mant |= 1;

    // Handle overflow:
    if (biased_exp >= 255) {
        *flags |= FPLIB_OFC | FPLIB_IXC;
        if (rm == FPLIB_RN || (rm == FPLIB_RP && !sgn) ||
            (rm == FPLIB_RM && sgn)) {
            return fp32_infinity(sgn);
        } else {
            return fp32_max_normal(sgn);
        }
    }

    if (error) {
        *flags |= FPLIB_IXC;
    }

    return fp32_pack(sgn, biased_exp, int_mant);
}
+
+static uint32_t
+fp32_round(int sgn, int exp, uint32_t mnt, int mode, int *flags)
+{
+ return fp32_round_(sgn, exp, mnt, mode & 3, mode, flags);
+}
+
// Round and pack a double-precision result from an unpacked sign,
// exponent and significand.  mnt carries guard/sticky information in its
// low bits; rm is the rounding mode (FPLIB_RN/RP/RM/RZ or
// FPRounding_ODD).  In flush-to-zero mode any result that would be
// subnormal is replaced by zero with Underflow raised.
static uint64_t
fp64_round_(int sgn, int exp, uint64_t mnt, int rm, int mode, int *flags)
{
    int biased_exp; // non-negative exponent value for result
    uint64_t int_mant; // mantissa for result, less than (1 << 52)
    int error; // 0, 1, 2 or 3, where 2 means int_mant is wrong by exactly 0.5

    assert(rm != FPRounding_TIEAWAY);

    // Flush to zero:
    if ((mode & FPLIB_FZ) && exp < 1) {
        *flags |= FPLIB_UFC;
        return fp64_zero(sgn);
    }

    // The bottom 11 bits of mnt are orred together:
    mnt = (uint64_t)1 << 54 | mnt >> 10 | ((mnt & 0x3ff) != 0);

    if (exp > 0) {
        biased_exp = exp;
        int_mant = mnt >> 2;
        error = mnt & 3;
    } else {
        // Subnormal result: shift further right, folding every discarded
        // bit into the sticky part of 'error'.
        biased_exp = 0;
        int_mant = lsr64(mnt, 3 - exp);
        error = (lsr64(mnt, 1 - exp) & 3) | !!(mnt & (lsl64(1, 1 - exp) - 1));
    }

    if (!biased_exp && error) { // xx should also check fpscr_val<11>
        *flags |= FPLIB_UFC;
    }

    // Round up: RN rounds to nearest with ties to the even mantissa; RP
    // and RM round away from zero only on their matching sign.
    if ((rm == FPLIB_RN && (error == 3 ||
                            (error == 2 && (int_mant & 1)))) ||
        (((rm == FPLIB_RP && !sgn) || (rm == FPLIB_RM && sgn)) && error)) {
        ++int_mant;
        if (int_mant == (uint64_t)1 << 52) {
            // Rounded up from denormalized to normalized
            biased_exp = 1;
        }
        if (int_mant == (uint64_t)1 << 53) {
            // Rounded up to next exponent
            ++biased_exp;
            int_mant >>= 1;
        }
    }

    // Handle rounding to odd aka Von Neumann rounding:
    if (error && rm == FPRounding_ODD)
        int_mant |= 1;

    // Handle overflow:
    if (biased_exp >= 2047) {
        *flags |= FPLIB_OFC | FPLIB_IXC;
        if (rm == FPLIB_RN || (rm == FPLIB_RP && !sgn) ||
            (rm == FPLIB_RM && sgn)) {
            return fp64_infinity(sgn);
        } else {
            return fp64_max_normal(sgn);
        }
    }

    if (error) {
        *flags |= FPLIB_IXC;
    }

    return fp64_pack(sgn, biased_exp, int_mant);
}
+
+static uint64_t
+fp64_round(int sgn, int exp, uint64_t mnt, int mode, int *flags)
+{
+ return fp64_round_(sgn, exp, mnt, mode & 3, mode, flags);
+}
+
// IEEE equality compare for fp32.  Any NaN operand makes the result
// false; only a signalling NaN raises Invalid Operation.  +0 and -0
// compare equal.
static int
fp32_compare_eq(uint32_t a, uint32_t b, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp;
    uint32_t a_mnt, b_mnt;

    fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    // Unordered if either operand is a NaN (max exponent, non-zero
    // fraction); IOC only for a signalling NaN (quiet bit clear):
    if ((a_exp == 255 && (uint32_t)(a_mnt << 9)) ||
        (b_exp == 255 && (uint32_t)(b_mnt << 9))) {
        if ((a_exp == 255 && (uint32_t)(a_mnt << 9) && !(a >> 22 & 1)) ||
            (b_exp == 255 && (uint32_t)(b_mnt << 9) && !(b >> 22 & 1)))
            *flags |= FPLIB_IOC;
        return 0;
    }
    // Bitwise-equal values are equal; so are zeroes of either sign:
    return a == b || (!a_mnt && !b_mnt);
}
+
// IEEE greater-than-or-equal compare for fp32.  Any NaN operand (quiet
// or signalling) raises Invalid Operation and yields false.  +0 and -0
// compare equal.
static int
fp32_compare_ge(uint32_t a, uint32_t b, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp;
    uint32_t a_mnt, b_mnt;

    fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    // Unordered comparisons always signal for ge/gt:
    if ((a_exp == 255 && (uint32_t)(a_mnt << 9)) ||
        (b_exp == 255 && (uint32_t)(b_mnt << 9))) {
        *flags |= FPLIB_IOC;
        return 0;
    }
    if (!a_mnt && !b_mnt)
        return 1;
    // Different signs: the non-negative operand is the larger one:
    if (a_sgn != b_sgn)
        return b_sgn;
    // Same sign: compare magnitudes, inverting the result when negative:
    if (a_exp != b_exp)
        return a_sgn ^ (a_exp > b_exp);
    if (a_mnt != b_mnt)
        return a_sgn ^ (a_mnt > b_mnt);
    return 1;
}
+
// IEEE greater-than compare for fp32.  Any NaN operand (quiet or
// signalling) raises Invalid Operation and yields false.  +0 and -0
// compare equal (so not greater).
static int
fp32_compare_gt(uint32_t a, uint32_t b, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp;
    uint32_t a_mnt, b_mnt;

    fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    // Unordered comparisons always signal for ge/gt:
    if ((a_exp == 255 && (uint32_t)(a_mnt << 9)) ||
        (b_exp == 255 && (uint32_t)(b_mnt << 9))) {
        *flags |= FPLIB_IOC;
        return 0;
    }
    if (!a_mnt && !b_mnt)
        return 0;
    // Different signs: the non-negative operand is the larger one:
    if (a_sgn != b_sgn)
        return b_sgn;
    // Same sign: compare magnitudes, inverting the result when negative:
    if (a_exp != b_exp)
        return a_sgn ^ (a_exp > b_exp);
    if (a_mnt != b_mnt)
        return a_sgn ^ (a_mnt > b_mnt);
    return 0;
}
+
// IEEE equality compare for fp64.  Any NaN operand makes the result
// false; only a signalling NaN raises Invalid Operation.  +0 and -0
// compare equal.
static int
fp64_compare_eq(uint64_t a, uint64_t b, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp;
    uint64_t a_mnt, b_mnt;

    fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    // Unordered if either operand is a NaN (max exponent, non-zero
    // fraction); IOC only for a signalling NaN (quiet bit clear):
    if ((a_exp == 2047 && (uint64_t)(a_mnt << 12)) ||
        (b_exp == 2047 && (uint64_t)(b_mnt << 12))) {
        if ((a_exp == 2047 && (uint64_t)(a_mnt << 12) && !(a >> 51 & 1)) ||
            (b_exp == 2047 && (uint64_t)(b_mnt << 12) && !(b >> 51 & 1)))
            *flags |= FPLIB_IOC;
        return 0;
    }
    // Bitwise-equal values are equal; so are zeroes of either sign:
    return a == b || (!a_mnt && !b_mnt);
}
+
// IEEE greater-than-or-equal compare for fp64.  Any NaN operand (quiet
// or signalling) raises Invalid Operation and yields false.  +0 and -0
// compare equal.
static int
fp64_compare_ge(uint64_t a, uint64_t b, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp;
    uint64_t a_mnt, b_mnt;

    fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    // Unordered comparisons always signal for ge/gt:
    if ((a_exp == 2047 && (uint64_t)(a_mnt << 12)) ||
        (b_exp == 2047 && (uint64_t)(b_mnt << 12))) {
        *flags |= FPLIB_IOC;
        return 0;
    }
    if (!a_mnt && !b_mnt)
        return 1;
    // Different signs: the non-negative operand is the larger one:
    if (a_sgn != b_sgn)
        return b_sgn;
    // Same sign: compare magnitudes, inverting the result when negative:
    if (a_exp != b_exp)
        return a_sgn ^ (a_exp > b_exp);
    if (a_mnt != b_mnt)
        return a_sgn ^ (a_mnt > b_mnt);
    return 1;
}
+
// IEEE greater-than compare for fp64.  Any NaN operand (quiet or
// signalling) raises Invalid Operation and yields false.  +0 and -0
// compare equal (so not greater).
static int
fp64_compare_gt(uint64_t a, uint64_t b, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp;
    uint64_t a_mnt, b_mnt;

    fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    // Unordered comparisons always signal for ge/gt:
    if ((a_exp == 2047 && (uint64_t)(a_mnt << 12)) ||
        (b_exp == 2047 && (uint64_t)(b_mnt << 12))) {
        *flags |= FPLIB_IOC;
        return 0;
    }
    if (!a_mnt && !b_mnt)
        return 0;
    // Different signs: the non-negative operand is the larger one:
    if (a_sgn != b_sgn)
        return b_sgn;
    // Same sign: compare magnitudes, inverting the result when negative:
    if (a_exp != b_exp)
        return a_sgn ^ (a_exp > b_exp);
    if (a_mnt != b_mnt)
        return a_sgn ^ (a_mnt > b_mnt);
    return 0;
}
+
// Add two single-precision values; when neg is set the sign of b is
// flipped first, turning the operation into subtraction.
static uint32_t
fp32_add(uint32_t a, uint32_t b, int neg, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp, x_sgn, x_exp;
    uint32_t a_mnt, b_mnt, x, x_mnt;

    fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    if ((x = fp32_process_NaNs(a, b, mode, flags))) {
        return x;
    }

    b_sgn ^= neg;

    // Handle infinities and zeroes:
    if (a_exp == 255 && b_exp == 255 && a_sgn != b_sgn) {
        // Opposite-signed infinities: invalid operation.
        *flags |= FPLIB_IOC;
        return fp32_defaultNaN();
    } else if (a_exp == 255) {
        return fp32_infinity(a_sgn);
    } else if (b_exp == 255) {
        return fp32_infinity(b_sgn);
    } else if (!a_mnt && !b_mnt && a_sgn == b_sgn) {
        return fp32_zero(a_sgn);
    }

    // Align on the larger exponent, folding any shifted-out bits of the
    // smaller operand into a sticky bit:
    a_mnt <<= 3;
    b_mnt <<= 3;
    if (a_exp >= b_exp) {
        b_mnt = (lsr32(b_mnt, a_exp - b_exp) |
                 !!(b_mnt & (lsl32(1, a_exp - b_exp) - 1)));
        b_exp = a_exp;
    } else {
        a_mnt = (lsr32(a_mnt, b_exp - a_exp) |
                 !!(a_mnt & (lsl32(1, b_exp - a_exp) - 1)));
        a_exp = b_exp;
    }
    // Add magnitudes if the signs agree; otherwise subtract the smaller
    // magnitude from the larger, flipping the sign when |b| > |a|:
    x_sgn = a_sgn;
    x_exp = a_exp;
    if (a_sgn == b_sgn) {
        x_mnt = a_mnt + b_mnt;
    } else if (a_mnt >= b_mnt) {
        x_mnt = a_mnt - b_mnt;
    } else {
        x_sgn ^= 1;
        x_mnt = b_mnt - a_mnt;
    }

    if (!x_mnt) {
        // Sign of exact zero result depends on rounding mode
        return fp32_zero((mode & 3) == 2);
    }

    x_mnt = fp32_normalise(x_mnt, &x_exp);

    return fp32_round(x_sgn, x_exp + 5, x_mnt << 1, mode, flags);
}
+
// Add two double-precision values; when neg is set the sign of b is
// flipped first, turning the operation into subtraction.
static uint64_t
fp64_add(uint64_t a, uint64_t b, int neg, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp, x_sgn, x_exp;
    uint64_t a_mnt, b_mnt, x, x_mnt;

    fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    if ((x = fp64_process_NaNs(a, b, mode, flags))) {
        return x;
    }

    b_sgn ^= neg;

    // Handle infinities and zeroes:
    if (a_exp == 2047 && b_exp == 2047 && a_sgn != b_sgn) {
        // Opposite-signed infinities: invalid operation.
        *flags |= FPLIB_IOC;
        return fp64_defaultNaN();
    } else if (a_exp == 2047) {
        return fp64_infinity(a_sgn);
    } else if (b_exp == 2047) {
        return fp64_infinity(b_sgn);
    } else if (!a_mnt && !b_mnt && a_sgn == b_sgn) {
        return fp64_zero(a_sgn);
    }

    // Align on the larger exponent, folding any shifted-out bits of the
    // smaller operand into a sticky bit:
    a_mnt <<= 3;
    b_mnt <<= 3;
    if (a_exp >= b_exp) {
        b_mnt = (lsr64(b_mnt, a_exp - b_exp) |
                 !!(b_mnt & (lsl64(1, a_exp - b_exp) - 1)));
        b_exp = a_exp;
    } else {
        a_mnt = (lsr64(a_mnt, b_exp - a_exp) |
                 !!(a_mnt & (lsl64(1, b_exp - a_exp) - 1)));
        a_exp = b_exp;
    }
    // Add magnitudes if the signs agree; otherwise subtract the smaller
    // magnitude from the larger, flipping the sign when |b| > |a|:
    x_sgn = a_sgn;
    x_exp = a_exp;
    if (a_sgn == b_sgn) {
        x_mnt = a_mnt + b_mnt;
    } else if (a_mnt >= b_mnt) {
        x_mnt = a_mnt - b_mnt;
    } else {
        x_sgn ^= 1;
        x_mnt = b_mnt - a_mnt;
    }

    if (!x_mnt) {
        // Sign of exact zero result depends on rounding mode
        return fp64_zero((mode & 3) == 2);
    }

    x_mnt = fp64_normalise(x_mnt, &x_exp);

    return fp64_round(x_sgn, x_exp + 8, x_mnt << 1, mode, flags);
}
+
// Multiply two single-precision values with correct NaN, infinity and
// zero handling; zero times infinity is invalid.
static uint32_t
fp32_mul(uint32_t a, uint32_t b, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp, x_sgn, x_exp;
    uint32_t a_mnt, b_mnt, x;
    uint64_t x_mnt;

    fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    if ((x = fp32_process_NaNs(a, b, mode, flags))) {
        return x;
    }

    // Handle infinities and zeroes:
    if ((a_exp == 255 && !b_mnt) || (b_exp == 255 && !a_mnt)) {
        // infinity * zero is invalid
        *flags |= FPLIB_IOC;
        return fp32_defaultNaN();
    } else if (a_exp == 255 || b_exp == 255) {
        return fp32_infinity(a_sgn ^ b_sgn);
    } else if (!a_mnt || !b_mnt) {
        return fp32_zero(a_sgn ^ b_sgn);
    }

    // Multiply and normalise:
    x_sgn = a_sgn ^ b_sgn;
    x_exp = a_exp + b_exp - 110; // remove double bias, align for rounding
    x_mnt = (uint64_t)a_mnt * b_mnt;
    x_mnt = fp64_normalise(x_mnt, &x_exp);

    // Convert to 32 bits, collapsing error into bottom bit:
    x_mnt = lsr64(x_mnt, 31) | !!lsl64(x_mnt, 33);

    return fp32_round(x_sgn, x_exp, x_mnt, mode, flags);
}
+
// Multiply two double-precision values with correct NaN, infinity and
// zero handling; zero times infinity is invalid.  The 53x53-bit product
// is formed exactly in 128 bits via mul62x62.
static uint64_t
fp64_mul(uint64_t a, uint64_t b, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp, x_sgn, x_exp;
    uint64_t a_mnt, b_mnt, x;
    uint64_t x0_mnt, x1_mnt;

    fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    if ((x = fp64_process_NaNs(a, b, mode, flags))) {
        return x;
    }

    // Handle infinities and zeroes:
    if ((a_exp == 2047 && !b_mnt) || (b_exp == 2047 && !a_mnt)) {
        // infinity * zero is invalid
        *flags |= FPLIB_IOC;
        return fp64_defaultNaN();
    } else if (a_exp == 2047 || b_exp == 2047) {
        return fp64_infinity(a_sgn ^ b_sgn);
    } else if (!a_mnt || !b_mnt) {
        return fp64_zero(a_sgn ^ b_sgn);
    }

    // Multiply and normalise:
    x_sgn = a_sgn ^ b_sgn;
    x_exp = a_exp + b_exp - 1000; // remove double bias, align for rounding
    mul62x62(&x0_mnt, &x1_mnt, a_mnt, b_mnt);
    fp128_normalise(&x0_mnt, &x1_mnt, &x_exp);

    // Convert to 64 bits, collapsing error into bottom bit:
    x0_mnt = x1_mnt << 1 | !!x0_mnt;

    return fp64_round(x_sgn, x_exp, x0_mnt, mode, flags);
}
+
// Fused multiply-add for single precision: computes a + b*c with a
// single rounding at the end.  scale is added to the result exponent
// before rounding (zero for a plain FMA).
static uint32_t
fp32_muladd(uint32_t a, uint32_t b, uint32_t c, int scale,
            int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp, c_sgn, c_exp, x_sgn, x_exp, y_sgn, y_exp;
    uint32_t a_mnt, b_mnt, c_mnt, x;
    uint64_t x_mnt, y_mnt;

    fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);
    fp32_unpack(&c_sgn, &c_exp, &c_mnt, c, mode, flags);

    x = fp32_process_NaNs3(a, b, c, mode, flags);

    // Quiet NaN added to product of zero and infinity:
    if (a_exp == 255 && (a_mnt >> 22 & 1) &&
        ((!b_mnt && c_exp == 255 && !(uint32_t)(c_mnt << 9)) ||
         (!c_mnt && b_exp == 255 && !(uint32_t)(b_mnt << 9)))) {
        x = fp32_defaultNaN();
        *flags |= FPLIB_IOC;
    }

    if (x) {
        return x;
    }

    // Handle infinities and zeroes:
    if ((b_exp == 255 && !c_mnt) ||
        (c_exp == 255 && !b_mnt) ||
        (a_exp == 255 && (b_exp == 255 || c_exp == 255) &&
         (a_sgn != (b_sgn ^ c_sgn)))) {
        *flags |= FPLIB_IOC;
        return fp32_defaultNaN();
    }
    if (a_exp == 255)
        return fp32_infinity(a_sgn);
    if (b_exp == 255 || c_exp == 255)
        return fp32_infinity(b_sgn ^ c_sgn);
    if (!a_mnt && (!b_mnt || !c_mnt) && a_sgn == (b_sgn ^ c_sgn))
        return fp32_zero(a_sgn);

    // Widen the addend so it shares a scale with the raw product:
    x_sgn = a_sgn;
    x_exp = a_exp + 13;
    x_mnt = (uint64_t)a_mnt << 27;

    // Multiply:
    y_sgn = b_sgn ^ c_sgn;
    y_exp = b_exp + c_exp - 113;
    y_mnt = (uint64_t)b_mnt * c_mnt << 3;
    if (!y_mnt) {
        y_exp = x_exp;
    }

    // Add: align on the larger exponent, folding shifted-out bits of
    // the smaller term into a sticky bit:
    if (x_exp >= y_exp) {
        y_mnt = (lsr64(y_mnt, x_exp - y_exp) |
                 !!(y_mnt & (lsl64(1, x_exp - y_exp) - 1)));
        y_exp = x_exp;
    } else {
        x_mnt = (lsr64(x_mnt, y_exp - x_exp) |
                 !!(x_mnt & (lsl64(1, y_exp - x_exp) - 1)));
        x_exp = y_exp;
    }
    if (x_sgn == y_sgn) {
        x_mnt = x_mnt + y_mnt;
    } else if (x_mnt >= y_mnt) {
        x_mnt = x_mnt - y_mnt;
    } else {
        x_sgn ^= 1;
        x_mnt = y_mnt - x_mnt;
    }

    if (!x_mnt) {
        // Sign of exact zero result depends on rounding mode
        return fp32_zero((mode & 3) == 2);
    }

    // Normalise and convert to 32 bits, collapsing error into bottom bit:
    x_mnt = fp64_normalise(x_mnt, &x_exp);
    x_mnt = x_mnt >> 31 | !!(uint32_t)(x_mnt << 1);

    return fp32_round(x_sgn, x_exp + scale, x_mnt, mode, flags);
}
+
// Fused multiply-add for double precision: computes a + b*c with a
// single rounding at the end, using 128-bit intermediate arithmetic.
// scale is added to the result exponent before rounding (zero for a
// plain FMA).
static uint64_t
fp64_muladd(uint64_t a, uint64_t b, uint64_t c, int scale,
            int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp, c_sgn, c_exp, x_sgn, x_exp, y_sgn, y_exp;
    uint64_t a_mnt, b_mnt, c_mnt, x;
    uint64_t x0_mnt, x1_mnt, y0_mnt, y1_mnt;

    fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);
    fp64_unpack(&c_sgn, &c_exp, &c_mnt, c, mode, flags);

    x = fp64_process_NaNs3(a, b, c, mode, flags);

    // Quiet NaN added to product of zero and infinity:
    if (a_exp == 2047 && (a_mnt >> 51 & 1) &&
        ((!b_mnt && c_exp == 2047 && !(uint64_t)(c_mnt << 12)) ||
         (!c_mnt && b_exp == 2047 && !(uint64_t)(b_mnt << 12)))) {
        x = fp64_defaultNaN();
        *flags |= FPLIB_IOC;
    }

    if (x) {
        return x;
    }

    // Handle infinities and zeroes:
    if ((b_exp == 2047 && !c_mnt) ||
        (c_exp == 2047 && !b_mnt) ||
        (a_exp == 2047 && (b_exp == 2047 || c_exp == 2047) &&
         (a_sgn != (b_sgn ^ c_sgn)))) {
        *flags |= FPLIB_IOC;
        return fp64_defaultNaN();
    }
    if (a_exp == 2047)
        return fp64_infinity(a_sgn);
    if (b_exp == 2047 || c_exp == 2047)
        return fp64_infinity(b_sgn ^ c_sgn);
    if (!a_mnt && (!b_mnt || !c_mnt) && a_sgn == (b_sgn ^ c_sgn))
        return fp64_zero(a_sgn);

    // Widen the addend so it shares a scale with the raw product:
    x_sgn = a_sgn;
    x_exp = a_exp + 11;
    x0_mnt = 0;
    x1_mnt = a_mnt;

    // Multiply:
    y_sgn = b_sgn ^ c_sgn;
    y_exp = b_exp + c_exp - 1003;
    mul62x62(&y0_mnt, &y1_mnt, b_mnt, c_mnt << 3);
    if (!y0_mnt && !y1_mnt) {
        y_exp = x_exp;
    }

    // Add: align on the larger exponent, folding shifted-out bits of
    // the smaller term into a sticky bit (the lsl128 captures exactly
    // the bits the lsr128 is about to discard):
    if (x_exp >= y_exp) {
        uint64_t t0, t1;
        lsl128(&t0, &t1, y0_mnt, y1_mnt,
               x_exp - y_exp < 128 ? 128 - (x_exp - y_exp) : 0);
        lsr128(&y0_mnt, &y1_mnt, y0_mnt, y1_mnt, x_exp - y_exp);
        y0_mnt |= !!(t0 | t1);
        y_exp = x_exp;
    } else {
        uint64_t t0, t1;
        lsl128(&t0, &t1, x0_mnt, x1_mnt,
               y_exp - x_exp < 128 ? 128 - (y_exp - x_exp) : 0);
        lsr128(&x0_mnt, &x1_mnt, x0_mnt, x1_mnt, y_exp - x_exp);
        x0_mnt |= !!(t0 | t1);
        x_exp = y_exp;
    }
    if (x_sgn == y_sgn) {
        add128(&x0_mnt, &x1_mnt, x0_mnt, x1_mnt, y0_mnt, y1_mnt);
    } else if (cmp128(x0_mnt, x1_mnt, y0_mnt, y1_mnt) >= 0) {
        sub128(&x0_mnt, &x1_mnt, x0_mnt, x1_mnt, y0_mnt, y1_mnt);
    } else {
        x_sgn ^= 1;
        sub128(&x0_mnt, &x1_mnt, y0_mnt, y1_mnt, x0_mnt, x1_mnt);
    }

    if (!x0_mnt && !x1_mnt) {
        // Sign of exact zero result depends on rounding mode
        return fp64_zero((mode & 3) == 2);
    }

    // Normalise and convert to 64 bits, collapsing error into bottom bit:
    fp128_normalise(&x0_mnt, &x1_mnt, &x_exp);
    x0_mnt = x1_mnt << 1 | !!x0_mnt;

    return fp64_round(x_sgn, x_exp + scale, x0_mnt, mode, flags);
}
+
// Divide two single-precision values.  0/0 and inf/inf are invalid;
// division of a finite non-zero value by zero raises Division by Zero.
static uint32_t
fp32_div(uint32_t a, uint32_t b, int mode, int *flags)
{
    int a_sgn, a_exp, b_sgn, b_exp, x_sgn, x_exp;
    uint32_t a_mnt, b_mnt, x;
    uint64_t x_mnt;

    fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    if ((x = fp32_process_NaNs(a, b, mode, flags)))
        return x;

    // Handle infinities and zeroes:
    if ((a_exp == 255 && b_exp == 255) || (!a_mnt && !b_mnt)) {
        *flags |= FPLIB_IOC;
        return fp32_defaultNaN();
    }
    if (a_exp == 255 || !b_mnt) {
        // DZC only for finite/0, not for infinity/0:
        if (a_exp != 255)
            *flags |= FPLIB_DZC;
        return fp32_infinity(a_sgn ^ b_sgn);
    }
    if (!a_mnt || b_exp == 255)
        return fp32_zero(a_sgn ^ b_sgn);

    // Divide, setting bottom bit if inexact:
    a_mnt = fp32_normalise(a_mnt, &a_exp);
    x_sgn = a_sgn ^ b_sgn;
    x_exp = a_exp - b_exp + 172;
    x_mnt = ((uint64_t)a_mnt << 18) / b_mnt;
    x_mnt |= (x_mnt * b_mnt != (uint64_t)a_mnt << 18);

    // Normalise and convert to 32 bits, collapsing error into bottom bit:
    x_mnt = fp64_normalise(x_mnt, &x_exp);
    x_mnt = x_mnt >> 31 | !!(uint32_t)(x_mnt << 1);

    return fp32_round(x_sgn, x_exp, x_mnt, mode, flags);
}
+
static uint64_t
fp64_div(uint64_t a, uint64_t b, int mode, int *flags)
{
    // Double-precision division a / b, implemented with a Newton-Raphson
    // reciprocal of the divisor followed by a multiply and a one-ulp
    // correction step.
    int a_sgn, a_exp, b_sgn, b_exp, x_sgn, x_exp, c;
    uint64_t a_mnt, b_mnt, x, x_mnt, x0_mnt, x1_mnt;

    fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);
    fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags);

    if ((x = fp64_process_NaNs(a, b, mode, flags)))
        return x;

    // Handle infinities and zeroes:
    // inf/inf and 0/0 are invalid operations and produce the default NaN.
    if ((a_exp == 2047 && b_exp == 2047) || (!a_mnt && !b_mnt)) {
        *flags |= FPLIB_IOC;
        return fp64_defaultNaN();
    }
    // inf/x and x/0 produce an infinity; only x/0 is divide-by-zero.
    if (a_exp == 2047 || !b_mnt) {
        if (a_exp != 2047)
            *flags |= FPLIB_DZC;
        return fp64_infinity(a_sgn ^ b_sgn);
    }
    // 0/x and x/inf produce an exact zero.
    if (!a_mnt || b_exp == 2047)
        return fp64_zero(a_sgn ^ b_sgn);

    // Find reciprocal of divisor with Newton-Raphson:
    a_mnt = fp64_normalise(a_mnt, &a_exp);
    b_mnt = fp64_normalise(b_mnt, &b_exp);
    x_mnt = ~(uint64_t)0 / (b_mnt >> 31);
    mul64x32(&x0_mnt, &x1_mnt, b_mnt, x_mnt);
    sub128(&x0_mnt, &x1_mnt, 0, (uint64_t)1 << 32, x0_mnt, x1_mnt);
    lsr128(&x0_mnt, &x1_mnt, x0_mnt, x1_mnt, 32);
    mul64x32(&x0_mnt, &x1_mnt, x0_mnt, x_mnt);
    lsr128(&x0_mnt, &x1_mnt, x0_mnt, x1_mnt, 33);

    // Multiply by dividend:
    x_sgn = a_sgn ^ b_sgn;
    x_exp = a_exp - b_exp + 1031;
    mul62x62(&x0_mnt, &x1_mnt, x0_mnt, a_mnt >> 2); // a 62x62-bit multiply suffices here
    lsr128(&x0_mnt, &x1_mnt, x0_mnt, x1_mnt, 4);
    x_mnt = x1_mnt;

    // This is an underestimate, so try adding one:
    mul62x62(&x0_mnt, &x1_mnt, b_mnt >> 2, x_mnt + 1); // a 62x62-bit multiply suffices here
    c = cmp128(x0_mnt, x1_mnt, 0, a_mnt >> 11);
    if (c <= 0) {
        ++x_mnt;
    }

    // 'c' doubles as the sticky/inexact indicator: c == 0 means the
    // quotient is exact.
    x_mnt = fp64_normalise(x_mnt, &x_exp);

    return fp64_round(x_sgn, x_exp, x_mnt << 1 | !!c, mode, flags);
}
+
+static void
+set_fpscr0(FPSCR &fpscr, int flags)
+{
+ if (flags & FPLIB_IDC) {
+ fpscr.idc = 1;
+ }
+ if (flags & FPLIB_IOC) {
+ fpscr.ioc = 1;
+ }
+ if (flags & FPLIB_DZC) {
+ fpscr.dzc = 1;
+ }
+ if (flags & FPLIB_OFC) {
+ fpscr.ofc = 1;
+ }
+ if (flags & FPLIB_UFC) {
+ fpscr.ufc = 1;
+ }
+ if (flags & FPLIB_IXC) {
+ fpscr.ixc = 1;
+ }
+}
+
static uint32_t
fp32_sqrt(uint32_t a, int mode, int *flags)
{
    // Single-precision square root via a seeded Newton iteration,
    // followed by an exact correction so rounding is bit-precise.
    int a_sgn, a_exp, x_sgn, x_exp;
    uint32_t a_mnt, x, x_mnt;
    uint64_t t0, t1;

    fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);

    // Handle NaNs:
    if (a_exp == 255 && (uint32_t)(a_mnt << 9))
        return fp32_process_NaN(a, mode, flags);

    // Handle infinities and zeroes:
    if (!a_mnt) {
        return fp32_zero(a_sgn);
    }
    if (a_exp == 255 && !a_sgn) {
        return fp32_infinity(a_sgn);
    }
    // Square root of a negative (non-zero) number is invalid.
    if (a_sgn) {
        *flags |= FPLIB_IOC;
        return fp32_defaultNaN();
    }

    // Force an odd exponent so halving it below stays exact:
    a_mnt = fp32_normalise(a_mnt, &a_exp);
    if (!(a_exp & 1)) {
        ++a_exp;
        a_mnt >>= 1;
    }

    // x = (a * 3 + 5) / 8
    x = (a_mnt >> 2) + (a_mnt >> 3) + (5 << 28);

    // x = (a / x + x) / 2; // 16-bit accuracy
    x = (a_mnt / (x >> 15) + (x >> 16)) << 15;

    // x = (a / x + x) / 2; // 16-bit accuracy
    x = (a_mnt / (x >> 15) + (x >> 16)) << 15;

    // x = (a / x + x) / 2; // 32-bit accuracy
    x = ((((uint64_t)a_mnt << 32) / x) >> 2) + (x >> 1);

    x_sgn = 0;
    x_exp = (a_exp + 147) >> 1;
    x_mnt = ((x - (1 << 5)) >> 6) + 1;
    // Exact check: if x_mnt^2 overshoots the operand, step down one ulp.
    t1 = (uint64_t)x_mnt * x_mnt;
    t0 = (uint64_t)a_mnt << 19;
    if (t1 > t0) {
        --x_mnt;
    }

    x_mnt = fp32_normalise(x_mnt, &x_exp);

    // (t1 != t0) is the inexact/sticky bit.
    return fp32_round(x_sgn, x_exp, x_mnt << 1 | (t1 != t0), mode, flags);
}
+
static uint64_t
fp64_sqrt(uint64_t a, int mode, int *flags)
{
    // Double-precision square root: 32-bit Newton iterations as in
    // fp32_sqrt, extended to 64 bits with a reciprocal refinement, then
    // an exact 128-bit correction step.
    int a_sgn, a_exp, x_sgn, x_exp, c;
    uint64_t a_mnt, x_mnt, r, x0, x1;
    uint32_t x;

    fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags);

    // Handle NaNs:
    if (a_exp == 2047 && (uint64_t)(a_mnt << 12)) {
        return fp64_process_NaN(a, mode, flags);
    }

    // Handle infinities and zeroes:
    if (!a_mnt)
        return fp64_zero(a_sgn);
    if (a_exp == 2047 && !a_sgn)
        return fp64_infinity(a_sgn);
    // Square root of a negative (non-zero) number is invalid.
    if (a_sgn) {
        *flags |= FPLIB_IOC;
        return fp64_defaultNaN();
    }

    // Fix the exponent parity so halving it below stays exact:
    a_mnt = fp64_normalise(a_mnt, &a_exp);
    if (a_exp & 1) {
        ++a_exp;
        a_mnt >>= 1;
    }

    // x = (a * 3 + 5) / 8
    x = (a_mnt >> 34) + (a_mnt >> 35) + (5 << 28);

    // x = (a / x + x) / 2; // 16-bit accuracy
    x = ((a_mnt >> 32) / (x >> 15) + (x >> 16)) << 15;

    // x = (a / x + x) / 2; // 16-bit accuracy
    x = ((a_mnt >> 32) / (x >> 15) + (x >> 16)) << 15;

    // x = (a / x + x) / 2; // 32-bit accuracy
    x = ((a_mnt / x) >> 2) + (x >> 1);

    // r = 1 / x; // 32-bit accuracy
    r = ((uint64_t)1 << 62) / x;

    // r = r * (2 - x * r); // 64-bit accuracy
    mul64x32(&x0, &x1, -(uint64_t)x * r << 1, r);
    lsr128(&x0, &x1, x0, x1, 31);

    // x = (x + a * r) / 2; // 64-bit accuracy
    mul62x62(&x0, &x1, a_mnt >> 10, x0 >> 2);
    lsl128(&x0, &x1, x0, x1, 5);
    lsr128(&x0, &x1, x0, x1, 56);

    x0 = ((uint64_t)x << 31) + (x0 >> 1);

    x_sgn = 0;
    x_exp = (a_exp + 1053) >> 1;
    x_mnt = x0;
    x_mnt = ((x_mnt - (1 << 8)) >> 9) + 1;
    // Exact check: if x_mnt^2 overshoots the operand, step down one ulp.
    mul62x62(&x0, &x1, x_mnt, x_mnt);
    lsl128(&x0, &x1, x0, x1, 19);
    c = cmp128(x0, x1, 0, a_mnt);
    if (c > 0)
        --x_mnt;

    x_mnt = fp64_normalise(x_mnt, &x_exp);

    // 'c' non-zero means the result is inexact (sticky bit).
    return fp64_round(x_sgn, x_exp, x_mnt << 1 | !!c, mode, flags);
}
+
+static int
+modeConv(FPSCR fpscr)
+{
+ return (((int) fpscr) >> 22) & 0xF;
+}
+
// Fold accumulated FPLIB_* exception flags into the FPSCR sticky bits.
// Unlike set_fpscr0, this variant suppresses the inexact (IXC) bit when
// the inexactness stems from an underflow that was flushed to zero
// (fpscr.fz set) — NOTE(review): presumably to match architectural
// flush-to-zero behaviour; confirm against the ARM ARM.
static void
set_fpscr(FPSCR &fpscr, int flags)
{
    // translate back to FPSCR
    bool underflow = false;
    if (flags & FPLIB_IDC) {
        fpscr.idc = 1;
    }
    if (flags & FPLIB_IOC) {
        fpscr.ioc = 1;
    }
    if (flags & FPLIB_DZC) {
        fpscr.dzc = 1;
    }
    if (flags & FPLIB_OFC) {
        fpscr.ofc = 1;
    }
    if (flags & FPLIB_UFC) {
        underflow = true; // remember so IXC can be masked below
        fpscr.ufc = 1;
    }
    if ((flags & FPLIB_IXC) && !(underflow && fpscr.fz)) {
        fpscr.ixc = 1;
    }
}
+
+template <>
+bool
+fplibCompareEQ(uint32_t a, uint32_t b, FPSCR &fpscr)
+{
+ int flags = 0;
+ int x = fp32_compare_eq(a, b, modeConv(fpscr), &flags);
+ set_fpscr(fpscr, flags);
+ return x;
+}
+
+template <>
+bool
+fplibCompareGE(uint32_t a, uint32_t b, FPSCR &fpscr)
+{
+ int flags = 0;
+ int x = fp32_compare_ge(a, b, modeConv(fpscr), &flags);
+ set_fpscr(fpscr, flags);
+ return x;
+}
+
+template <>
+bool
+fplibCompareGT(uint32_t a, uint32_t b, FPSCR &fpscr)
+{
+ int flags = 0;
+ int x = fp32_compare_gt(a, b, modeConv(fpscr), &flags);
+ set_fpscr(fpscr, flags);
+ return x;
+}
+
+template <>
+bool
+fplibCompareEQ(uint64_t a, uint64_t b, FPSCR &fpscr)
+{
+ int flags = 0;
+ int x = fp64_compare_eq(a, b, modeConv(fpscr), &flags);
+ set_fpscr(fpscr, flags);
+ return x;
+}
+
+template <>
+bool
+fplibCompareGE(uint64_t a, uint64_t b, FPSCR &fpscr)
+{
+ int flags = 0;
+ int x = fp64_compare_ge(a, b, modeConv(fpscr), &flags);
+ set_fpscr(fpscr, flags);
+ return x;
+}
+
+template <>
+bool
+fplibCompareGT(uint64_t a, uint64_t b, FPSCR &fpscr)
+{
+ int flags = 0;
+ int x = fp64_compare_gt(a, b, modeConv(fpscr), &flags);
+ set_fpscr(fpscr, flags);
+ return x;
+}
+
+template <>
+uint32_t
+fplibAbs(uint32_t op)
+{
+ return op & ~((uint32_t)1 << 31);
+}
+
+template <>
+uint64_t
+fplibAbs(uint64_t op)
+{
+ return op & ~((uint64_t)1 << 63);
+}
+
+template <>
+uint32_t
+fplibAdd(uint32_t op1, uint32_t op2, FPSCR &fpscr)
+{
+ int flags = 0;
+ uint32_t result = fp32_add(op1, op2, 0, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
+template <>
+uint64_t
+fplibAdd(uint64_t op1, uint64_t op2, FPSCR &fpscr)
+{
+ int flags = 0;
+ uint64_t result = fp64_add(op1, op2, 0, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
template <>
int
fplibCompare(uint32_t op1, uint32_t op2, bool signal_nans, FPSCR &fpscr)
{
    // Single-precision compare returning an NZCV nibble:
    //   3 (0011, CV) unordered, 6 (0110, ZC) equal,
    //   2 (0010, C)  greater,   8 (1000, N)  less.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2, result;
    uint32_t mnt1, mnt2;

    fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    // Any NaN operand makes the comparison unordered. A signalling NaN
    // (quiet bit 22 clear), or any NaN when signal_nans is requested,
    // raises Invalid Operation.
    if ((exp1 == 255 && (uint32_t)(mnt1 << 9)) ||
        (exp2 == 255 && (uint32_t)(mnt2 << 9))) {
        result = 3;
        if ((exp1 == 255 && (uint32_t)(mnt1 << 9) && !(mnt1 >> 22 & 1)) ||
            (exp2 == 255 && (uint32_t)(mnt2 << 9) && !(mnt2 >> 22 & 1)) ||
            signal_nans)
            flags |= FPLIB_IOC;
    } else {
        // +0 and -0 compare equal; otherwise order by sign, then
        // exponent, then mantissa (flipped for negative operands).
        if (op1 == op2 || (!mnt1 && !mnt2)) {
            result = 6;
        } else if (sgn1 != sgn2) {
            result = sgn1 ? 8 : 2;
        } else if (exp1 != exp2) {
            result = sgn1 ^ (exp1 < exp2) ? 8 : 2;
        } else {
            result = sgn1 ^ (mnt1 < mnt2) ? 8 : 2;
        }
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
int
fplibCompare(uint64_t op1, uint64_t op2, bool signal_nans, FPSCR &fpscr)
{
    // Double-precision compare returning an NZCV nibble:
    //   3 (0011, CV) unordered, 6 (0110, ZC) equal,
    //   2 (0010, C)  greater,   8 (1000, N)  less.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2, result;
    uint64_t mnt1, mnt2;

    fp64_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp64_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    // Any NaN operand makes the comparison unordered. A signalling NaN
    // (quiet bit 51 clear), or any NaN when signal_nans is requested,
    // raises Invalid Operation.
    if ((exp1 == 2047 && (uint64_t)(mnt1 << 12)) ||
        (exp2 == 2047 && (uint64_t)(mnt2 << 12))) {
        result = 3;
        if ((exp1 == 2047 && (uint64_t)(mnt1 << 12) && !(mnt1 >> 51 & 1)) ||
            (exp2 == 2047 && (uint64_t)(mnt2 << 12) && !(mnt2 >> 51 & 1)) ||
            signal_nans)
            flags |= FPLIB_IOC;
    } else {
        // +0 and -0 compare equal; otherwise order by sign, then
        // exponent, then mantissa (flipped for negative operands).
        if (op1 == op2 || (!mnt1 && !mnt2)) {
            result = 6;
        } else if (sgn1 != sgn2) {
            result = sgn1 ? 8 : 2;
        } else if (exp1 != exp2) {
            result = sgn1 ^ (exp1 < exp2) ? 8 : 2;
        } else {
            result = sgn1 ^ (mnt1 < mnt2) ? 8 : 2;
        }
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
+static uint16_t
+fp16_FPConvertNaN_32(uint32_t op)
+{
+ return fp16_pack(op >> 31, 31, (uint16_t)1 << 9 | op >> 13);
+}
+
+static uint16_t
+fp16_FPConvertNaN_64(uint64_t op)
+{
+ return fp16_pack(op >> 63, 31, (uint16_t)1 << 9 | op >> 42);
+}
+
+static uint32_t
+fp32_FPConvertNaN_16(uint16_t op)
+{
+ return fp32_pack(op >> 15, 255, (uint32_t)1 << 22 | (uint32_t)op << 13);
+}
+
+static uint32_t
+fp32_FPConvertNaN_64(uint64_t op)
+{
+ return fp32_pack(op >> 63, 255, (uint32_t)1 << 22 | op >> 29);
+}
+
+static uint64_t
+fp64_FPConvertNaN_16(uint16_t op)
+{
+ return fp64_pack(op >> 15, 2047, (uint64_t)1 << 51 | (uint64_t)op << 42);
+}
+
+static uint64_t
+fp64_FPConvertNaN_32(uint32_t op)
+{
+ return fp64_pack(op >> 31, 2047, (uint64_t)1 << 51 | (uint64_t)op << 29);
+}
+
+static uint32_t
+fp32_FPOnePointFive(int sgn)
+{
+ return fp32_pack(sgn, 127, (uint64_t)1 << 22);
+}
+
+static uint64_t
+fp64_FPOnePointFive(int sgn)
+{
+ return fp64_pack(sgn, 1023, (uint64_t)1 << 51);
+}
+
+static uint32_t
+fp32_FPThree(int sgn)
+{
+ return fp32_pack(sgn, 128, (uint64_t)1 << 22);
+}
+
+static uint64_t
+fp64_FPThree(int sgn)
+{
+ return fp64_pack(sgn, 1024, (uint64_t)1 << 51);
+}
+
+static uint32_t
+fp32_FPTwo(int sgn)
+{
+ return fp32_pack(sgn, 128, 0);
+}
+
+static uint64_t
+fp64_FPTwo(int sgn)
+{
+ return fp64_pack(sgn, 1024, 0);
+}
+
template <>
uint16_t
fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr)
{
    // Convert single precision to half precision, honouring the
    // Alternative Half-Precision (AHP) format when fpscr.ahp is set
    // (AHP has no infinities or NaNs).
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint32_t mnt;
    uint16_t result;

    // Unpack floating-point operand optionally with flush-to-zero:
    fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    bool alt_hp = fpscr.ahp;

    if (exp == 255 && (uint32_t)(mnt << 9)) {
        // NaN input: AHP yields zero; otherwise propagate or use the
        // default NaN per fpscr.dn. Signalling NaNs (quiet bit clear)
        // and any NaN under AHP raise Invalid Operation.
        if (alt_hp) {
            result = fp16_zero(sgn);
        } else if (fpscr.dn) {
            result = fp16_defaultNaN();
        } else {
            result = fp16_FPConvertNaN_32(op);
        }
        if (!(mnt >> 22 & 1) || alt_hp) {
            flags |= FPLIB_IOC;
        }
    } else if (exp == 255) {
        // Infinity: AHP has no infinity encoding, so saturate and raise
        // Invalid Operation.
        if (alt_hp) {
            result = sgn << 15 | (uint16_t)0x7fff;
            flags |= FPLIB_IOC;
        } else {
            result = fp16_infinity(sgn);
        }
    } else if (!mnt) {
        result = fp16_zero(sgn);
    } else {
        // Re-bias the exponent and round; the low 25 bits collapse into
        // a sticky bit. NOTE(review): bit 4 of the mode word presumably
        // signals AHP to fp16_round_ — confirm against its definition.
        result = fp16_round_(sgn, exp - 127 + 15,
                             mnt >> 7 | !!(uint32_t)(mnt << 25),
                             rounding, mode | alt_hp << 4, &flags);
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint16_t
fplibConvert(uint64_t op, FPRounding rounding, FPSCR &fpscr)
{
    // Convert double precision to half precision, honouring the
    // Alternative Half-Precision (AHP) format when fpscr.ahp is set
    // (AHP has no infinities or NaNs).
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint64_t mnt;
    uint16_t result;

    // Unpack floating-point operand optionally with flush-to-zero:
    fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    bool alt_hp = fpscr.ahp;

    if (exp == 2047 && (uint64_t)(mnt << 12)) {
        // NaN input: AHP yields zero; otherwise propagate or use the
        // default NaN per fpscr.dn. Signalling NaNs (quiet bit clear)
        // and any NaN under AHP raise Invalid Operation.
        if (alt_hp) {
            result = fp16_zero(sgn);
        } else if (fpscr.dn) {
            result = fp16_defaultNaN();
        } else {
            result = fp16_FPConvertNaN_64(op);
        }
        if (!(mnt >> 51 & 1) || alt_hp) {
            flags |= FPLIB_IOC;
        }
    } else if (exp == 2047) {
        // Infinity: AHP has no infinity encoding, so saturate and raise
        // Invalid Operation.
        if (alt_hp) {
            result = sgn << 15 | (uint16_t)0x7fff;
            flags |= FPLIB_IOC;
        } else {
            result = fp16_infinity(sgn);
        }
    } else if (!mnt) {
        result = fp16_zero(sgn);
    } else {
        // Re-bias the exponent and round; the low bits collapse into a
        // sticky bit. NOTE(review): bit 4 of the mode word presumably
        // signals AHP to fp16_round_ — confirm against its definition.
        result = fp16_round_(sgn, exp - 1023 + 15,
                             mnt >> 36 | !!(uint64_t)(mnt << 28),
                             rounding, mode | alt_hp << 4, &flags);
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint32_t
fplibConvert(uint16_t op, FPRounding rounding, FPSCR &fpscr)
{
    // Convert half precision to single precision. Widening is exact, so
    // no rounding is needed on the normal path. With fpscr.ahp set the
    // all-ones exponent encodes a normal value, not infinity/NaN.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint16_t mnt;
    uint32_t result;

    // Unpack floating-point operand optionally with flush-to-zero:
    fp16_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    if (exp == 31 && !fpscr.ahp && (uint16_t)(mnt << 6)) {
        // NaN input: propagate or use the default NaN per fpscr.dn;
        // a signalling NaN (quiet bit 9 clear) raises Invalid Operation.
        if (fpscr.dn) {
            result = fp32_defaultNaN();
        } else {
            result = fp32_FPConvertNaN_16(op);
        }
        if (!(mnt >> 9 & 1)) {
            flags |= FPLIB_IOC;
        }
    } else if (exp == 31 && !fpscr.ahp) {
        result = fp32_infinity(sgn);
    } else if (!mnt) {
        result = fp32_zero(sgn);
    } else {
        // Normalise (handles subnormals) and re-bias; the +5 compensates
        // for the position fp16_normalise leaves the leading bit in.
        mnt = fp16_normalise(mnt, &exp);
        result = fp32_pack(sgn, exp - 15 + 127 + 5, (uint32_t)mnt << 8);
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint32_t
fplibConvert(uint64_t op, FPRounding rounding, FPSCR &fpscr)
{
    // Convert double precision to single precision with rounding.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint64_t mnt;
    uint32_t result;

    // Unpack floating-point operand optionally with flush-to-zero:
    fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    if (exp == 2047 && (uint64_t)(mnt << 12)) {
        // NaN input: propagate or use the default NaN per fpscr.dn;
        // a signalling NaN (quiet bit 51 clear) raises Invalid Operation.
        if (fpscr.dn) {
            result = fp32_defaultNaN();
        } else {
            result = fp32_FPConvertNaN_64(op);
        }
        if (!(mnt >> 51 & 1)) {
            flags |= FPLIB_IOC;
        }
    } else if (exp == 2047) {
        result = fp32_infinity(sgn);
    } else if (!mnt) {
        result = fp32_zero(sgn);
    } else {
        // Re-bias and round; the discarded low 44 bits collapse into a
        // sticky bit.
        result = fp32_round_(sgn, exp - 1023 + 127,
                             mnt >> 20 | !!(uint64_t)(mnt << 44),
                             rounding, mode, &flags);
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint64_t
fplibConvert(uint16_t op, FPRounding rounding, FPSCR &fpscr)
{
    // Convert half precision to double precision. Widening is exact, so
    // no rounding is needed on the normal path. With fpscr.ahp set the
    // all-ones exponent encodes a normal value, not infinity/NaN.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint16_t mnt;
    uint64_t result;

    // Unpack floating-point operand optionally with flush-to-zero:
    fp16_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    if (exp == 31 && !fpscr.ahp && (uint16_t)(mnt << 6)) {
        // NaN input: propagate or use the default NaN per fpscr.dn;
        // a signalling NaN (quiet bit 9 clear) raises Invalid Operation.
        if (fpscr.dn) {
            result = fp64_defaultNaN();
        } else {
            result = fp64_FPConvertNaN_16(op);
        }
        if (!(mnt >> 9 & 1)) {
            flags |= FPLIB_IOC;
        }
    } else if (exp == 31 && !fpscr.ahp) {
        result = fp64_infinity(sgn);
    } else if (!mnt) {
        result = fp64_zero(sgn);
    } else {
        // Normalise (handles subnormals) and re-bias; the +5 compensates
        // for the position fp16_normalise leaves the leading bit in.
        mnt = fp16_normalise(mnt, &exp);
        result = fp64_pack(sgn, exp - 15 + 1023 + 5, (uint64_t)mnt << 37);
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint64_t
fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr)
{
    // Convert single precision to double precision. Widening is exact,
    // so no rounding is needed on the normal path.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint32_t mnt;
    uint64_t result;

    // Unpack floating-point operand optionally with flush-to-zero:
    fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    if (exp == 255 && (uint32_t)(mnt << 9)) {
        // NaN input: propagate or use the default NaN per fpscr.dn;
        // a signalling NaN (quiet bit 22 clear) raises Invalid Operation.
        if (fpscr.dn) {
            result = fp64_defaultNaN();
        } else {
            result = fp64_FPConvertNaN_32(op);
        }
        if (!(mnt >> 22 & 1)) {
            flags |= FPLIB_IOC;
        }
    } else if (exp == 255) {
        result = fp64_infinity(sgn);
    } else if (!mnt) {
        result = fp64_zero(sgn);
    } else {
        // Normalise (handles subnormals) and re-bias; the +8 compensates
        // for the position fp32_normalise leaves the leading bit in.
        mnt = fp32_normalise(mnt, &exp);
        result = fp64_pack(sgn, exp - 127 + 1023 + 8, (uint64_t)mnt << 21);
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
+template <>
+uint32_t
+fplibMulAdd(uint32_t addend, uint32_t op1, uint32_t op2, FPSCR &fpscr)
+{
+ int flags = 0;
+ uint32_t result = fp32_muladd(addend, op1, op2, 0, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
+template <>
+uint64_t
+fplibMulAdd(uint64_t addend, uint64_t op1, uint64_t op2, FPSCR &fpscr)
+{
+ int flags = 0;
+ uint64_t result = fp64_muladd(addend, op1, op2, 0, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
+template <>
+uint32_t
+fplibDiv(uint32_t op1, uint32_t op2, FPSCR &fpscr)
+{
+ int flags = 0;
+ uint32_t result = fp32_div(op1, op2, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
+template <>
+uint64_t
+fplibDiv(uint64_t op1, uint64_t op2, FPSCR &fpscr)
+{
+ int flags = 0;
+ uint64_t result = fp64_div(op1, op2, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
+static uint32_t
+fp32_repack(int sgn, int exp, uint32_t mnt)
+{
+ return fp32_pack(sgn, mnt >> 23 ? exp : 0, mnt);
+}
+
+static uint64_t
+fp64_repack(int sgn, int exp, uint64_t mnt)
+{
+ return fp64_pack(sgn, mnt >> 52 ? exp : 0, mnt);
+}
+
static void
fp32_minmaxnum(uint32_t *op1, uint32_t *op2, int sgn)
{
    // Treat a single quiet-NaN as +Infinity/-Infinity
    // The test !((uint32_t)~(*op << 1) >> 23) is true iff the exponent
    // is all-ones AND the quiet bit (bit 22) is set, i.e. the operand
    // is a quiet NaN. The replacement infinity's sign is chosen so it
    // always loses the subsequent min/max comparison.
    if (!((uint32_t)~(*op1 << 1) >> 23) && (uint32_t)~(*op2 << 1) >> 23)
        *op1 = fp32_infinity(sgn);
    if (!((uint32_t)~(*op2 << 1) >> 23) && (uint32_t)~(*op1 << 1) >> 23)
        *op2 = fp32_infinity(sgn);
}
+
static void
fp64_minmaxnum(uint64_t *op1, uint64_t *op2, int sgn)
{
    // Treat a single quiet-NaN as +Infinity/-Infinity
    // The test !((uint64_t)~(*op << 1) >> 52) is true iff the exponent
    // is all-ones AND the quiet bit (bit 51) is set, i.e. the operand
    // is a quiet NaN. The replacement infinity's sign is chosen so it
    // always loses the subsequent min/max comparison.
    if (!((uint64_t)~(*op1 << 1) >> 52) && (uint64_t)~(*op2 << 1) >> 52)
        *op1 = fp64_infinity(sgn);
    if (!((uint64_t)~(*op2 << 1) >> 52) && (uint64_t)~(*op1 << 1) >> 52)
        *op2 = fp64_infinity(sgn);
}
+
template <>
uint32_t
fplibMax(uint32_t op1, uint32_t op2, FPSCR &fpscr)
{
    // Single-precision maximum. NaNs are handled by fp32_process_NaNs;
    // the winner is repacked so subnormals keep a zero exponent.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint32_t mnt1, mnt2, x, result;

    fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    if ((x = fp32_process_NaNs(op1, op2, mode, &flags))) {
        result = x;
    } else {
        // If signs differ the non-negative operand wins; otherwise the
        // raw-bit comparison (flipped for negatives) picks the larger.
        result = ((sgn1 != sgn2 ? sgn2 : sgn1 ^ (op1 > op2)) ?
                  fp32_repack(sgn1, exp1, mnt1) :
                  fp32_repack(sgn2, exp2, mnt2));
    }
    set_fpscr0(fpscr, flags);
    return result;
}
+
template <>
uint64_t
fplibMax(uint64_t op1, uint64_t op2, FPSCR &fpscr)
{
    // Double-precision maximum. NaNs are handled by fp64_process_NaNs;
    // the winner is repacked so subnormals keep a zero exponent.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint64_t mnt1, mnt2, x, result;

    fp64_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp64_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    if ((x = fp64_process_NaNs(op1, op2, mode, &flags))) {
        result = x;
    } else {
        // If signs differ the non-negative operand wins; otherwise the
        // raw-bit comparison (flipped for negatives) picks the larger.
        result = ((sgn1 != sgn2 ? sgn2 : sgn1 ^ (op1 > op2)) ?
                  fp64_repack(sgn1, exp1, mnt1) :
                  fp64_repack(sgn2, exp2, mnt2));
    }
    set_fpscr0(fpscr, flags);
    return result;
}
+
+template <>
+uint32_t
+fplibMaxNum(uint32_t op1, uint32_t op2, FPSCR &fpscr)
+{
+ fp32_minmaxnum(&op1, &op2, 1);
+ return fplibMax<uint32_t>(op1, op2, fpscr);
+}
+
+template <>
+uint64_t
+fplibMaxNum(uint64_t op1, uint64_t op2, FPSCR &fpscr)
+{
+ fp64_minmaxnum(&op1, &op2, 1);
+ return fplibMax<uint64_t>(op1, op2, fpscr);
+}
+
template <>
uint32_t
fplibMin(uint32_t op1, uint32_t op2, FPSCR &fpscr)
{
    // Single-precision minimum. NaNs are handled by fp32_process_NaNs;
    // the winner is repacked so subnormals keep a zero exponent.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint32_t mnt1, mnt2, x, result;

    fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    if ((x = fp32_process_NaNs(op1, op2, mode, &flags))) {
        result = x;
    } else {
        // If signs differ the negative operand wins; otherwise the
        // raw-bit comparison (flipped for negatives) picks the smaller.
        result = ((sgn1 != sgn2 ? sgn1 : sgn1 ^ (op1 < op2)) ?
                  fp32_repack(sgn1, exp1, mnt1) :
                  fp32_repack(sgn2, exp2, mnt2));
    }
    set_fpscr0(fpscr, flags);
    return result;
}
+
template <>
uint64_t
fplibMin(uint64_t op1, uint64_t op2, FPSCR &fpscr)
{
    // Double-precision minimum. NaNs are handled by fp64_process_NaNs;
    // the winner is repacked so subnormals keep a zero exponent.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint64_t mnt1, mnt2, x, result;

    fp64_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp64_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    if ((x = fp64_process_NaNs(op1, op2, mode, &flags))) {
        result = x;
    } else {
        // If signs differ the negative operand wins; otherwise the
        // raw-bit comparison (flipped for negatives) picks the smaller.
        result = ((sgn1 != sgn2 ? sgn1 : sgn1 ^ (op1 < op2)) ?
                  fp64_repack(sgn1, exp1, mnt1) :
                  fp64_repack(sgn2, exp2, mnt2));
    }
    set_fpscr0(fpscr, flags);
    return result;
}
+
+template <>
+uint32_t
+fplibMinNum(uint32_t op1, uint32_t op2, FPSCR &fpscr)
+{
+ fp32_minmaxnum(&op1, &op2, 0);
+ return fplibMin<uint32_t>(op1, op2, fpscr);
+}
+
+template <>
+uint64_t
+fplibMinNum(uint64_t op1, uint64_t op2, FPSCR &fpscr)
+{
+ fp64_minmaxnum(&op1, &op2, 0);
+ return fplibMin<uint64_t>(op1, op2, fpscr);
+}
+
+template <>
+uint32_t
+fplibMul(uint32_t op1, uint32_t op2, FPSCR &fpscr)
+{
+ int flags = 0;
+ uint32_t result = fp32_mul(op1, op2, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
+template <>
+uint64_t
+fplibMul(uint64_t op1, uint64_t op2, FPSCR &fpscr)
+{
+ int flags = 0;
+ uint64_t result = fp64_mul(op1, op2, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
template <>
uint32_t
fplibMulX(uint32_t op1, uint32_t op2, FPSCR &fpscr)
{
    // Single-precision FMULX: like multiply, except 0 * infinity (in
    // either order) returns +/-2.0 instead of raising Invalid Operation.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint32_t mnt1, mnt2, result;

    fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    result = fp32_process_NaNs(op1, op2, mode, &flags);
    if (!result) {
        if ((exp1 == 255 && !mnt2) || (exp2 == 255 && !mnt1)) {
            // 0 * infinity: FMULX's special 2.0 result.
            result = fp32_FPTwo(sgn1 ^ sgn2);
        } else if (exp1 == 255 || exp2 == 255) {
            result = fp32_infinity(sgn1 ^ sgn2);
        } else if (!mnt1 || !mnt2) {
            result = fp32_zero(sgn1 ^ sgn2);
        } else {
            result = fp32_mul(op1, op2, mode, &flags);
        }
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint64_t
fplibMulX(uint64_t op1, uint64_t op2, FPSCR &fpscr)
{
    // Double-precision FMULX: like multiply, except 0 * infinity (in
    // either order) returns +/-2.0 instead of raising Invalid Operation.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint64_t mnt1, mnt2, result;

    fp64_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp64_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    result = fp64_process_NaNs(op1, op2, mode, &flags);
    if (!result) {
        if ((exp1 == 2047 && !mnt2) || (exp2 == 2047 && !mnt1)) {
            // 0 * infinity: FMULX's special 2.0 result.
            result = fp64_FPTwo(sgn1 ^ sgn2);
        } else if (exp1 == 2047 || exp2 == 2047) {
            result = fp64_infinity(sgn1 ^ sgn2);
        } else if (!mnt1 || !mnt2) {
            result = fp64_zero(sgn1 ^ sgn2);
        } else {
            result = fp64_mul(op1, op2, mode, &flags);
        }
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
+template <>
+uint32_t
+fplibNeg(uint32_t op)
+{
+ return op ^ (uint32_t)1 << 31;
+}
+
+template <>
+uint64_t
+fplibNeg(uint64_t op)
+{
+ return op ^ (uint64_t)1 << 63;
+}
+
// 256-entry lookup table for the reciprocal square-root estimate.
// It is indexed by the exponent's parity bit (bit 7) concatenated with
// the top mantissa bits, and yields the 8 estimate bits placed below the
// implicit leading one of the result (see fplibRSqrtEstimate).
static const uint8_t recip_sqrt_estimate[256] = {
    255, 253, 251, 249, 247, 245, 243, 242, 240, 238, 236, 234, 233, 231, 229, 228,
    226, 224, 223, 221, 219, 218, 216, 215, 213, 212, 210, 209, 207, 206, 204, 203,
    201, 200, 198, 197, 196, 194, 193, 192, 190, 189, 188, 186, 185, 184, 183, 181,
    180, 179, 178, 176, 175, 174, 173, 172, 170, 169, 168, 167, 166, 165, 164, 163,
    162, 160, 159, 158, 157, 156, 155, 154, 153, 152, 151, 150, 149, 148, 147, 146,
    145, 144, 143, 142, 141, 140, 140, 139, 138, 137, 136, 135, 134, 133, 132, 131,
    131, 130, 129, 128, 127, 126, 126, 125, 124, 123, 122, 121, 121, 120, 119, 118,
    118, 117, 116, 115, 114, 114, 113, 112, 111, 111, 110, 109, 109, 108, 107, 106,
    105, 104, 103, 101, 100,  99,  97,  96,  95,  93,  92,  91,  90,  88,  87,  86,
     85,  84,  82,  81,  80,  79,  78,  77,  76,  75,  74,  72,  71,  70,  69,  68,
     67,  66,  65,  64,  63,  62,  61,  60,  60,  59,  58,  57,  56,  55,  54,  53,
     52,  51,  51,  50,  49,  48,  47,  46,  46,  45,  44,  43,  42,  42,  41,  40,
     39,  38,  38,  37,  36,  35,  35,  34,  33,  33,  32,  31,  30,  30,  29,  28,
     28,  27,  26,  26,  25,  24,  24,  23,  22,  22,  21,  20,  20,  19,  19,  18,
     17,  17,  16,  16,  15,  14,  14,  13,  13,  12,  11,  11,  10,  10,   9,   9,
      8,   8,   7,   6,   6,   5,   5,   4,   4,   3,   3,   2,   2,   1,   1,   0
};
+
template <>
uint32_t
fplibRSqrtEstimate(uint32_t op, FPSCR &fpscr)
{
    // Single-precision FRSQRTE: table-based reciprocal square-root
    // estimate (8 result bits from recip_sqrt_estimate).
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint32_t mnt, result;

    fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    if (exp == 255 && (uint32_t)(mnt << 9)) {
        result = fp32_process_NaN(op, mode, &flags);
    } else if (!mnt) {
        // 1/sqrt(0) = infinity, divide-by-zero.
        result = fp32_infinity(sgn);
        flags |= FPLIB_DZC;
    } else if (sgn) {
        // Negative input is invalid.
        result = fp32_defaultNaN();
        flags |= FPLIB_IOC;
    } else if (exp == 255) {
        result = fp32_zero(0);
    } else {
        // Index: exponent parity (bit 7) plus top 7 mantissa bits.
        exp += 8;
        mnt = fp32_normalise(mnt, &exp);
        mnt = recip_sqrt_estimate[(~exp & 1) << 7 | (mnt >> 24 & 127)];
        result = fp32_pack(0, (380 - exp) >> 1, mnt << 15);
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
+template <>
+uint64_t
+fplibRSqrtEstimate(uint64_t op, FPSCR &fpscr)
+{
+ int mode = modeConv(fpscr);
+ int flags = 0;
+ int sgn, exp;
+ uint64_t mnt, result;
+
+ fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags);
+
+ if (exp == 2047 && (uint64_t)(mnt << 12)) {
+ result = fp64_process_NaN(op, mode, &flags);
+ } else if (!mnt) {
+ result = fp64_infinity(sgn);
+ flags |= FPLIB_DZC;
+ } else if (sgn) {
+ result = fp64_defaultNaN();
+ flags |= FPLIB_IOC;
+ } else if (exp == 2047) {
+ result = fp32_zero(0);
+ } else {
+ exp += 11;
+ mnt = fp64_normalise(mnt, &exp);
+ mnt = recip_sqrt_estimate[(~exp & 1) << 7 | (mnt >> 56 & 127)];
+ result = fp64_pack(0, (3068 - exp) >> 1, mnt << 44);
+ }
+
+ set_fpscr0(fpscr, flags);
+
+ return result;
+}
+
template <>
uint32_t
fplibRSqrtStepFused(uint32_t op1, uint32_t op2, FPSCR &fpscr)
{
    // Single-precision FRSQRTS: computes (3 - op1 * op2) / 2 as a fused
    // operation, negating op1 up front so the muladd does the subtract.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint32_t mnt1, mnt2, result;

    op1 = fplibNeg<uint32_t>(op1);
    fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    result = fp32_process_NaNs(op1, op2, mode, &flags);
    if (!result) {
        if ((exp1 == 255 && !mnt2) || (exp2 == 255 && !mnt1)) {
            // 0 * infinity: architected result is +1.5.
            result = fp32_FPOnePointFive(0);
        } else if (exp1 == 255 || exp2 == 255) {
            result = fp32_infinity(sgn1 ^ sgn2);
        } else {
            // (3 + (-op1) * op2) / 2; the -1 scale argument halves.
            result = fp32_muladd(fp32_FPThree(0), op1, op2, -1, mode, &flags);
        }
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint64_t
fplibRSqrtStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr)
{
    // Double-precision FRSQRTS: computes (3 - op1 * op2) / 2 as a fused
    // operation, negating op1 up front so the muladd does the subtract.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint64_t mnt1, mnt2, result;

    op1 = fplibNeg<uint64_t>(op1);
    fp64_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp64_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    result = fp64_process_NaNs(op1, op2, mode, &flags);
    if (!result) {
        if ((exp1 == 2047 && !mnt2) || (exp2 == 2047 && !mnt1)) {
            // 0 * infinity: architected result is +1.5.
            result = fp64_FPOnePointFive(0);
        } else if (exp1 == 2047 || exp2 == 2047) {
            result = fp64_infinity(sgn1 ^ sgn2);
        } else {
            // (3 + (-op1) * op2) / 2; the -1 scale argument halves.
            result = fp64_muladd(fp64_FPThree(0), op1, op2, -1, mode, &flags);
        }
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint32_t
fplibRecipStepFused(uint32_t op1, uint32_t op2, FPSCR &fpscr)
{
    // Single-precision FRECPS: computes 2 - op1 * op2 as a fused
    // operation, negating op1 up front so the muladd does the subtract.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint32_t mnt1, mnt2, result;

    op1 = fplibNeg<uint32_t>(op1);
    fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    result = fp32_process_NaNs(op1, op2, mode, &flags);
    if (!result) {
        if ((exp1 == 255 && !mnt2) || (exp2 == 255 && !mnt1)) {
            // 0 * infinity: architected result is +2.0.
            result = fp32_FPTwo(0);
        } else if (exp1 == 255 || exp2 == 255) {
            result = fp32_infinity(sgn1 ^ sgn2);
        } else {
            result = fp32_muladd(fp32_FPTwo(0), op1, op2, 0, mode, &flags);
        }
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint32_t
fplibRecipEstimate(uint32_t op, FPSCR &fpscr)
{
    // Single-precision FRECPE: reciprocal estimate via a small integer
    // division, with explicit overflow/underflow handling.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint32_t mnt, result;

    fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    if (exp == 255 && (uint32_t)(mnt << 9)) {
        result = fp32_process_NaN(op, mode, &flags);
    } else if (exp == 255) {
        // 1/inf = 0.
        result = fp32_zero(sgn);
    } else if (!mnt) {
        // 1/0 = infinity, divide-by-zero.
        result = fp32_infinity(sgn);
        flags |= FPLIB_DZC;
    } else if (!((uint32_t)(op << 1) >> 22)) {
        // Operand magnitude is so small the reciprocal overflows: round
        // to infinity or to the largest normal per the rounding mode.
        bool overflow_to_inf;
        switch (FPCRRounding(fpscr)) {
          case FPRounding_TIEEVEN:
            overflow_to_inf = true;
            break;
          case FPRounding_POSINF:
            overflow_to_inf = !sgn;
            break;
          case FPRounding_NEGINF:
            overflow_to_inf = sgn;
            break;
          case FPRounding_ZERO:
            overflow_to_inf = false;
            break;
          default:
            assert(0);
        }
        result = overflow_to_inf ? fp32_infinity(sgn) : fp32_max_normal(sgn);
        flags |= FPLIB_OFC | FPLIB_IXC;
    } else if (fpscr.fz && exp >= 253) {
        // Result would be subnormal and flush-to-zero is enabled.
        result = fp32_zero(sgn);
        flags |= FPLIB_UFC;
    } else {
        // Estimate from the top 9 mantissa bits; the |1 forces rounding
        // of the division toward an odd-based midpoint.
        exp += 8;
        mnt = fp32_normalise(mnt, &exp);
        int result_exp = 253 - exp;
        uint32_t fraction = (((uint32_t)1 << 19) / (mnt >> 22 | 1) + 1) >> 1;
        fraction <<= 15;
        // Denormalise for results just below the normal range.
        if (result_exp == 0) {
            fraction >>= 1;
        } else if (result_exp == -1) {
            fraction >>= 2;
            result_exp = 0;
        }
        result = fp32_pack(sgn, result_exp, fraction);
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint64_t
fplibRecipEstimate(uint64_t op, FPSCR &fpscr)
{
    // Double-precision FRECPE: reciprocal estimate via a small integer
    // division, with explicit overflow/underflow handling.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint64_t mnt, result;

    fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    if (exp == 2047 && (uint64_t)(mnt << 12)) {
        result = fp64_process_NaN(op, mode, &flags);
    } else if (exp == 2047) {
        // 1/inf = 0.
        result = fp64_zero(sgn);
    } else if (!mnt) {
        // 1/0 = infinity, divide-by-zero.
        result = fp64_infinity(sgn);
        flags |= FPLIB_DZC;
    } else if (!((uint64_t)(op << 1) >> 51)) {
        // Operand magnitude is so small the reciprocal overflows: round
        // to infinity or to the largest normal per the rounding mode.
        bool overflow_to_inf;
        switch (FPCRRounding(fpscr)) {
          case FPRounding_TIEEVEN:
            overflow_to_inf = true;
            break;
          case FPRounding_POSINF:
            overflow_to_inf = !sgn;
            break;
          case FPRounding_NEGINF:
            overflow_to_inf = sgn;
            break;
          case FPRounding_ZERO:
            overflow_to_inf = false;
            break;
          default:
            assert(0);
        }
        result = overflow_to_inf ? fp64_infinity(sgn) : fp64_max_normal(sgn);
        flags |= FPLIB_OFC | FPLIB_IXC;
    } else if (fpscr.fz && exp >= 2045) {
        // Result would be subnormal and flush-to-zero is enabled.
        result = fp64_zero(sgn);
        flags |= FPLIB_UFC;
    } else {
        // Estimate from the top 10 mantissa bits; the |1 forces rounding
        // of the division toward an odd-based midpoint.
        exp += 11;
        mnt = fp64_normalise(mnt, &exp);
        int result_exp = 2045 - exp;
        uint64_t fraction = (((uint32_t)1 << 19) / (mnt >> 54 | 1) + 1) >> 1;
        fraction <<= 44;
        // Denormalise for results just below the normal range.
        if (result_exp == 0) {
            fraction >>= 1;
        } else if (result_exp == -1) {
            fraction >>= 2;
            result_exp = 0;
        }
        result = fp64_pack(sgn, result_exp, fraction);
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint64_t
fplibRecipStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr)
{
    // Double-precision FRECPS: computes 2 - op1 * op2 as a fused
    // operation, negating op1 up front so the muladd does the subtract.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn1, exp1, sgn2, exp2;
    uint64_t mnt1, mnt2, result;

    op1 = fplibNeg<uint64_t>(op1);
    fp64_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags);
    fp64_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags);

    result = fp64_process_NaNs(op1, op2, mode, &flags);
    if (!result) {
        if ((exp1 == 2047 && !mnt2) || (exp2 == 2047 && !mnt1)) {
            // 0 * infinity: architected result is +2.0.
            result = fp64_FPTwo(0);
        } else if (exp1 == 2047 || exp2 == 2047) {
            result = fp64_infinity(sgn1 ^ sgn2);
        } else {
            result = fp64_muladd(fp64_FPTwo(0), op1, op2, 0, mode, &flags);
        }
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
template <>
uint32_t
fplibRecpX(uint32_t op, FPSCR &fpscr)
{
    // Single-precision FRECPX: exponent-only reciprocal — the result's
    // exponent is the bitwise complement of the operand's, mantissa zero.
    int mode = modeConv(fpscr);
    int flags = 0;
    int sgn, exp;
    uint32_t mnt, result;

    fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags);

    if (exp == 255 && (uint32_t)(mnt << 9)) {
        result = fp32_process_NaN(op, mode, &flags);
    }
    else {
        if (!mnt) { // Zero and denormals
            // Maximum exponent, as the complement of exponent 0 would
            // be the NaN/infinity encoding.
            result = fp32_pack(sgn, 254, 0);
        } else { // Infinities and normals
            result = fp32_pack(sgn, exp ^ 255, 0);
        }
    }

    set_fpscr0(fpscr, flags);

    return result;
}
+
+template <>
+uint64_t
+fplibRecpX(uint64_t op, FPSCR &fpscr) // FRECPX, double precision: reciprocal exponent estimate
+{
+ int mode = modeConv(fpscr);
+ int flags = 0;
+ int sgn, exp;
+ uint64_t mnt, result;
+
+ fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags);
+
+ if (exp == 2047 && (uint64_t)(mnt << 12)) { // NaN input (max exponent, non-zero fraction)
+ result = fp64_process_NaN(op, mode, &flags);
+ }
+ else {
+ if (!mnt) { // Zero and denormals
+ result = fp64_pack(sgn, 2046, 0); // largest finite exponent, mantissa cleared
+ } else { // Infinities and normals
+ result = fp64_pack(sgn, exp ^ 2047, 0); // complement the exponent, mantissa cleared
+ }
+ }
+
+ set_fpscr0(fpscr, flags);
+
+ return result;
+}
+
+template <>
+uint32_t
+fplibRoundInt(uint32_t op, FPRounding rounding, bool exact, FPSCR &fpscr) // FRINT*, single precision: round to integral value in FP format
+{
+ int mode = modeConv(fpscr);
+ int flags = 0;
+ int sgn, exp;
+ uint32_t mnt, result;
+
+ // Unpack using FPCR to determine if subnormals are flushed-to-zero:
+ fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags);
+
+ // Handle NaNs, infinities and zeroes:
+ if (exp == 255 && (uint32_t)(mnt << 9)) {
+ result = fp32_process_NaN(op, mode, &flags);
+ } else if (exp == 255) {
+ result = fp32_infinity(sgn);
+ } else if (!mnt) {
+ result = fp32_zero(sgn);
+ } else if (exp >= 150) {
+ // There are no fractional bits
+ result = op;
+ } else {
+ // Truncate towards zero:
+ uint32_t x = 150 - exp >= 32 ? 0 : mnt >> (150 - exp); // integer part of the magnitude
+ int err = exp < 118 ? 1 :
+ (mnt << 1 >> (149 - exp) & 3) | (mnt << 2 << (exp - 118) != 0); // (guard<<1)|sticky; fully-shifted-out values count as sticky-only
+ switch (rounding) {
+ case FPRounding_TIEEVEN:
+ x += (err == 3 || (err == 2 && (x & 1))); // round up past halfway, or at halfway when x is odd
+ break;
+ case FPRounding_POSINF:
+ x += err && !sgn;
+ break;
+ case FPRounding_NEGINF:
+ x += err && sgn;
+ break;
+ case FPRounding_ZERO:
+ break;
+ case FPRounding_TIEAWAY:
+ x += err >> 1; // round up when the fraction is >= 0.5
+ break;
+ default:
+ assert(0);
+ }
+
+ if (x == 0) {
+ result = fp32_zero(sgn); // rounding consumed the whole value; keep the sign
+ } else {
+ exp = 150;
+ mnt = fp32_normalise(x, &exp); // repack the integer as a normalised float
+ result = fp32_pack(sgn, exp + 8, mnt >> 8);
+ }
+
+ if (err && exact)
+ flags |= FPLIB_IXC; // inexact only signalled for the "exact" instruction forms
+ }
+
+ set_fpscr0(fpscr, flags);
+
+ return result;
+}
+
+template <>
+uint64_t
+fplibRoundInt(uint64_t op, FPRounding rounding, bool exact, FPSCR &fpscr) // FRINT*, double precision: round to integral value in FP format
+{
+ int mode = modeConv(fpscr);
+ int flags = 0;
+ int sgn, exp;
+ uint64_t mnt, result;
+
+ // Unpack using FPCR to determine if subnormals are flushed-to-zero:
+ fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags);
+
+ // Handle NaNs, infinities and zeroes:
+ if (exp == 2047 && (uint64_t)(mnt << 12)) {
+ result = fp64_process_NaN(op, mode, &flags);
+ } else if (exp == 2047) {
+ result = fp64_infinity(sgn);
+ } else if (!mnt) {
+ result = fp64_zero(sgn);
+ } else if (exp >= 1075) {
+ // There are no fractional bits
+ result = op;
+ } else {
+ // Truncate towards zero:
+ uint64_t x = 1075 - exp >= 64 ? 0 : mnt >> (1075 - exp); // integer part of the magnitude
+ int err = exp < 1011 ? 1 :
+ (mnt << 1 >> (1074 - exp) & 3) | (mnt << 2 << (exp - 1011) != 0); // (guard<<1)|sticky; fully-shifted-out values count as sticky-only
+ switch (rounding) {
+ case FPRounding_TIEEVEN:
+ x += (err == 3 || (err == 2 && (x & 1))); // round up past halfway, or at halfway when x is odd
+ break;
+ case FPRounding_POSINF:
+ x += err && !sgn;
+ break;
+ case FPRounding_NEGINF:
+ x += err && sgn;
+ break;
+ case FPRounding_ZERO:
+ break;
+ case FPRounding_TIEAWAY:
+ x += err >> 1; // round up when the fraction is >= 0.5
+ break;
+ default:
+ assert(0);
+ }
+
+ if (x == 0) {
+ result = fp64_zero(sgn); // rounding consumed the whole value; keep the sign
+ } else {
+ exp = 1075;
+ mnt = fp64_normalise(x, &exp); // repack the integer as a normalised double
+ result = fp64_pack(sgn, exp + 11, mnt >> 11);
+ }
+
+ if (err && exact)
+ flags |= FPLIB_IXC; // inexact only signalled for the "exact" instruction forms
+ }
+
+ set_fpscr0(fpscr, flags);
+
+ return result;
+}
+
+template <>
+uint32_t
+fplibSqrt(uint32_t op, FPSCR &fpscr) // FSQRT, single precision: thin wrapper over fp32_sqrt
+{
+ int flags = 0;
+ uint32_t result = fp32_sqrt(op, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags); // merge cumulative exception flags into FPSCR
+ return result;
+}
+
+template <>
+uint64_t
+fplibSqrt(uint64_t op, FPSCR &fpscr) // FSQRT, double precision: thin wrapper over fp64_sqrt
+{
+ int flags = 0;
+ uint64_t result = fp64_sqrt(op, modeConv(fpscr), &flags);
+ set_fpscr0(fpscr, flags); // merge cumulative exception flags into FPSCR
+ return result;
+}
+
+template <>
+uint32_t
+fplibSub(uint32_t op1, uint32_t op2, FPSCR &fpscr) // FSUB, single precision: add with the subtract flag set
+{
+ int flags = 0;
+ uint32_t result = fp32_add(op1, op2, 1, modeConv(fpscr), &flags); // third argument 1 selects subtraction
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
+template <>
+uint64_t
+fplibSub(uint64_t op1, uint64_t op2, FPSCR &fpscr) // FSUB, double precision: add with the subtract flag set
+{
+ int flags = 0;
+ uint64_t result = fp64_add(op1, op2, 1, modeConv(fpscr), &flags); // third argument 1 selects subtraction
+ set_fpscr0(fpscr, flags);
+ return result;
+}
+
+static uint64_t
+FPToFixed_64(int sgn, int exp, uint64_t mnt, bool u, FPRounding rounding,
+ int *flags) // round an unpacked double-format value (bias 1023) to a 64-bit (un)signed integer, saturating
+{
+ uint64_t x;
+ int err;
+
+ if (exp > 1023 + 63) { // magnitude is at least 2^64: guaranteed overflow
+ *flags = FPLIB_IOC;
+ return ((uint64_t)!u << 63) - !sgn; // saturation value: INT64_MIN/MAX or 0/UINT64_MAX
+ }
+
+ x = lsr64(mnt << 11, 1023 + 63 - exp); // truncated magnitude; mnt<<11 puts the leading bit at position 63
+ err = (exp > 1023 + 63 - 2 ? 0 :
+ (lsr64(mnt << 11, 1023 + 63 - 2 - exp) & 3) |
+ !!(mnt << 11 & (lsl64(1, 1023 + 63 - 2 - exp) - 1))); // (guard<<1)|sticky rounding error bits
+
+ switch (rounding) {
+ case FPRounding_TIEEVEN:
+ x += (err == 3 || (err == 2 && (x & 1))); // round up past halfway, or at halfway when x is odd
+ break;
+ case FPRounding_POSINF:
+ x += err && !sgn;
+ break;
+ case FPRounding_NEGINF:
+ x += err && sgn;
+ break;
+ case FPRounding_ZERO:
+ break;
+ case FPRounding_TIEAWAY:
+ x += err >> 1; // round up when the fraction is >= 0.5
+ break;
+ default:
+ assert(0);
+ }
+
+ if (u ? sgn && x : x > ((uint64_t)1 << 63) - !sgn) { // out of range after rounding (incl. negative-to-unsigned)
+ *flags = FPLIB_IOC;
+ return ((uint64_t)!u << 63) - !sgn; // saturate, invalid-operation flag set
+ }
+
+ if (err) {
+ *flags = FPLIB_IXC; // in-range but inexact
+ }
+
+ return sgn ? -x : x;
+}
+
+static uint32_t
+FPToFixed_32(int sgn, int exp, uint64_t mnt, bool u, FPRounding rounding,
+ int *flags) // 32-bit conversion: do the 64-bit conversion, then saturate to the 32-bit range
+{
+ uint64_t x = FPToFixed_64(sgn, exp, mnt, u, rounding, flags);
+ if (u ? x >= (uint64_t)1 << 32 :
+ !(x < (uint64_t)1 << 31 ||
+ (uint64_t)-x <= (uint64_t)1 << 31)) { // signed case checks both INT32_MAX and INT32_MIN bounds
+ *flags = FPLIB_IOC; // overwrites any IXC from the 64-bit step: saturation dominates
+ x = ((uint32_t)!u << 31) - !sgn; // saturation value: INT32_MIN/MAX or 0/UINT32_MAX
+ }
+ return x;
+}
+
+template <>
+uint32_t
+fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr) // FCVT* single -> 32-bit fixed-point (fbits fraction bits)
+{
+ int flags = 0;
+ int sgn, exp;
+ uint32_t mnt, result;
+
+ // Unpack using FPCR to determine if subnormals are flushed-to-zero:
+ fp32_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags);
+
+ // If NaN, set cumulative flag or take exception:
+ if (exp == 255 && (uint32_t)(mnt << 9)) {
+ flags = FPLIB_IOC;
+ result = 0; // NaN converts to zero with invalid-operation flagged
+ } else {
+ result = FPToFixed_32(sgn, exp + 1023 - 127 + fbits,
+ (uint64_t)mnt << (52 - 23), u, rounding, &flags); // rebias single to double format; fbits scales by 2^fbits
+ }
+
+ set_fpscr0(fpscr, flags);
+
+ return result;
+}
+
+template <>
+uint32_t
+fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr) // FCVT* double -> 32-bit fixed-point (fbits fraction bits)
+{
+ int flags = 0;
+ int sgn, exp;
+ uint64_t mnt;
+ uint32_t result;
+
+ // Unpack using FPCR to determine if subnormals are flushed-to-zero:
+ fp64_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags);
+
+ // If NaN, set cumulative flag or take exception:
+ if (exp == 2047 && (uint64_t)(mnt << 12)) {
+ flags = FPLIB_IOC;
+ result = 0; // NaN converts to zero with invalid-operation flagged
+ } else {
+ result = FPToFixed_32(sgn, exp + fbits, mnt, u, rounding, &flags); // adding fbits to the exponent scales by 2^fbits
+ }
+
+ set_fpscr0(fpscr, flags);
+
+ return result;
+}
+
+template <>
+uint64_t
+fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr) // FCVT* single -> 64-bit fixed-point (fbits fraction bits)
+{
+ int flags = 0;
+ int sgn, exp;
+ uint32_t mnt;
+ uint64_t result;
+
+ // Unpack using FPCR to determine if subnormals are flushed-to-zero:
+ fp32_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags);
+
+ // If NaN, set cumulative flag or take exception:
+ if (exp == 255 && (uint32_t)(mnt << 9)) {
+ flags = FPLIB_IOC;
+ result = 0; // NaN converts to zero with invalid-operation flagged
+ } else {
+ result = FPToFixed_64(sgn, exp + 1023 - 127 + fbits,
+ (uint64_t)mnt << (52 - 23), u, rounding, &flags); // rebias single to double format; fbits scales by 2^fbits
+ }
+
+ set_fpscr0(fpscr, flags);
+
+ return result;
+}
+
+template <>
+uint64_t
+fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr) // FCVT* double -> 64-bit fixed-point (fbits fraction bits)
+{
+ int flags = 0;
+ int sgn, exp;
+ uint64_t mnt, result;
+
+ // Unpack using FPCR to determine if subnormals are flushed-to-zero:
+ fp64_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags);
+
+ // If NaN, set cumulative flag or take exception:
+ if (exp == 2047 && (uint64_t)(mnt << 12)) {
+ flags = FPLIB_IOC;
+ result = 0; // NaN converts to zero with invalid-operation flagged
+ } else {
+ result = FPToFixed_64(sgn, exp + fbits, mnt, u, rounding, &flags); // adding fbits to the exponent scales by 2^fbits
+ }
+
+ set_fpscr0(fpscr, flags);
+
+ return result;
+}
+
+static uint32_t
+fp32_cvtf(uint64_t a, int fbits, int u, int mode, int *flags) // convert a 64-bit (un)signed fixed-point value to single precision
+{
+ int x_sgn = !u && a >> 63; // sign only meaningful for the signed variant
+ int x_exp = 190 - fbits; // 127 (bias) + 63 (leading-bit position) - fbits
+ uint64_t x_mnt = x_sgn ? -a : a; // work on the magnitude
+
+ // Handle zero:
+ if (!x_mnt) {
+ return fp32_zero(0);
+ }
+
+ // Normalise and convert to 32 bits, collapsing error into bottom bit:
+ x_mnt = fp64_normalise(x_mnt, &x_exp);
+ x_mnt = x_mnt >> 31 | !!(uint32_t)(x_mnt << 1); // keep discarded bits as a sticky bit for rounding
+
+ return fp32_round(x_sgn, x_exp, x_mnt, mode, flags);
+}
+
+static uint64_t
+fp64_cvtf(uint64_t a, int fbits, int u, int mode, int *flags) // convert a 64-bit (un)signed fixed-point value to double precision
+{
+ int x_sgn = !u && a >> 63; // sign only meaningful for the signed variant
+ int x_exp = 1024 + 62 - fbits; // 1023 (bias) + 63 (leading-bit position) - fbits
+ uint64_t x_mnt = x_sgn ? -a : a; // work on the magnitude
+
+ // Handle zero:
+ if (!x_mnt) {
+ return fp64_zero(0);
+ }
+
+ x_mnt = fp64_normalise(x_mnt, &x_exp);
+
+ return fp64_round(x_sgn, x_exp, x_mnt << 1, mode, flags); // <<1 leaves room for the rounding bits fp64_round expects
+}
+
+template <>
+uint32_t
+fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr) // *CVTF fixed-point -> single precision
+{
+ int flags = 0;
+ uint32_t res = fp32_cvtf(op, fbits, u,
+ (int)rounding | ((uint32_t)fpscr >> 22 & 12),
+ &flags); // merges explicit rounding with FPSCR mode bits (FZ/DN region above bit 23)
+ set_fpscr0(fpscr, flags);
+ return res;
+}
+
+template <>
+uint64_t
+fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr) // *CVTF fixed-point -> double precision
+{
+ int flags = 0;
+ uint64_t res = fp64_cvtf(op, fbits, u,
+ (int)rounding | ((uint32_t)fpscr >> 22 & 12),
+ &flags); // merges explicit rounding with FPSCR mode bits (FZ/DN region above bit 23)
+ set_fpscr0(fpscr, flags);
+ return res;
+}
+
+}
diff --git a/src/arch/arm/insts/fplib.hh b/src/arch/arm/insts/fplib.hh
new file mode 100644
index 000000000..6263687fc
--- /dev/null
+++ b/src/arch/arm/insts/fplib.hh
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2012-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Edmund Grimley Evans
+ * Thomas Grocutt
+ */
+
+/**
+ * @file
+ * Floating-point library code, which will gradually replace vfp.hh. For
+ * portability, this library does not use floating-point data types. Currently,
+ * C's standard integer types are used in the API, though this could be changed
+ * to something like class Fp32 { uint32_t x; }, etc.
+ */
+
+#ifndef __ARCH_ARM_INSTS_FPLIB_HH__
+#define __ARCH_ARM_INSTS_FPLIB_HH__
+
+#include <stdint.h>
+
+#include "arch/arm/miscregs.hh"
+
+namespace ArmISA
+{
+
+enum FPRounding { // rounding modes; values 0-3 match the FPSCR/FPCR RMode field encoding
+ FPRounding_TIEEVEN = 0, // round to nearest, ties to even
+ FPRounding_POSINF = 1, // round towards plus infinity
+ FPRounding_NEGINF = 2, // round towards minus infinity
+ FPRounding_ZERO = 3, // round towards zero (truncate)
+ FPRounding_TIEAWAY = 4, // round to nearest, ties away from zero
+ FPRounding_ODD = 5 // round to odd (only expressible in software, not in RMode)
+};
+
+static inline FPRounding
+FPCRRounding(FPSCR &fpscr) // extract the RMode rounding field (FPSCR bits [23:22])
+{
+ return (FPRounding)((uint32_t)fpscr >> 22 & 3);
+}
+
+/** Floating-point absolute value. */
+template <class T>
+T fplibAbs(T op);
+/** Floating-point add. */
+template <class T>
+T fplibAdd(T op1, T op2, FPSCR &fpscr);
+/** Floating-point compare (quiet and signaling). */
+template <class T>
+int fplibCompare(T op1, T op2, bool signal_nans, FPSCR &fpscr);
+/** Floating-point compare equal. */
+template <class T>
+bool fplibCompareEQ(T op1, T op2, FPSCR &fpscr);
+/** Floating-point compare greater than or equal. */
+template <class T>
+bool fplibCompareGE(T op1, T op2, FPSCR &fpscr);
+/** Floating-point compare greater than. */
+template <class T>
+bool fplibCompareGT(T op1, T op2, FPSCR &fpscr);
+/** Floating-point convert precision. */
+template <class T1, class T2>
+T2 fplibConvert(T1 op, FPRounding rounding, FPSCR &fpscr);
+/** Floating-point division. */
+template <class T>
+T fplibDiv(T op1, T op2, FPSCR &fpscr);
+/** Floating-point maximum. */
+template <class T>
+T fplibMax(T op1, T op2, FPSCR &fpscr);
+/** Floating-point maximum number. */
+template <class T>
+T fplibMaxNum(T op1, T op2, FPSCR &fpscr);
+/** Floating-point minimum. */
+template <class T>
+T fplibMin(T op1, T op2, FPSCR &fpscr);
+/** Floating-point minimum number. */
+template <class T>
+T fplibMinNum(T op1, T op2, FPSCR &fpscr);
+/** Floating-point multiply. */
+template <class T>
+T fplibMul(T op1, T op2, FPSCR &fpscr);
+/** Floating-point multiply-add. */
+template <class T>
+T fplibMulAdd(T addend, T op1, T op2, FPSCR &fpscr);
+/** Floating-point multiply extended. */
+template <class T>
+T fplibMulX(T op1, T op2, FPSCR &fpscr);
+/** Floating-point negate. */
+template <class T>
+T fplibNeg(T op);
+/** Floating-point reciprocal square root estimate. */
+template <class T>
+T fplibRSqrtEstimate(T op, FPSCR &fpscr);
+/** Floating-point reciprocal square root step. */
+template <class T>
+T fplibRSqrtStepFused(T op1, T op2, FPSCR &fpscr);
+/** Floating-point reciprocal estimate. */
+template <class T>
+T fplibRecipEstimate(T op, FPSCR &fpscr);
+/** Floating-point reciprocal step. */
+template <class T>
+T fplibRecipStepFused(T op1, T op2, FPSCR &fpscr);
+/** Floating-point reciprocal exponent. */
+template <class T>
+T fplibRecpX(T op, FPSCR &fpscr);
+/** Floating-point convert to integer. */
+template <class T>
+T fplibRoundInt(T op, FPRounding rounding, bool exact, FPSCR &fpscr);
+/** Floating-point square root. */
+template <class T>
+T fplibSqrt(T op, FPSCR &fpscr);
+/** Floating-point subtract. */
+template <class T>
+T fplibSub(T op1, T op2, FPSCR &fpscr);
+/** Floating-point convert to fixed-point. */
+template <class T1, class T2>
+T2 fplibFPToFixed(T1 op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr);
+/** Floating-point convert from fixed-point. */
+template <class T>
+T fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding,
+ FPSCR &fpscr);
+
+/* Function specializations... */
+template <>
+uint32_t fplibAbs(uint32_t op);
+template <>
+uint64_t fplibAbs(uint64_t op);
+template <>
+uint32_t fplibAdd(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibAdd(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+int fplibCompare(uint32_t op1, uint32_t op2, bool signal_nans, FPSCR &fpscr);
+template <>
+int fplibCompare(uint64_t op1, uint64_t op2, bool signal_nans, FPSCR &fpscr);
+template <>
+bool fplibCompareEQ(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+bool fplibCompareEQ(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+bool fplibCompareGE(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+bool fplibCompareGE(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+bool fplibCompareGT(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+bool fplibCompareGT(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint16_t fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr);
+template <>
+uint16_t fplibConvert(uint64_t op, FPRounding rounding, FPSCR &fpscr);
+template <>
+uint32_t fplibConvert(uint16_t op, FPRounding rounding, FPSCR &fpscr);
+template <>
+uint32_t fplibConvert(uint64_t op, FPRounding rounding, FPSCR &fpscr);
+template <>
+uint64_t fplibConvert(uint16_t op, FPRounding rounding, FPSCR &fpscr);
+template <>
+uint64_t fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr);
+template <>
+uint32_t fplibDiv(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibDiv(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibMax(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibMax(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibMaxNum(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibMaxNum(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibMin(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibMin(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibMinNum(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibMinNum(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibMul(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibMul(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibMulAdd(uint32_t addend, uint32_t op1, uint32_t op2,
+ FPSCR &fpscr);
+template <>
+uint64_t fplibMulAdd(uint64_t addend, uint64_t op1, uint64_t op2,
+ FPSCR &fpscr);
+template <>
+uint32_t fplibMulX(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibMulX(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibNeg(uint32_t op);
+template <>
+uint64_t fplibNeg(uint64_t op);
+template <>
+uint32_t fplibRSqrtEstimate(uint32_t op, FPSCR &fpscr);
+template<>
+uint64_t fplibRSqrtEstimate(uint64_t op, FPSCR &fpscr);
+template <>
+uint32_t fplibRSqrtStepFused(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibRSqrtStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibRecipEstimate(uint32_t op, FPSCR &fpscr);
+template <>
+uint64_t fplibRecipEstimate(uint64_t op, FPSCR &fpscr);
+template <>
+uint32_t fplibRecipStepFused(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibRecipStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibRecpX(uint32_t op, FPSCR &fpscr);
+template <>
+uint64_t fplibRecpX(uint64_t op, FPSCR &fpscr);
+template <>
+uint32_t fplibRoundInt(uint32_t op, FPRounding rounding, bool exact,
+ FPSCR &fpscr);
+template <>
+uint64_t fplibRoundInt(uint64_t op, FPRounding rounding, bool exact,
+ FPSCR &fpscr);
+template <>
+uint32_t fplibSqrt(uint32_t op, FPSCR &fpscr);
+template <>
+uint64_t fplibSqrt(uint64_t op, FPSCR &fpscr);
+template <>
+uint32_t fplibSub(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+uint64_t fplibSub(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
+uint32_t fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding,
+ FPSCR &fpscr);
+template <>
+uint32_t fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding,
+ FPSCR &fpscr);
+template <>
+uint64_t fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding,
+ FPSCR &fpscr);
+template <>
+uint64_t fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding,
+ FPSCR &fpscr);
+template <>
+uint32_t fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding,
+ FPSCR &fpscr);
+template <>
+uint64_t fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding,
+ FPSCR &fpscr);
+}
+
+#endif
diff --git a/src/arch/arm/insts/macromem.cc b/src/arch/arm/insts/macromem.cc
index 26a916fc7..42cb98a7c 100644
--- a/src/arch/arm/insts/macromem.cc
+++ b/src/arch/arm/insts/macromem.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010-2013 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -43,7 +43,9 @@
#include <sstream>
#include "arch/arm/insts/macromem.hh"
+
#include "arch/arm/generated/decoder.hh"
+#include "arch/arm/insts/neon64_mem.hh"
using namespace std;
using namespace ArmISAInst;
@@ -177,6 +179,212 @@ MacroMemOp::MacroMemOp(const char *mnem, ExtMachInst machInst,
}
}
+PairMemOp::PairMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ uint32_t size, bool fp, bool load, bool noAlloc,
+ bool signExt, bool exclusive, bool acrel,
+ int64_t imm, AddrMode mode,
+ IntRegIndex rn, IntRegIndex rt, IntRegIndex rt2) :
+ PredMacroOp(mnem, machInst, __opClass) // LDP/STP pair access: address uop + memory uops + optional writeback
+{
+ bool writeback = (mode != AddrMd_Offset); // pre/post-indexed forms update the base register
+ numMicroops = 1 + (size / 4) + (writeback ? 1 : 0); // size/4 matches the per-size memory-uop counts below
+ microOps = new StaticInstPtr[numMicroops];
+
+ StaticInstPtr *uop = microOps;
+
+ bool post = (mode == AddrMd_PostIndex);
+
+ rn = makeSP(rn); // base register 31 means SP, not XZR
+
+ *uop = new MicroAddXiSpAlignUop(machInst, INTREG_UREG0, rn, post ? 0 : imm); // UREG0 = effective address; post-index accesses at base+0
+
+ if (fp) {
+ if (size == 16) { // 128-bit registers: each transfer split into two 64-bit halves (4 uops)
+ if (load) {
+ *++uop = new MicroLdrQBFpXImmUop(machInst, rt,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ *++uop = new MicroLdrQTFpXImmUop(machInst, rt,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ *++uop = new MicroLdrQBFpXImmUop(machInst, rt2,
+ INTREG_UREG0, 16, noAlloc, exclusive, acrel);
+ *++uop = new MicroLdrQTFpXImmUop(machInst, rt2,
+ INTREG_UREG0, 16, noAlloc, exclusive, acrel);
+ } else {
+ *++uop = new MicroStrQBFpXImmUop(machInst, rt,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ *++uop = new MicroStrQTFpXImmUop(machInst, rt,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ *++uop = new MicroStrQBFpXImmUop(machInst, rt2,
+ INTREG_UREG0, 16, noAlloc, exclusive, acrel);
+ *++uop = new MicroStrQTFpXImmUop(machInst, rt2,
+ INTREG_UREG0, 16, noAlloc, exclusive, acrel);
+ }
+ } else if (size == 8) { // 64-bit FP registers: one uop per register
+ if (load) {
+ *++uop = new MicroLdrFpXImmUop(machInst, rt,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ *++uop = new MicroLdrFpXImmUop(machInst, rt2,
+ INTREG_UREG0, 8, noAlloc, exclusive, acrel);
+ } else {
+ *++uop = new MicroStrFpXImmUop(machInst, rt,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ *++uop = new MicroStrFpXImmUop(machInst, rt2,
+ INTREG_UREG0, 8, noAlloc, exclusive, acrel);
+ }
+ } else if (size == 4) { // 32-bit FP registers: one combined double-register uop
+ if (load) {
+ *++uop = new MicroLdrDFpXImmUop(machInst, rt, rt2,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ } else {
+ *++uop = new MicroStrDFpXImmUop(machInst, rt, rt2,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ }
+ }
+ } else {
+ if (size == 8) { // 64-bit integer registers: one uop per register
+ if (load) {
+ *++uop = new MicroLdrXImmUop(machInst, rt, INTREG_UREG0,
+ 0, noAlloc, exclusive, acrel);
+ *++uop = new MicroLdrXImmUop(machInst, rt2, INTREG_UREG0,
+ size, noAlloc, exclusive, acrel);
+ } else {
+ *++uop = new MicroStrXImmUop(machInst, rt, INTREG_UREG0,
+ 0, noAlloc, exclusive, acrel);
+ *++uop = new MicroStrXImmUop(machInst, rt2, INTREG_UREG0,
+ size, noAlloc, exclusive, acrel);
+ }
+ } else if (size == 4) { // 32-bit integer registers: one combined uop, sign- or zero-extending on load
+ if (load) {
+ if (signExt) {
+ *++uop = new MicroLdrDSXImmUop(machInst, rt, rt2,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ } else {
+ *++uop = new MicroLdrDUXImmUop(machInst, rt, rt2,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ }
+ } else {
+ *++uop = new MicroStrDXImmUop(machInst, rt, rt2,
+ INTREG_UREG0, 0, noAlloc, exclusive, acrel);
+ }
+ }
+ }
+
+ if (writeback) {
+ *++uop = new MicroAddXiUop(machInst, rn, INTREG_UREG0,
+ post ? imm : 0); // post-index adds imm here; pre-index already added it into UREG0
+ }
+
+ (*uop)->setLastMicroop();
+
+ for (StaticInstPtr *curUop = microOps;
+ !(*curUop)->isLastMicroop(); curUop++) {
+ (*curUop)->setDelayedCommit(); // all but the last uop commit together with the macro-op
+ }
+}
+
+BigFpMemImmOp::BigFpMemImmOp(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, bool load, IntRegIndex dest,
+ IntRegIndex base, int64_t imm) :
+ PredMacroOp(mnem, machInst, __opClass) // 128-bit FP load/store, immediate offset: split into two 64-bit half accesses
+{
+ numMicroops = 2;
+ microOps = new StaticInstPtr[numMicroops];
+
+ if (load) {
+ microOps[0] = new MicroLdrQBFpXImmUop(machInst, dest, base, imm); // QB/QT: presumably bottom/top 64-bit halves -- TODO confirm against uop defs
+ microOps[1] = new MicroLdrQTFpXImmUop(machInst, dest, base, imm);
+ } else {
+ microOps[0] = new MicroStrQBFpXImmUop(machInst, dest, base, imm);
+ microOps[1] = new MicroStrQTFpXImmUop(machInst, dest, base, imm);
+ }
+ microOps[0]->setDelayedCommit();
+ microOps[1]->setLastMicroop();
+}
+
+BigFpMemPostOp::BigFpMemPostOp(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, bool load, IntRegIndex dest,
+ IntRegIndex base, int64_t imm) :
+ PredMacroOp(mnem, machInst, __opClass) // 128-bit FP load/store, post-indexed: access at base+0, then base += imm
+{
+ numMicroops = 3;
+ microOps = new StaticInstPtr[numMicroops];
+
+ if (load) {
+ microOps[0] = new MicroLdrQBFpXImmUop(machInst, dest, base, 0); // post-index: access uses the unmodified base
+ microOps[1] = new MicroLdrQTFpXImmUop(machInst, dest, base, 0);
+ } else {
+ microOps[0] = new MicroStrQBFpXImmUop(machInst, dest, base, 0);
+ microOps[1] = new MicroStrQTFpXImmUop(machInst, dest, base, 0);
+ }
+ microOps[2] = new MicroAddXiUop(machInst, base, base, imm); // writeback after the access
+
+ microOps[0]->setDelayedCommit();
+ microOps[1]->setDelayedCommit();
+ microOps[2]->setLastMicroop();
+}
+
+BigFpMemPreOp::BigFpMemPreOp(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, bool load, IntRegIndex dest,
+ IntRegIndex base, int64_t imm) :
+ PredMacroOp(mnem, machInst, __opClass) // 128-bit FP load/store, pre-indexed: access at base+imm, and base += imm
+{
+ numMicroops = 3;
+ microOps = new StaticInstPtr[numMicroops];
+
+ if (load) {
+ microOps[0] = new MicroLdrQBFpXImmUop(machInst, dest, base, imm); // pre-index: access already uses the offset address
+ microOps[1] = new MicroLdrQTFpXImmUop(machInst, dest, base, imm);
+ } else {
+ microOps[0] = new MicroStrQBFpXImmUop(machInst, dest, base, imm);
+ microOps[1] = new MicroStrQTFpXImmUop(machInst, dest, base, imm);
+ }
+ microOps[2] = new MicroAddXiUop(machInst, base, base, imm); // writeback of the updated base
+
+ microOps[0]->setDelayedCommit();
+ microOps[1]->setDelayedCommit();
+ microOps[2]->setLastMicroop();
+}
+
+BigFpMemRegOp::BigFpMemRegOp(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, bool load, IntRegIndex dest,
+ IntRegIndex base, IntRegIndex offset,
+ ArmExtendType type, int64_t imm) :
+ PredMacroOp(mnem, machInst, __opClass) // 128-bit FP load/store with register offset (extended/shifted by type/imm)
+{
+ numMicroops = 2;
+ microOps = new StaticInstPtr[numMicroops];
+
+ if (load) {
+ microOps[0] = new MicroLdrQBFpXRegUop(machInst, dest, base,
+ offset, type, imm);
+ microOps[1] = new MicroLdrQTFpXRegUop(machInst, dest, base,
+ offset, type, imm);
+ } else {
+ microOps[0] = new MicroStrQBFpXRegUop(machInst, dest, base,
+ offset, type, imm);
+ microOps[1] = new MicroStrQTFpXRegUop(machInst, dest, base,
+ offset, type, imm);
+ }
+
+ microOps[0]->setDelayedCommit();
+ microOps[1]->setLastMicroop();
+}
+
+BigFpMemLitOp::BigFpMemLitOp(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, IntRegIndex dest,
+ int64_t imm) :
+ PredMacroOp(mnem, machInst, __opClass) // 128-bit FP literal (PC-relative) load, split into two 64-bit half loads
+{
+ numMicroops = 2;
+ microOps = new StaticInstPtr[numMicroops];
+
+ microOps[0] = new MicroLdrQBFpXLitUop(machInst, dest, imm);
+ microOps[1] = new MicroLdrQTFpXLitUop(machInst, dest, imm);
+
+ microOps[0]->setDelayedCommit();
+ microOps[1]->setLastMicroop();
+}
+
VldMultOp::VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
unsigned elems, RegIndex rn, RegIndex vd, unsigned regs,
unsigned inc, uint32_t size, uint32_t align, RegIndex rm) :
@@ -193,7 +401,7 @@ VldMultOp::VldMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
if (deinterleave) numMicroops += (regs / elems);
microOps = new StaticInstPtr[numMicroops];
- RegIndex rMid = deinterleave ? NumFloatArchRegs : vd * 2;
+ RegIndex rMid = deinterleave ? NumFloatV7ArchRegs : vd * 2;
uint32_t noAlign = TLB::MustBeOne;
@@ -295,7 +503,7 @@ VldSingleOp::VldSingleOp(const char *mnem, ExtMachInst machInst,
numMicroops += (regs / elems);
microOps = new StaticInstPtr[numMicroops];
- RegIndex ufp0 = NumFloatArchRegs;
+ RegIndex ufp0 = NumFloatV7ArchRegs;
unsigned uopIdx = 0;
switch (loadSize) {
@@ -556,7 +764,7 @@ VstMultOp::VstMultOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
uint32_t noAlign = TLB::MustBeOne;
- RegIndex rMid = interleave ? NumFloatArchRegs : vd * 2;
+ RegIndex rMid = interleave ? NumFloatV7ArchRegs : vd * 2;
unsigned uopIdx = 0;
if (interleave) {
@@ -657,7 +865,7 @@ VstSingleOp::VstSingleOp(const char *mnem, ExtMachInst machInst,
numMicroops += (regs / elems);
microOps = new StaticInstPtr[numMicroops];
- RegIndex ufp0 = NumFloatArchRegs;
+ RegIndex ufp0 = NumFloatV7ArchRegs;
unsigned uopIdx = 0;
switch (elems) {
@@ -834,6 +1042,285 @@ VstSingleOp::VstSingleOp(const char *mnem, ExtMachInst machInst,
microOps[numMicroops - 1]->setLastMicroop();
}
+VldMultOp64::VldMultOp64(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, RegIndex rn, RegIndex vd,
+ RegIndex rm, uint8_t eSize, uint8_t dataSize,
+ uint8_t numStructElems, uint8_t numRegs, bool wb) :
+ PredMacroOp(mnem, machInst, __opClass)
+{
+ RegIndex vx = NumFloatV8ArchRegs / 4;
+ RegIndex rnsp = (RegIndex) makeSP((IntRegIndex) rn);
+ bool baseIsSP = isSP((IntRegIndex) rnsp);
+
+ numMicroops = wb ? 1 : 0;
+
+ int totNumBytes = numRegs * dataSize / 8;
+ assert(totNumBytes <= 64);
+
+ // The guiding principle here is that no more than 16 bytes can be
+ // transferred at a time
+ int numMemMicroops = totNumBytes / 16;
+ int residuum = totNumBytes % 16;
+ if (residuum)
+ ++numMemMicroops;
+ numMicroops += numMemMicroops;
+
+ int numMarshalMicroops = numRegs / 2 + (numRegs % 2 ? 1 : 0);
+ numMicroops += numMarshalMicroops;
+
+ microOps = new StaticInstPtr[numMicroops];
+ unsigned uopIdx = 0;
+ uint32_t memaccessFlags = TLB::MustBeOne | (TLB::ArmFlags) eSize |
+ TLB::AllowUnaligned;
+
+ int i = 0;
+ for(; i < numMemMicroops - 1; ++i) {
+ microOps[uopIdx++] = new MicroNeonLoad64(
+ machInst, vx + (RegIndex) i, rnsp, 16 * i, memaccessFlags,
+ baseIsSP, 16 /* accSize */, eSize);
+ }
+ microOps[uopIdx++] = new MicroNeonLoad64(
+ machInst, vx + (RegIndex) i, rnsp, 16 * i, memaccessFlags, baseIsSP,
+ residuum ? residuum : 16 /* accSize */, eSize);
+
+ // Writeback microop: the post-increment amount is encoded in "Rm": a
+ // 64-bit general register OR as '11111' for an immediate value equal to
+ // the total number of bytes transferred (i.e. 8, 16, 24, 32, 48 or 64)
+ if (wb) {
+ if (rm != ((RegIndex) INTREG_X31)) {
+ microOps[uopIdx++] = new MicroAddXERegUop(machInst, rnsp, rnsp, rm,
+ UXTX, 0);
+ } else {
+ microOps[uopIdx++] = new MicroAddXiUop(machInst, rnsp, rnsp,
+ totNumBytes);
+ }
+ }
+
+ for (int i = 0; i < numMarshalMicroops; ++i) {
+ microOps[uopIdx++] = new MicroDeintNeon64(
+ machInst, vd + (RegIndex) (2 * i), vx, eSize, dataSize,
+ numStructElems, numRegs, i /* step */);
+ }
+
+ assert(uopIdx == numMicroops);
+
+ for (int i = 0; i < numMicroops - 1; ++i) {
+ microOps[i]->setDelayedCommit();
+ }
+ microOps[numMicroops - 1]->setLastMicroop();
+}
+
+VstMultOp64::VstMultOp64(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, RegIndex rn, RegIndex vd,
+ RegIndex rm, uint8_t eSize, uint8_t dataSize,
+ uint8_t numStructElems, uint8_t numRegs, bool wb) :
+ PredMacroOp(mnem, machInst, __opClass)
+{
+ RegIndex vx = NumFloatV8ArchRegs / 4;
+ RegIndex rnsp = (RegIndex) makeSP((IntRegIndex) rn);
+ bool baseIsSP = isSP((IntRegIndex) rnsp);
+
+ numMicroops = wb ? 1 : 0;
+
+ int totNumBytes = numRegs * dataSize / 8;
+ assert(totNumBytes <= 64);
+
+ // The guiding principle here is that no more than 16 bytes can be
+ // transferred at a time
+ int numMemMicroops = totNumBytes / 16;
+ int residuum = totNumBytes % 16;
+ if (residuum)
+ ++numMemMicroops;
+ numMicroops += numMemMicroops;
+
+ int numMarshalMicroops = totNumBytes > 32 ? 2 : 1;
+ numMicroops += numMarshalMicroops;
+
+ microOps = new StaticInstPtr[numMicroops];
+ unsigned uopIdx = 0;
+
+ for(int i = 0; i < numMarshalMicroops; ++i) {
+ microOps[uopIdx++] = new MicroIntNeon64(
+ machInst, vx + (RegIndex) (2 * i), vd, eSize, dataSize,
+ numStructElems, numRegs, i /* step */);
+ }
+
+ uint32_t memaccessFlags = TLB::MustBeOne | (TLB::ArmFlags) eSize |
+ TLB::AllowUnaligned;
+
+ int i = 0;
+ for(; i < numMemMicroops - 1; ++i) {
+ microOps[uopIdx++] = new MicroNeonStore64(
+ machInst, vx + (RegIndex) i, rnsp, 16 * i, memaccessFlags,
+ baseIsSP, 16 /* accSize */, eSize);
+ }
+ microOps[uopIdx++] = new MicroNeonStore64(
+ machInst, vx + (RegIndex) i, rnsp, 16 * i, memaccessFlags, baseIsSP,
+ residuum ? residuum : 16 /* accSize */, eSize);
+
+ // Writeback microop: the post-increment amount is encoded in "Rm": a
+ // 64-bit general register OR as '11111' for an immediate value equal to
+ // the total number of bytes transferred (i.e. 8, 16, 24, 32, 48 or 64)
+ if (wb) {
+ if (rm != ((RegIndex) INTREG_X31)) {
+ microOps[uopIdx++] = new MicroAddXERegUop(machInst, rnsp, rnsp, rm,
+ UXTX, 0);
+ } else {
+ microOps[uopIdx++] = new MicroAddXiUop(machInst, rnsp, rnsp,
+ totNumBytes);
+ }
+ }
+
+ assert(uopIdx == numMicroops);
+
+ for (int i = 0; i < numMicroops - 1; i++) {
+ microOps[i]->setDelayedCommit();
+ }
+ microOps[numMicroops - 1]->setLastMicroop();
+}
+
+VldSingleOp64::VldSingleOp64(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, RegIndex rn, RegIndex vd,
+ RegIndex rm, uint8_t eSize, uint8_t dataSize,
+ uint8_t numStructElems, uint8_t index, bool wb,
+ bool replicate) :
+ PredMacroOp(mnem, machInst, __opClass)
+{
+ RegIndex vx = NumFloatV8ArchRegs / 4;
+ RegIndex rnsp = (RegIndex) makeSP((IntRegIndex) rn);
+ bool baseIsSP = isSP((IntRegIndex) rnsp);
+
+ numMicroops = wb ? 1 : 0;
+
+ int eSizeBytes = 1 << eSize;
+ int totNumBytes = numStructElems * eSizeBytes;
+ assert(totNumBytes <= 64);
+
+ // The guiding principle here is that no more than 16 bytes can be
+ // transferred at a time
+ int numMemMicroops = totNumBytes / 16;
+ int residuum = totNumBytes % 16;
+ if (residuum)
+ ++numMemMicroops;
+ numMicroops += numMemMicroops;
+
+ int numMarshalMicroops = numStructElems / 2 + (numStructElems % 2 ? 1 : 0);
+ numMicroops += numMarshalMicroops;
+
+ microOps = new StaticInstPtr[numMicroops];
+ unsigned uopIdx = 0;
+
+ uint32_t memaccessFlags = TLB::MustBeOne | (TLB::ArmFlags) eSize |
+ TLB::AllowUnaligned;
+
+ int i = 0;
+ for (; i < numMemMicroops - 1; ++i) {
+ microOps[uopIdx++] = new MicroNeonLoad64(
+ machInst, vx + (RegIndex) i, rnsp, 16 * i, memaccessFlags,
+ baseIsSP, 16 /* accSize */, eSize);
+ }
+ microOps[uopIdx++] = new MicroNeonLoad64(
+ machInst, vx + (RegIndex) i, rnsp, 16 * i, memaccessFlags, baseIsSP,
+ residuum ? residuum : 16 /* accSize */, eSize);
+
+ // Writeback microop: the post-increment amount is encoded in "Rm": a
+ // 64-bit general register OR as '11111' for an immediate value equal to
+ // the total number of bytes transferred (i.e. 8, 16, 24, 32, 48 or 64)
+ if (wb) {
+ if (rm != ((RegIndex) INTREG_X31)) {
+ microOps[uopIdx++] = new MicroAddXERegUop(machInst, rnsp, rnsp, rm,
+ UXTX, 0);
+ } else {
+ microOps[uopIdx++] = new MicroAddXiUop(machInst, rnsp, rnsp,
+ totNumBytes);
+ }
+ }
+
+ for(int i = 0; i < numMarshalMicroops; ++i) {
+ microOps[uopIdx++] = new MicroUnpackNeon64(
+ machInst, vd + (RegIndex) (2 * i), vx, eSize, dataSize,
+ numStructElems, index, i /* step */, replicate);
+ }
+
+ assert(uopIdx == numMicroops);
+
+ for (int i = 0; i < numMicroops - 1; i++) {
+ microOps[i]->setDelayedCommit();
+ }
+ microOps[numMicroops - 1]->setLastMicroop();
+}
+
+VstSingleOp64::VstSingleOp64(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, RegIndex rn, RegIndex vd,
+ RegIndex rm, uint8_t eSize, uint8_t dataSize,
+ uint8_t numStructElems, uint8_t index, bool wb,
+ bool replicate) :
+ PredMacroOp(mnem, machInst, __opClass)
+{
+ RegIndex vx = NumFloatV8ArchRegs / 4;
+ RegIndex rnsp = (RegIndex) makeSP((IntRegIndex) rn);
+ bool baseIsSP = isSP((IntRegIndex) rnsp);
+
+ numMicroops = wb ? 1 : 0;
+
+ int eSizeBytes = 1 << eSize;
+ int totNumBytes = numStructElems * eSizeBytes;
+ assert(totNumBytes <= 64);
+
+ // The guiding principle here is that no more than 16 bytes can be
+ // transferred at a time
+ int numMemMicroops = totNumBytes / 16;
+ int residuum = totNumBytes % 16;
+ if (residuum)
+ ++numMemMicroops;
+ numMicroops += numMemMicroops;
+
+ int numMarshalMicroops = totNumBytes > 32 ? 2 : 1;
+ numMicroops += numMarshalMicroops;
+
+ microOps = new StaticInstPtr[numMicroops];
+ unsigned uopIdx = 0;
+
+ for(int i = 0; i < numMarshalMicroops; ++i) {
+ microOps[uopIdx++] = new MicroPackNeon64(
+ machInst, vx + (RegIndex) (2 * i), vd, eSize, dataSize,
+ numStructElems, index, i /* step */, replicate);
+ }
+
+ uint32_t memaccessFlags = TLB::MustBeOne | (TLB::ArmFlags) eSize |
+ TLB::AllowUnaligned;
+
+ int i = 0;
+ for(; i < numMemMicroops - 1; ++i) {
+ microOps[uopIdx++] = new MicroNeonStore64(
+ machInst, vx + (RegIndex) i, rnsp, 16 * i, memaccessFlags,
+ baseIsSP, 16 /* accSize */, eSize);
+ }
+ microOps[uopIdx++] = new MicroNeonStore64(
+ machInst, vx + (RegIndex) i, rnsp, 16 * i, memaccessFlags, baseIsSP,
+ residuum ? residuum : 16 /* accSize */, eSize);
+
+ // Writeback microop: the post-increment amount is encoded in "Rm": a
+ // 64-bit general register OR as '11111' for an immediate value equal to
+ // the total number of bytes transferred (i.e. 8, 16, 24, 32, 48 or 64)
+ if (wb) {
+ if (rm != ((RegIndex) INTREG_X31)) {
+ microOps[uopIdx++] = new MicroAddXERegUop(machInst, rnsp, rnsp, rm,
+ UXTX, 0);
+ } else {
+ microOps[uopIdx++] = new MicroAddXiUop(machInst, rnsp, rnsp,
+ totNumBytes);
+ }
+ }
+
+ assert(uopIdx == numMicroops);
+
+ for (int i = 0; i < numMicroops - 1; i++) {
+ microOps[i]->setDelayedCommit();
+ }
+ microOps[numMicroops - 1]->setLastMicroop();
+}
+
MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst,
OpClass __opClass, IntRegIndex rn,
RegIndex vd, bool single, bool up,
@@ -846,14 +1333,14 @@ MacroVFPMemOp::MacroVFPMemOp(const char *mnem, ExtMachInst machInst,
// to be functionally identical except that fldmx is deprecated. For now
// we'll assume they're otherwise interchangable.
int count = (single ? offset : (offset / 2));
- if (count == 0 || count > NumFloatArchRegs)
+ if (count == 0 || count > NumFloatV7ArchRegs)
warn_once("Bad offset field for VFP load/store multiple.\n");
if (count == 0) {
// Force there to be at least one microop so the macroop makes sense.
writeback = true;
}
- if (count > NumFloatArchRegs)
- count = NumFloatArchRegs;
+ if (count > NumFloatV7ArchRegs)
+ count = NumFloatV7ArchRegs;
numMicroops = count * (single ? 1 : 2) + (writeback ? 1 : 0);
microOps = new StaticInstPtr[numMicroops];
@@ -934,6 +1421,19 @@ MicroIntImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
}
std::string
+MicroIntImmXOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss);
+ printReg(ss, ura);
+ ss << ", ";
+ printReg(ss, urb);
+ ss << ", ";
+ ccprintf(ss, "#%d", imm);
+ return ss.str();
+}
+
+std::string
MicroSetPCCPSR::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
std::stringstream ss;
@@ -943,6 +1443,18 @@ MicroSetPCCPSR::generateDisassembly(Addr pc, const SymbolTable *symtab) const
}
std::string
+MicroIntRegXOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss);
+ printReg(ss, ura);
+ ccprintf(ss, ", ");
+ printReg(ss, urb);
+ printExtendOperand(false, ss, (IntRegIndex)urc, type, shiftAmt);
+ return ss.str();
+}
+
+std::string
MicroIntMov::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
std::stringstream ss;
diff --git a/src/arch/arm/insts/macromem.hh b/src/arch/arm/insts/macromem.hh
index 4933a1e7c..fc8e3e1b7 100644
--- a/src/arch/arm/insts/macromem.hh
+++ b/src/arch/arm/insts/macromem.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010-2013 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -85,6 +85,27 @@ class MicroOp : public PredOp
}
};
+class MicroOpX : public ArmStaticInst
+{
+ protected:
+ MicroOpX(const char *mnem, ExtMachInst machInst, OpClass __opClass)
+ : ArmStaticInst(mnem, machInst, __opClass)
+ {}
+
+ public:
+ void
+ advancePC(PCState &pcState) const
+ {
+ if (flags[IsLastMicroop]) {
+ pcState.uEnd();
+ } else if (flags[IsMicroop]) {
+ pcState.uAdvance();
+ } else {
+ pcState.advance();
+ }
+ }
+};
+
/**
* Microops for Neon loads/stores
*/
@@ -136,6 +157,96 @@ class MicroNeonMixLaneOp : public MicroNeonMixOp
};
/**
+ * Microops for AArch64 NEON load/store (de)interleaving
+ */
+class MicroNeonMixOp64 : public MicroOp
+{
+ protected:
+ RegIndex dest, op1;
+ uint8_t eSize, dataSize, numStructElems, numRegs, step;
+
+ MicroNeonMixOp64(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ RegIndex _dest, RegIndex _op1, uint8_t _eSize,
+ uint8_t _dataSize, uint8_t _numStructElems,
+ uint8_t _numRegs, uint8_t _step)
+ : MicroOp(mnem, machInst, __opClass), dest(_dest), op1(_op1),
+ eSize(_eSize), dataSize(_dataSize), numStructElems(_numStructElems),
+ numRegs(_numRegs), step(_step)
+ {
+ }
+};
+
+class MicroNeonMixLaneOp64 : public MicroOp
+{
+ protected:
+ RegIndex dest, op1;
+ uint8_t eSize, dataSize, numStructElems, lane, step;
+ bool replicate;
+
+ MicroNeonMixLaneOp64(const char *mnem, ExtMachInst machInst,
+ OpClass __opClass, RegIndex _dest, RegIndex _op1,
+ uint8_t _eSize, uint8_t _dataSize,
+ uint8_t _numStructElems, uint8_t _lane, uint8_t _step,
+ bool _replicate = false)
+ : MicroOp(mnem, machInst, __opClass), dest(_dest), op1(_op1),
+ eSize(_eSize), dataSize(_dataSize), numStructElems(_numStructElems),
+ lane(_lane), step(_step), replicate(_replicate)
+ {
+ }
+};
+
+/**
+ * Base classes for microcoded AArch64 NEON memory instructions.
+ */
+class VldMultOp64 : public PredMacroOp
+{
+ protected:
+ uint8_t eSize, dataSize, numStructElems, numRegs;
+ bool wb;
+
+ VldMultOp64(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ RegIndex rn, RegIndex vd, RegIndex rm, uint8_t eSize,
+ uint8_t dataSize, uint8_t numStructElems, uint8_t numRegs,
+ bool wb);
+};
+
+class VstMultOp64 : public PredMacroOp
+{
+ protected:
+ uint8_t eSize, dataSize, numStructElems, numRegs;
+ bool wb;
+
+ VstMultOp64(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ RegIndex rn, RegIndex vd, RegIndex rm, uint8_t eSize,
+ uint8_t dataSize, uint8_t numStructElems, uint8_t numRegs,
+ bool wb);
+};
+
+class VldSingleOp64 : public PredMacroOp
+{
+ protected:
+ uint8_t eSize, dataSize, numStructElems, index;
+ bool wb, replicate;
+
+ VldSingleOp64(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ RegIndex rn, RegIndex vd, RegIndex rm, uint8_t eSize,
+ uint8_t dataSize, uint8_t numStructElems, uint8_t index,
+ bool wb, bool replicate = false);
+};
+
+class VstSingleOp64 : public PredMacroOp
+{
+ protected:
+ uint8_t eSize, dataSize, numStructElems, index;
+ bool wb, replicate;
+
+ VstSingleOp64(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ RegIndex rn, RegIndex vd, RegIndex rm, uint8_t eSize,
+ uint8_t dataSize, uint8_t numStructElems, uint8_t index,
+ bool wb, bool replicate = false);
+};
+
+/**
* Microops of the form
* PC = IntRegA
* CPSR = IntRegB
@@ -180,10 +291,10 @@ class MicroIntImmOp : public MicroOp
{
protected:
RegIndex ura, urb;
- uint32_t imm;
+ int32_t imm;
MicroIntImmOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
- RegIndex _ura, RegIndex _urb, uint32_t _imm)
+ RegIndex _ura, RegIndex _urb, int32_t _imm)
: MicroOp(mnem, machInst, __opClass),
ura(_ura), urb(_urb), imm(_imm)
{
@@ -192,6 +303,22 @@ class MicroIntImmOp : public MicroOp
std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
};
+class MicroIntImmXOp : public MicroOpX
+{
+ protected:
+ RegIndex ura, urb;
+ int64_t imm;
+
+ MicroIntImmXOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ RegIndex _ura, RegIndex _urb, int64_t _imm)
+ : MicroOpX(mnem, machInst, __opClass),
+ ura(_ura), urb(_urb), imm(_imm)
+ {
+ }
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
/**
* Microops of the form IntRegA = IntRegB op IntRegC
*/
@@ -210,6 +337,25 @@ class MicroIntOp : public MicroOp
std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
};
+class MicroIntRegXOp : public MicroOp
+{
+ protected:
+ RegIndex ura, urb, urc;
+ ArmExtendType type;
+ uint32_t shiftAmt;
+
+ MicroIntRegXOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ RegIndex _ura, RegIndex _urb, RegIndex _urc,
+ ArmExtendType _type, uint32_t _shiftAmt)
+ : MicroOp(mnem, machInst, __opClass),
+ ura(_ura), urb(_urb), urc(_urc),
+ type(_type), shiftAmt(_shiftAmt)
+ {
+ }
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
/**
* Microops of the form IntRegA = IntRegB op shifted IntRegC
*/
@@ -261,6 +407,61 @@ class MacroMemOp : public PredMacroOp
};
/**
+ * Base class for pair load/store instructions.
+ */
+class PairMemOp : public PredMacroOp
+{
+ public:
+ enum AddrMode {
+ AddrMd_Offset,
+ AddrMd_PreIndex,
+ AddrMd_PostIndex
+ };
+
+ protected:
+ PairMemOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ uint32_t size, bool fp, bool load, bool noAlloc, bool signExt,
+ bool exclusive, bool acrel, int64_t imm, AddrMode mode,
+ IntRegIndex rn, IntRegIndex rt, IntRegIndex rt2);
+};
+
+class BigFpMemImmOp : public PredMacroOp
+{
+ protected:
+ BigFpMemImmOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ bool load, IntRegIndex dest, IntRegIndex base, int64_t imm);
+};
+
+class BigFpMemPostOp : public PredMacroOp
+{
+ protected:
+ BigFpMemPostOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ bool load, IntRegIndex dest, IntRegIndex base, int64_t imm);
+};
+
+class BigFpMemPreOp : public PredMacroOp
+{
+ protected:
+ BigFpMemPreOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ bool load, IntRegIndex dest, IntRegIndex base, int64_t imm);
+};
+
+class BigFpMemRegOp : public PredMacroOp
+{
+ protected:
+ BigFpMemRegOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ bool load, IntRegIndex dest, IntRegIndex base,
+ IntRegIndex offset, ArmExtendType type, int64_t imm);
+};
+
+class BigFpMemLitOp : public PredMacroOp
+{
+ protected:
+ BigFpMemLitOp(const char *mnem, ExtMachInst machInst, OpClass __opClass,
+ IntRegIndex dest, int64_t imm);
+};
+
+/**
* Base classes for microcoded integer memory instructions.
*/
class VldMultOp : public PredMacroOp
diff --git a/src/arch/arm/insts/mem.cc b/src/arch/arm/insts/mem.cc
index 552803b6a..15702ff83 100644
--- a/src/arch/arm/insts/mem.cc
+++ b/src/arch/arm/insts/mem.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010, 2012 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -157,6 +157,9 @@ SrsOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
case MODE_ABORT:
ss << "abort";
break;
+ case MODE_HYP:
+ ss << "hyp";
+ break;
case MODE_UNDEFINED:
ss << "undefined";
break;
diff --git a/src/arch/arm/insts/mem64.cc b/src/arch/arm/insts/mem64.cc
new file mode 100644
index 000000000..4d1fdd302
--- /dev/null
+++ b/src/arch/arm/insts/mem64.cc
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2011-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gabe Black
+ */
+
+#include "arch/arm/insts/mem64.hh"
+#include "arch/arm/tlb.hh"
+#include "base/loader/symtab.hh"
+#include "mem/request.hh"
+
+using namespace std;
+
+namespace ArmISA
+{
+
+std::string
+SysDC64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ ccprintf(ss, ", [");
+ printReg(ss, base);
+ ccprintf(ss, "]");
+ return ss.str();
+}
+
+
+
+void
+Memory64::startDisassembly(std::ostream &os) const
+{
+ printMnemonic(os, "", false);
+ printReg(os, dest);
+ ccprintf(os, ", [");
+ printReg(os, base);
+}
+
+void
+Memory64::setExcAcRel(bool exclusive, bool acrel)
+{
+ if (exclusive)
+ memAccessFlags |= Request::LLSC;
+ else
+ memAccessFlags |= ArmISA::TLB::AllowUnaligned;
+ if (acrel) {
+ flags[IsMemBarrier] = true;
+ flags[IsWriteBarrier] = true;
+ flags[IsReadBarrier] = true;
+ }
+}
+
+std::string
+MemoryImm64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ startDisassembly(ss);
+ if (imm)
+ ccprintf(ss, ", #%d", imm);
+ ccprintf(ss, "]");
+ return ss.str();
+}
+
+std::string
+MemoryDImm64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, dest2);
+ ccprintf(ss, ", [");
+ printReg(ss, base);
+ if (imm)
+ ccprintf(ss, ", #%d", imm);
+ ccprintf(ss, "]");
+ return ss.str();
+}
+
+std::string
+MemoryDImmEx64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, result);
+ ccprintf(ss, ", ");
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, dest2);
+ ccprintf(ss, ", [");
+ printReg(ss, base);
+ if (imm)
+ ccprintf(ss, ", #%d", imm);
+ ccprintf(ss, "]");
+ return ss.str();
+}
+
+std::string
+MemoryPreIndex64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ startDisassembly(ss);
+ ccprintf(ss, ", #%d]!", imm);
+ return ss.str();
+}
+
+std::string
+MemoryPostIndex64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ startDisassembly(ss);
+ ccprintf(ss, "]");
+ if (imm)
+ ccprintf(ss, ", #%d", imm);
+ return ss.str();
+}
+
+std::string
+MemoryReg64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ startDisassembly(ss);
+ printExtendOperand(false, ss, offset, type, shiftAmt);
+ ccprintf(ss, "]");
+ return ss.str();
+}
+
+std::string
+MemoryRaw64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ startDisassembly(ss);
+ ccprintf(ss, "]");
+ return ss.str();
+}
+
+std::string
+MemoryEx64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, result);
+ ccprintf(ss, ", [");
+ printReg(ss, base);
+ ccprintf(ss, "]");
+ return ss.str();
+}
+
+std::string
+MemoryLiteral64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", #%d", pc + imm);
+ return ss.str();
+}
+}
diff --git a/src/arch/arm/insts/mem64.hh b/src/arch/arm/insts/mem64.hh
new file mode 100644
index 000000000..21c1e1ea8
--- /dev/null
+++ b/src/arch/arm/insts/mem64.hh
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2011-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gabe Black
+ */
+#ifndef __ARCH_ARM_MEM64_HH__
+#define __ARCH_ARM_MEM64_HH__
+
+#include "arch/arm/insts/static_inst.hh"
+
+namespace ArmISA
+{
+
+class SysDC64 : public ArmStaticInst
+{
+ protected:
+ IntRegIndex base;
+ IntRegIndex dest;
+ uint64_t imm;
+
+ SysDC64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _base, IntRegIndex _dest, uint64_t _imm)
+ : ArmStaticInst(mnem, _machInst, __opClass), base(_base), dest(_dest),
+ imm(_imm)
+ {}
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class MightBeMicro64 : public ArmStaticInst
+{
+ protected:
+ MightBeMicro64(const char *mnem, ExtMachInst _machInst, OpClass __opClass)
+ : ArmStaticInst(mnem, _machInst, __opClass)
+ {}
+
+ void
+ advancePC(PCState &pcState) const
+ {
+ if (flags[IsLastMicroop]) {
+ pcState.uEnd();
+ } else if (flags[IsMicroop]) {
+ pcState.uAdvance();
+ } else {
+ pcState.advance();
+ }
+ }
+};
+
+class Memory64 : public MightBeMicro64
+{
+ public:
+ enum AddrMode {
+ AddrMd_Offset,
+ AddrMd_PreIndex,
+ AddrMd_PostIndex
+ };
+
+ protected:
+
+ IntRegIndex dest;
+ IntRegIndex base;
+ /// True if the base register is SP (used for SP alignment checking).
+ bool baseIsSP;
+ static const unsigned numMicroops = 3;
+
+ StaticInstPtr *uops;
+
+ Memory64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _base)
+ : MightBeMicro64(mnem, _machInst, __opClass),
+ dest(_dest), base(_base), uops(NULL)
+ {
+ baseIsSP = isSP(_base);
+ }
+
+ virtual
+ ~Memory64()
+ {
+ delete [] uops;
+ }
+
+ StaticInstPtr
+ fetchMicroop(MicroPC microPC) const
+ {
+ assert(uops != NULL && microPC < numMicroops);
+ return uops[microPC];
+ }
+
+ void startDisassembly(std::ostream &os) const;
+
+ unsigned memAccessFlags;
+
+ void setExcAcRel(bool exclusive, bool acrel);
+};
+
+class MemoryImm64 : public Memory64
+{
+ protected:
+ int64_t imm;
+
+ MemoryImm64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _base, int64_t _imm)
+ : Memory64(mnem, _machInst, __opClass, _dest, _base), imm(_imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class MemoryDImm64 : public MemoryImm64
+{
+ protected:
+ IntRegIndex dest2;
+
+ MemoryDImm64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _dest2, IntRegIndex _base,
+ int64_t _imm)
+ : MemoryImm64(mnem, _machInst, __opClass, _dest, _base, _imm),
+ dest2(_dest2)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class MemoryDImmEx64 : public MemoryDImm64
+{
+ protected:
+ IntRegIndex result;
+
+ MemoryDImmEx64(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _result, IntRegIndex _dest, IntRegIndex _dest2,
+ IntRegIndex _base, int32_t _imm)
+ : MemoryDImm64(mnem, _machInst, __opClass, _dest, _dest2,
+ _base, _imm), result(_result)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class MemoryPreIndex64 : public MemoryImm64
+{
+ protected:
+ MemoryPreIndex64(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _dest, IntRegIndex _base,
+ int64_t _imm)
+ : MemoryImm64(mnem, _machInst, __opClass, _dest, _base, _imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class MemoryPostIndex64 : public MemoryImm64
+{
+ protected:
+ MemoryPostIndex64(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _dest, IntRegIndex _base,
+ int64_t _imm)
+ : MemoryImm64(mnem, _machInst, __opClass, _dest, _base, _imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class MemoryReg64 : public Memory64
+{
+ protected:
+ IntRegIndex offset;
+ ArmExtendType type;
+ uint64_t shiftAmt;
+
+ MemoryReg64(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _dest, IntRegIndex _base,
+ IntRegIndex _offset, ArmExtendType _type,
+ uint64_t _shiftAmt)
+ : Memory64(mnem, _machInst, __opClass, _dest, _base),
+ offset(_offset), type(_type), shiftAmt(_shiftAmt)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class MemoryRaw64 : public Memory64
+{
+ protected:
+ MemoryRaw64(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _dest, IntRegIndex _base)
+ : Memory64(mnem, _machInst, __opClass, _dest, _base)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class MemoryEx64 : public Memory64
+{
+ protected:
+ IntRegIndex result;
+
+ MemoryEx64(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _dest, IntRegIndex _base,
+ IntRegIndex _result)
+ : Memory64(mnem, _machInst, __opClass, _dest, _base), result(_result)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class MemoryLiteral64 : public Memory64
+{
+ protected:
+ int64_t imm;
+
+ MemoryLiteral64(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _dest, int64_t _imm)
+ : Memory64(mnem, _machInst, __opClass, _dest, INTREG_ZERO), imm(_imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+}
+
+#endif //__ARCH_ARM_MEM64_HH__
diff --git a/src/arch/arm/insts/misc.cc b/src/arch/arm/insts/misc.cc
index 6320bb6da..efc334c4b 100644
--- a/src/arch/arm/insts/misc.cc
+++ b/src/arch/arm/insts/misc.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010, 2012-2013 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -146,6 +146,32 @@ MsrRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
}
std::string
+MrrcOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss);
+ printReg(ss, dest);
+ ss << ", ";
+ printReg(ss, dest2);
+ ss << ", ";
+ printReg(ss, op1);
+ return ss.str();
+}
+
+std::string
+McrrOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss);
+ printReg(ss, dest);
+ ss << ", ";
+ printReg(ss, op1);
+ ss << ", ";
+ printReg(ss, op2);
+ return ss.str();
+}
+
+std::string
ImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
std::stringstream ss;
@@ -230,6 +256,16 @@ RegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
}
std::string
+RegImmImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss);
+ printReg(ss, dest);
+ ccprintf(ss, ", #%d, #%d", imm1, imm2);
+ return ss.str();
+}
+
+std::string
RegRegImmImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
std::stringstream ss;
diff --git a/src/arch/arm/insts/misc.hh b/src/arch/arm/insts/misc.hh
index c9e114f85..3d947a272 100644
--- a/src/arch/arm/insts/misc.hh
+++ b/src/arch/arm/insts/misc.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010, 2012-2013 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -94,6 +94,42 @@ class MsrRegOp : public MsrBase
std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
};
+class MrrcOp : public PredOp
+{
+ protected:
+ IntRegIndex op1;
+ IntRegIndex dest;
+ IntRegIndex dest2;
+ uint32_t imm;
+
+ MrrcOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _op1, IntRegIndex _dest, IntRegIndex _dest2,
+ uint32_t _imm) :
+ PredOp(mnem, _machInst, __opClass), op1(_op1), dest(_dest),
+ dest2(_dest2), imm(_imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class McrrOp : public PredOp
+{
+ protected:
+ IntRegIndex op1;
+ IntRegIndex op2;
+ IntRegIndex dest;
+ uint32_t imm;
+
+ McrrOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _op1, IntRegIndex _op2, IntRegIndex _dest,
+ uint32_t _imm) :
+ PredOp(mnem, _machInst, __opClass), op1(_op1), op2(_op2),
+ dest(_dest), imm(_imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
class ImmOp : public PredOp
{
protected:
@@ -220,6 +256,23 @@ class RegRegImmOp : public PredOp
std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
};
+class RegImmImmOp : public PredOp
+{
+ protected:
+ IntRegIndex dest;
+ IntRegIndex op1;
+ uint64_t imm1;
+ uint64_t imm2;
+
+ RegImmImmOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, uint64_t _imm1, uint64_t _imm2) :
+ PredOp(mnem, _machInst, __opClass),
+ dest(_dest), imm1(_imm1), imm2(_imm2)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
class RegRegImmImmOp : public PredOp
{
protected:
diff --git a/src/arch/arm/insts/misc64.cc b/src/arch/arm/insts/misc64.cc
new file mode 100644
index 000000000..3553020da
--- /dev/null
+++ b/src/arch/arm/insts/misc64.cc
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2011-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gabe Black
+ */
+
+#include "arch/arm/insts/misc64.hh"
+
+std::string
+RegRegImmImmOp64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ss << ", ";
+ printReg(ss, op1);
+ ccprintf(ss, ", #%d, #%d", imm1, imm2);
+ return ss.str();
+}
+
+std::string
+RegRegRegImmOp64::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ss << ", ";
+ printReg(ss, op1);
+ ss << ", ";
+ printReg(ss, op2);
+ ccprintf(ss, ", #%d", imm);
+ return ss.str();
+}
+
+std::string
+UnknownOp64::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ return csprintf("%-10s (inst %#08x)", "unknown", machInst);
+}
diff --git a/src/arch/arm/insts/misc64.hh b/src/arch/arm/insts/misc64.hh
new file mode 100644
index 000000000..5a0e18224
--- /dev/null
+++ b/src/arch/arm/insts/misc64.hh
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2011-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gabe Black
+ */
+
+#ifndef __ARCH_ARM_INSTS_MISC64_HH__
+#define __ARCH_ARM_INSTS_MISC64_HH__
+
+#include "arch/arm/insts/static_inst.hh"
+
+class RegRegImmImmOp64 : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest;
+ IntRegIndex op1;
+ uint64_t imm1;
+ uint64_t imm2;
+
+ RegRegImmImmOp64(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _dest, IntRegIndex _op1,
+ uint64_t _imm1, uint64_t _imm2) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), imm1(_imm1), imm2(_imm2)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class RegRegRegImmOp64 : public ArmStaticInst
+{
+ protected:
+ IntRegIndex dest;
+ IntRegIndex op1;
+ IntRegIndex op2;
+ uint64_t imm;
+
+ RegRegRegImmOp64(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _dest, IntRegIndex _op1,
+ IntRegIndex _op2, uint64_t _imm) :
+ ArmStaticInst(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), op2(_op2), imm(_imm)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class UnknownOp64 : public ArmStaticInst
+{
+ protected:
+
+ UnknownOp64(const char *mnem, ExtMachInst _machInst, OpClass __opClass) :
+ ArmStaticInst(mnem, _machInst, __opClass)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+#endif
diff --git a/src/arch/arm/insts/neon64_mem.hh b/src/arch/arm/insts/neon64_mem.hh
new file mode 100644
index 000000000..01ce1b624
--- /dev/null
+++ b/src/arch/arm/insts/neon64_mem.hh
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2012-2013 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Mbou Eyole
+ * Giacomo Gabrielli
+ */
+
+/// @file
+/// Utility functions and datatypes used by AArch64 NEON memory instructions.
+
+#ifndef __ARCH_ARM_INSTS_NEON64_MEM_HH__
+#define __ARCH_ARM_INSTS_NEON64_MEM_HH__
+
+namespace ArmISA
+{
+
+typedef uint64_t XReg;
+
+/// 128-bit NEON vector register.
+struct VReg {
+ XReg hi;
+ XReg lo;
+};
+
+/// Write a single NEON vector element leaving the others untouched.
+inline void
+writeVecElem(VReg *dest, XReg src, int index, int eSize)
+{
+ // eSize must be less than 4:
+ // 0 -> 8-bit elems,
+ // 1 -> 16-bit elems,
+ // 2 -> 32-bit elems,
+ // 3 -> 64-bit elems
+ assert(eSize <= 3);
+
+ int eBits = 8 << eSize;
+ int lsbPos = index * eBits;
+ assert(lsbPos < 128);
+ int shiftAmt = lsbPos % 64;
+
+ XReg maskBits = -1;
+ if (eBits == 64) {
+ maskBits = 0;
+ } else {
+ maskBits = maskBits << eBits;
+ }
+ maskBits = ~maskBits;
+
+ XReg sMask = maskBits;
+ maskBits = sMask << shiftAmt;
+
+ if (lsbPos < 64) {
+ dest->lo = (dest->lo & (~maskBits)) | ((src & sMask) << shiftAmt);
+ } else {
+ dest->hi = (dest->hi & (~maskBits)) | ((src & sMask) << shiftAmt);
+ }
+}
+
+/// Read a single NEON vector element.
+inline XReg
+readVecElem(VReg src, int index, int eSize)
+{
+ // eSize must be less than 4:
+ // 0 -> 8-bit elems,
+ // 1 -> 16-bit elems,
+ // 2 -> 32-bit elems,
+ // 3 -> 64-bit elems
+ assert(eSize <= 3);
+
+ XReg data;
+
+ int eBits = 8 << eSize;
+ int lsbPos = index * eBits;
+ assert(lsbPos < 128);
+ int shiftAmt = lsbPos % 64;
+
+ XReg maskBits = -1;
+ if (eBits == 64) {
+ maskBits = 0;
+ } else {
+ maskBits = maskBits << eBits;
+ }
+ maskBits = ~maskBits;
+
+ if (lsbPos < 64) {
+ data = (src.lo >> shiftAmt) & maskBits;
+ } else {
+ data = (src.hi >> shiftAmt) & maskBits;
+ }
+ return data;
+}
+
+} // namespace ArmISA
+
+#endif // __ARCH_ARM_INSTS_NEON64_MEM_HH__
diff --git a/src/arch/arm/insts/pred_inst.hh b/src/arch/arm/insts/pred_inst.hh
index c441d1f32..c5e2ab386 100644
--- a/src/arch/arm/insts/pred_inst.hh
+++ b/src/arch/arm/insts/pred_inst.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010, 2012-2013 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -78,7 +78,8 @@ modified_imm(uint8_t ctrlImm, uint8_t dataImm)
}
static inline uint64_t
-simd_modified_imm(bool op, uint8_t cmode, uint8_t data, bool &immValid)
+simd_modified_imm(bool op, uint8_t cmode, uint8_t data, bool &immValid,
+ bool isAarch64 = false)
{
uint64_t bigData = data;
immValid = true;
@@ -133,12 +134,20 @@ simd_modified_imm(bool op, uint8_t cmode, uint8_t data, bool &immValid)
}
break;
case 0xf:
- if (!op) {
- uint64_t bVal = bits(bigData, 6) ? (0x1F) : (0x20);
- bigData = (bits(bigData, 5, 0) << 19) |
- (bVal << 25) | (bits(bigData, 7) << 31);
- bigData |= (bigData << 32);
- break;
+ {
+ uint64_t bVal = 0;
+ if (!op) {
+ bVal = bits(bigData, 6) ? (0x1F) : (0x20);
+ bigData = (bits(bigData, 5, 0) << 19) |
+ (bVal << 25) | (bits(bigData, 7) << 31);
+ bigData |= (bigData << 32);
+ break;
+ } else if (isAarch64) {
+ bVal = bits(bigData, 6) ? (0x0FF) : (0x100);
+ bigData = (bits(bigData, 5, 0) << 48) |
+ (bVal << 54) | (bits(bigData, 7) << 63);
+ break;
+ }
}
// Fall through, immediate encoding is invalid.
default:
@@ -179,11 +188,14 @@ class PredOp : public ArmStaticInst
/// Constructor
PredOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass) :
- ArmStaticInst(mnem, _machInst, __opClass),
- condCode(machInst.itstateMask ?
- (ConditionCode)(uint8_t)machInst.itstateCond :
- (ConditionCode)(unsigned)machInst.condCode)
+ ArmStaticInst(mnem, _machInst, __opClass)
{
+ if (machInst.aarch64)
+ condCode = COND_UC;
+ else if (machInst.itstateMask)
+ condCode = (ConditionCode)(uint8_t)machInst.itstateCond;
+ else
+ condCode = (ConditionCode)(unsigned)machInst.condCode;
}
};
diff --git a/src/arch/arm/insts/static_inst.cc b/src/arch/arm/insts/static_inst.cc
index 2a8dee162..260c29a84 100644
--- a/src/arch/arm/insts/static_inst.cc
+++ b/src/arch/arm/insts/static_inst.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010-2013 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
@@ -86,6 +86,90 @@ ArmStaticInst::shift_rm_imm(uint32_t base, uint32_t shamt,
return 0;
}
+int64_t
+ArmStaticInst::shiftReg64(uint64_t base, uint64_t shiftAmt,
+ ArmShiftType type, uint8_t width) const
+{
+ shiftAmt = shiftAmt % width;
+ ArmShiftType shiftType;
+ shiftType = (ArmShiftType)type;
+
+ switch (shiftType)
+ {
+ case LSL:
+ return base << shiftAmt;
+ case LSR:
+ if (shiftAmt == 0)
+ return base;
+ else
+ return (base & mask(width)) >> shiftAmt;
+ case ASR:
+ if (shiftAmt == 0) {
+ return base;
+ } else {
+ int sign_bit = bits(base, intWidth - 1);
+ base >>= shiftAmt;
+ base = sign_bit ? (base | ~mask(intWidth - shiftAmt)) : base;
+ return base & mask(intWidth);
+ }
+ case ROR:
+ if (shiftAmt == 0)
+ return base;
+ else
+ return (base << (width - shiftAmt)) | (base >> shiftAmt);
+ default:
+ ccprintf(std::cerr, "Unhandled shift type\n");
+ exit(1);
+ break;
+ }
+ return 0;
+}
+
+int64_t
+ArmStaticInst::extendReg64(uint64_t base, ArmExtendType type,
+ uint64_t shiftAmt, uint8_t width) const
+{
+ bool sign_extend = false;
+ int len = 0;
+ switch (type) {
+ case UXTB:
+ len = 8;
+ break;
+ case UXTH:
+ len = 16;
+ break;
+ case UXTW:
+ len = 32;
+ break;
+ case UXTX:
+ len = 64;
+ break;
+ case SXTB:
+ len = 8;
+ sign_extend = true;
+ break;
+ case SXTH:
+ len = 16;
+ sign_extend = true;
+ break;
+ case SXTW:
+ len = 32;
+ sign_extend = true;
+ break;
+ case SXTX:
+ len = 64;
+ sign_extend = true;
+ break;
+ }
+ len = len <= width - shiftAmt ? len : width - shiftAmt;
+ uint64_t tmp = (uint64_t) bits(base, len - 1, 0) << shiftAmt;
+ if (sign_extend) {
+ int sign_bit = bits(tmp, len + shiftAmt - 1);
+ tmp = sign_bit ? (tmp | ~mask(len + shiftAmt)) : tmp;
+ }
+ return tmp & mask(width);
+}
+
// Shift Rm by Rs
int32_t
ArmStaticInst::shift_rm_rs(uint32_t base, uint32_t shamt,
@@ -214,22 +298,33 @@ ArmStaticInst::printReg(std::ostream &os, int reg) const
switch (regIdxToClass(reg, &rel_reg)) {
case IntRegClass:
- switch (rel_reg) {
- case PCReg:
- ccprintf(os, "pc");
- break;
- case StackPointerReg:
- ccprintf(os, "sp");
- break;
- case FramePointerReg:
- ccprintf(os, "fp");
- break;
- case ReturnAddressReg:
- ccprintf(os, "lr");
- break;
- default:
- ccprintf(os, "r%d", reg);
- break;
+ if (aarch64) {
+ if (reg == INTREG_UREG0)
+ ccprintf(os, "ureg0");
+ else if (reg == INTREG_SPX)
+ ccprintf(os, "%s%s", (intWidth == 32) ? "w" : "", "sp");
+ else if (reg == INTREG_X31)
+ ccprintf(os, "%szr", (intWidth == 32) ? "w" : "x");
+ else
+ ccprintf(os, "%s%d", (intWidth == 32) ? "w" : "x", reg);
+ } else {
+ switch (rel_reg) {
+ case PCReg:
+ ccprintf(os, "pc");
+ break;
+ case StackPointerReg:
+ ccprintf(os, "sp");
+ break;
+ case FramePointerReg:
+ ccprintf(os, "fp");
+ break;
+ case ReturnAddressReg:
+ ccprintf(os, "lr");
+ break;
+ default:
+ ccprintf(os, "r%d", reg);
+ break;
+ }
}
break;
case FloatRegClass:
@@ -247,67 +342,102 @@ ArmStaticInst::printReg(std::ostream &os, int reg) const
void
ArmStaticInst::printMnemonic(std::ostream &os,
const std::string &suffix,
- bool withPred) const
+ bool withPred,
+ bool withCond64,
+ ConditionCode cond64) const
{
os << " " << mnemonic;
- if (withPred) {
- unsigned condCode = machInst.condCode;
- switch (condCode) {
- case COND_EQ:
- os << "eq";
- break;
- case COND_NE:
- os << "ne";
- break;
- case COND_CS:
- os << "cs";
- break;
- case COND_CC:
- os << "cc";
- break;
- case COND_MI:
- os << "mi";
- break;
- case COND_PL:
- os << "pl";
- break;
- case COND_VS:
- os << "vs";
- break;
- case COND_VC:
- os << "vc";
- break;
- case COND_HI:
- os << "hi";
- break;
- case COND_LS:
- os << "ls";
- break;
- case COND_GE:
- os << "ge";
- break;
- case COND_LT:
- os << "lt";
- break;
- case COND_GT:
- os << "gt";
- break;
- case COND_LE:
- os << "le";
- break;
- case COND_AL:
- // This one is implicit.
- break;
- case COND_UC:
- // Unconditional.
- break;
- default:
- panic("Unrecognized condition code %d.\n", condCode);
- }
+ if (withPred && !aarch64) {
+ printCondition(os, machInst.condCode);
+ os << suffix;
+ } else if (withCond64) {
+ os << ".";
+ printCondition(os, cond64);
os << suffix;
- if (machInst.bigThumb)
- os << ".w";
- os << " ";
+ }
+ if (machInst.bigThumb)
+ os << ".w";
+ os << " ";
+}
+
+void
+ArmStaticInst::printTarget(std::ostream &os, Addr target,
+ const SymbolTable *symtab) const
+{
+ Addr symbolAddr;
+ std::string symbol;
+
+ if (symtab && symtab->findNearestSymbol(target, symbol, symbolAddr)) {
+ ccprintf(os, "<%s", symbol);
+ if (symbolAddr != target)
+ ccprintf(os, "+%d>", target - symbolAddr);
+ else
+ ccprintf(os, ">");
+ } else {
+ ccprintf(os, "%#x", target);
+ }
+}
+
+void
+ArmStaticInst::printCondition(std::ostream &os,
+ unsigned code,
+ bool noImplicit) const
+{
+ switch (code) {
+ case COND_EQ:
+ os << "eq";
+ break;
+ case COND_NE:
+ os << "ne";
+ break;
+ case COND_CS:
+ os << "cs";
+ break;
+ case COND_CC:
+ os << "cc";
+ break;
+ case COND_MI:
+ os << "mi";
+ break;
+ case COND_PL:
+ os << "pl";
+ break;
+ case COND_VS:
+ os << "vs";
+ break;
+ case COND_VC:
+ os << "vc";
+ break;
+ case COND_HI:
+ os << "hi";
+ break;
+ case COND_LS:
+ os << "ls";
+ break;
+ case COND_GE:
+ os << "ge";
+ break;
+ case COND_LT:
+ os << "lt";
+ break;
+ case COND_GT:
+ os << "gt";
+ break;
+ case COND_LE:
+ os << "le";
+ break;
+ case COND_AL:
+ // This one is implicit.
+ if (noImplicit)
+ os << "al";
+ break;
+ case COND_UC:
+ // Unconditional.
+ if (noImplicit)
+ os << "uc";
+ break;
+ default:
+ panic("Unrecognized condition code %d.\n", code);
}
}
@@ -393,6 +523,38 @@ ArmStaticInst::printShiftOperand(std::ostream &os,
}
void
+ArmStaticInst::printExtendOperand(bool firstOperand, std::ostream &os,
+ IntRegIndex rm, ArmExtendType type,
+ int64_t shiftAmt) const
+{
+ if (!firstOperand)
+ ccprintf(os, ", ");
+ printReg(os, rm);
+ if (type == UXTX && shiftAmt == 0)
+ return;
+ switch (type) {
+ case UXTB: ccprintf(os, ", UXTB");
+ break;
+ case UXTH: ccprintf(os, ", UXTH");
+ break;
+ case UXTW: ccprintf(os, ", UXTW");
+ break;
+ case UXTX: ccprintf(os, ", LSL");
+ break;
+ case SXTB: ccprintf(os, ", SXTB");
+ break;
+ case SXTH: ccprintf(os, ", SXTH");
+ break;
+ case SXTW: ccprintf(os, ", SXTW");
+ break;
+ case SXTX: ccprintf(os, ", SXTX");
+ break;
+ }
+ if (type == UXTX || shiftAmt)
+ ccprintf(os, " #%d", shiftAmt);
+}
+
+void
ArmStaticInst::printDataInst(std::ostream &os, bool withImm,
bool immShift, bool s, IntRegIndex rd, IntRegIndex rn,
IntRegIndex rm, IntRegIndex rs, uint32_t shiftAmt,
diff --git a/src/arch/arm/insts/static_inst.hh b/src/arch/arm/insts/static_inst.hh
index c36024ecd..aeec67ec2 100644
--- a/src/arch/arm/insts/static_inst.hh
+++ b/src/arch/arm/insts/static_inst.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010-2013 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -44,6 +44,7 @@
#include "arch/arm/faults.hh"
#include "arch/arm/utility.hh"
+#include "arch/arm/system.hh"
#include "base/trace.hh"
#include "cpu/static_inst.hh"
#include "sim/byteswap.hh"
@@ -55,6 +56,9 @@ namespace ArmISA
class ArmStaticInst : public StaticInst
{
protected:
+ bool aarch64;
+ uint8_t intWidth;
+
int32_t shift_rm_imm(uint32_t base, uint32_t shamt,
uint32_t type, uint32_t cfval) const;
int32_t shift_rm_rs(uint32_t base, uint32_t shamt,
@@ -65,6 +69,11 @@ class ArmStaticInst : public StaticInst
bool shift_carry_rs(uint32_t base, uint32_t shamt,
uint32_t type, uint32_t cfval) const;
+ int64_t shiftReg64(uint64_t base, uint64_t shiftAmt,
+ ArmShiftType type, uint8_t width) const;
+ int64_t extendReg64(uint64_t base, ArmExtendType type,
+ uint64_t shiftAmt, uint8_t width) const;
+
template<int width>
static inline bool
saturateOp(int32_t &res, int64_t op1, int64_t op2, bool sub=false)
@@ -135,6 +144,11 @@ class ArmStaticInst : public StaticInst
OpClass __opClass)
: StaticInst(mnem, _machInst, __opClass)
{
+ aarch64 = machInst.aarch64;
+ if (bits(machInst, 28, 24) == 0x10)
+ intWidth = 64; // Force 64-bit width for ADR/ADRP
+ else
+ intWidth = (aarch64 && bits(machInst, 31)) ? 64 : 32;
}
/// Print a register name for disassembly given the unique
@@ -142,13 +156,22 @@ class ArmStaticInst : public StaticInst
void printReg(std::ostream &os, int reg) const;
void printMnemonic(std::ostream &os,
const std::string &suffix = "",
- bool withPred = true) const;
+ bool withPred = true,
+ bool withCond64 = false,
+ ConditionCode cond64 = COND_UC) const;
+ void printTarget(std::ostream &os, Addr target,
+ const SymbolTable *symtab) const;
+ void printCondition(std::ostream &os, unsigned code,
+ bool noImplicit=false) const;
void printMemSymbol(std::ostream &os, const SymbolTable *symtab,
const std::string &prefix, const Addr addr,
const std::string &suffix) const;
void printShiftOperand(std::ostream &os, IntRegIndex rm,
bool immShift, uint32_t shiftAmt,
IntRegIndex rs, ArmShiftType type) const;
+ void printExtendOperand(bool firstOperand, std::ostream &os,
+ IntRegIndex rm, ArmExtendType type,
+ int64_t shiftAmt) const;
void printDataInst(std::ostream &os, bool withImm) const;
@@ -166,10 +189,13 @@ class ArmStaticInst : public StaticInst
std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
static inline uint32_t
- cpsrWriteByInstr(CPSR cpsr, uint32_t val,
- uint8_t byteMask, bool affectState, bool nmfi)
+ cpsrWriteByInstr(CPSR cpsr, uint32_t val, SCR scr, NSACR nsacr,
+ uint8_t byteMask, bool affectState, bool nmfi, ThreadContext *tc)
{
- bool privileged = (cpsr.mode != MODE_USER);
+ bool privileged = (cpsr.mode != MODE_USER);
+ bool haveVirt = ArmSystem::haveVirtualization(tc);
+ bool haveSecurity = ArmSystem::haveSecurity(tc);
+ bool isSecure = inSecureState(scr, cpsr) || !haveSecurity;
uint32_t bitMask = 0;
@@ -182,14 +208,53 @@ class ArmStaticInst : public StaticInst
}
if (bits(byteMask, 1)) {
unsigned highIdx = affectState ? 15 : 9;
- unsigned lowIdx = privileged ? 8 : 9;
+ unsigned lowIdx = (privileged && (isSecure || scr.aw || haveVirt))
+ ? 8 : 9;
bitMask = bitMask | mask(highIdx, lowIdx);
}
if (bits(byteMask, 0)) {
if (privileged) {
- bitMask = bitMask | mask(7, 6);
- if (!badMode((OperatingMode)(val & mask(5)))) {
- bitMask = bitMask | mask(5);
+ bitMask |= 1 << 7;
+ if ( (!nmfi || !((val >> 6) & 0x1)) &&
+ (isSecure || scr.fw || haveVirt) ) {
+ bitMask |= 1 << 6;
+ }
+ // Now check the new mode is allowed
+ OperatingMode newMode = (OperatingMode) (val & mask(5));
+ OperatingMode oldMode = (OperatingMode)(uint32_t)cpsr.mode;
+ if (!badMode(newMode)) {
+ bool validModeChange = true;
+ // Check for attempts to enter modes only permitted in
+ // Secure state from Non-secure state. These are Monitor
+ // mode ('10110'), and FIQ mode ('10001') if the Security
+ // Extensions have reserved it.
+ if (!isSecure && newMode == MODE_MON)
+ validModeChange = false;
+ if (!isSecure && newMode == MODE_FIQ && nsacr.rfr == 1)
+ validModeChange = false;
+ // There is no Hyp mode ('11010') in Secure state, so that
+ // is UNPREDICTABLE
+ if (scr.ns == 0 && newMode == MODE_HYP)
+ validModeChange = false;
+ // Cannot move into Hyp mode directly from a Non-secure
+ // PL1 mode
+ if (!isSecure && oldMode != MODE_HYP && newMode == MODE_HYP)
+ validModeChange = false;
+ // Cannot move out of Hyp mode with this function except
+ // on an exception return
+ if (oldMode == MODE_HYP && newMode != MODE_HYP && !affectState)
+ validModeChange = false;
+ // Must not change to 64 bit when running in 32 bit mode
+ if (!opModeIs64(oldMode) && opModeIs64(newMode))
+ validModeChange = false;
+
+ // If we passed all of the above then set the bit mask to
+ // copy the mode accross
+ if (validModeChange) {
+ bitMask = bitMask | mask(5);
+ } else {
+ warn_once("Illegal change to CPSR mode attempted\n");
+ }
} else {
warn_once("Ignoring write of bad mode to CPSR.\n");
}
@@ -198,11 +263,7 @@ class ArmStaticInst : public StaticInst
bitMask = bitMask | (1 << 5);
}
- bool cpsr_f = cpsr.f;
- uint32_t new_cpsr = ((uint32_t)cpsr & ~bitMask) | (val & bitMask);
- if (nmfi && !cpsr_f)
- new_cpsr &= ~(1 << 6);
- return new_cpsr;
+ return ((uint32_t)cpsr & ~bitMask) | (val & bitMask);
}
static inline uint32_t
@@ -296,12 +357,12 @@ class ArmStaticInst : public StaticInst
inline Fault
disabledFault() const
{
- if (FullSystem) {
- return new UndefinedInstruction();
- } else {
- return new UndefinedInstruction(machInst, false, mnemonic, true);
- }
+ return new UndefinedInstruction(machInst, false, mnemonic, true);
}
+
+ public:
+ virtual void
+ annotateFault(ArmFault *fault) {}
};
}
diff --git a/src/arch/arm/insts/vfp.cc b/src/arch/arm/insts/vfp.cc
index ca0f58226..03fdc83fa 100644
--- a/src/arch/arm/insts/vfp.cc
+++ b/src/arch/arm/insts/vfp.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010-2013 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -46,6 +46,37 @@
*/
std::string
+FpCondCompRegOp::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, op1);
+ ccprintf(ss, ", ");
+ printReg(ss, op2);
+ ccprintf(ss, ", #%d", defCc);
+ ccprintf(ss, ", ");
+ printCondition(ss, condCode, true);
+ return ss.str();
+}
+
+std::string
+FpCondSelOp::generateDisassembly(
+ Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss, "", false);
+ printReg(ss, dest);
+ ccprintf(ss, ", ");
+ printReg(ss, op1);
+ ccprintf(ss, ", ");
+ printReg(ss, op2);
+ ccprintf(ss, ", ");
+ printCondition(ss, condCode, true);
+ return ss.str();
+}
+
+std::string
FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
std::stringstream ss;
@@ -92,6 +123,21 @@ FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
}
std::string
+FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
+{
+ std::stringstream ss;
+ printMnemonic(ss);
+ printReg(ss, dest + FP_Reg_Base);
+ ss << ", ";
+ printReg(ss, op1 + FP_Reg_Base);
+ ss << ", ";
+ printReg(ss, op2 + FP_Reg_Base);
+ ss << ", ";
+ printReg(ss, op3 + FP_Reg_Base);
+ return ss.str();
+}
+
+std::string
FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
std::stringstream ss;
@@ -131,24 +177,25 @@ prepFpState(uint32_t rMode)
}
void
-finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush)
+finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
{
int exceptions = fetestexcept(FeAllExceptions);
bool underflow = false;
- if (exceptions & FeInvalid) {
+ if ((exceptions & FeInvalid) && mask.ioc) {
fpscr.ioc = 1;
}
- if (exceptions & FeDivByZero) {
+ if ((exceptions & FeDivByZero) && mask.dzc) {
fpscr.dzc = 1;
}
- if (exceptions & FeOverflow) {
+ if ((exceptions & FeOverflow) && mask.ofc) {
fpscr.ofc = 1;
}
if (exceptions & FeUnderflow) {
underflow = true;
- fpscr.ufc = 1;
+ if (mask.ufc)
+ fpscr.ufc = 1;
}
- if ((exceptions & FeInexact) && !(underflow && flush)) {
+ if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
fpscr.ixc = 1;
}
fesetround(state);
@@ -329,19 +376,33 @@ fixFpSFpDDest(FPSCR fpscr, float val)
return mid;
}
-uint16_t
-vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
- uint32_t rMode, bool ahp, float op)
+static inline uint16_t
+vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
+ uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
{
- uint32_t opBits = fpToBits(op);
+ uint32_t mWidth;
+ uint32_t eWidth;
+ uint32_t eHalfRange;
+ uint32_t sBitPos;
+
+ if (isDouble) {
+ mWidth = 52;
+ eWidth = 11;
+ } else {
+ mWidth = 23;
+ eWidth = 8;
+ }
+ sBitPos = eWidth + mWidth;
+ eHalfRange = (1 << (eWidth-1)) - 1;
+
// Extract the operand.
- bool neg = bits(opBits, 31);
- uint32_t exponent = bits(opBits, 30, 23);
- uint32_t oldMantissa = bits(opBits, 22, 0);
- uint32_t mantissa = oldMantissa >> (23 - 10);
+ bool neg = bits(opBits, sBitPos);
+ uint32_t exponent = bits(opBits, sBitPos-1, mWidth);
+ uint64_t oldMantissa = bits(opBits, mWidth-1, 0);
+ uint32_t mantissa = oldMantissa >> (mWidth - 10);
// Do the conversion.
- uint32_t extra = oldMantissa & mask(23 - 10);
- if (exponent == 0xff) {
+ uint64_t extra = oldMantissa & mask(mWidth - 10);
+ if (exponent == mask(eWidth)) {
if (oldMantissa != 0) {
// Nans.
if (bits(mantissa, 9) == 0) {
@@ -379,7 +440,6 @@ vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
if (exponent == 0) {
// Denormalized.
-
// If flush to zero is on, this shouldn't happen.
assert(!flush);
@@ -407,13 +467,13 @@ vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
// We need to track the dropped bits differently since
// more can be dropped by denormalizing.
- bool topOne = bits(extra, 12);
- bool restZeros = bits(extra, 11, 0) == 0;
+ bool topOne = bits(extra, mWidth - 10 - 1);
+ bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;
- if (exponent <= (127 - 15)) {
+ if (exponent <= (eHalfRange - 15)) {
// The result is too small. Denormalize.
mantissa |= (1 << 10);
- while (mantissa && exponent <= (127 - 15)) {
+ while (mantissa && exponent <= (eHalfRange - 15)) {
restZeros = restZeros && !topOne;
topOne = bits(mantissa, 0);
mantissa = mantissa >> 1;
@@ -424,7 +484,7 @@ vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
exponent = 0;
} else {
// Change bias.
- exponent -= (127 - 15);
+ exponent -= (eHalfRange - 15);
}
if (exponent == 0 && (inexact || fpscr.ufe)) {
@@ -488,155 +548,115 @@ vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
return result;
}
-float
-vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
+uint16_t
+vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
+ uint32_t rMode, bool ahp, float op)
{
- float junk = 0.0;
+ uint64_t opBits = fpToBits(op);
+ return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
+}
+
+uint16_t
+vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
+ uint32_t rMode, bool ahp, double op)
+{
+ uint64_t opBits = fpToBits(op);
+ return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
+}
+
+static inline uint64_t
+vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
+{
+ uint32_t mWidth;
+ uint32_t eWidth;
+ uint32_t eHalfRange;
+ uint32_t sBitPos;
+
+ if (isDouble) {
+ mWidth = 52;
+ eWidth = 11;
+ } else {
+ mWidth = 23;
+ eWidth = 8;
+ }
+ sBitPos = eWidth + mWidth;
+ eHalfRange = (1 << (eWidth-1)) - 1;
+
// Extract the bitfields.
bool neg = bits(op, 15);
uint32_t exponent = bits(op, 14, 10);
- uint32_t mantissa = bits(op, 9, 0);
+ uint64_t mantissa = bits(op, 9, 0);
// Do the conversion.
if (exponent == 0) {
if (mantissa != 0) {
// Normalize the value.
- exponent = exponent + (127 - 15) + 1;
+ exponent = exponent + (eHalfRange - 15) + 1;
while (mantissa < (1 << 10)) {
mantissa = mantissa << 1;
exponent--;
}
}
- mantissa = mantissa << (23 - 10);
+ mantissa = mantissa << (mWidth - 10);
} else if (exponent == 0x1f && !ahp) {
// Infinities and nans.
- exponent = 0xff;
+ exponent = mask(eWidth);
if (mantissa != 0) {
// Nans.
- mantissa = mantissa << (23 - 10);
- if (bits(mantissa, 22) == 0) {
+ mantissa = mantissa << (mWidth - 10);
+ if (bits(mantissa, mWidth-1) == 0) {
// Signalling nan.
fpscr.ioc = 1;
- mantissa |= (1 << 22);
+ mantissa |= (((uint64_t) 1) << (mWidth-1));
}
if (defaultNan) {
- mantissa &= ~mask(22);
+ mantissa &= ~mask(mWidth-1);
neg = false;
}
}
} else {
- exponent = exponent + (127 - 15);
- mantissa = mantissa << (23 - 10);
+ exponent = exponent + (eHalfRange - 15);
+ mantissa = mantissa << (mWidth - 10);
}
// Reassemble the result.
- uint32_t result = bits(mantissa, 22, 0);
- replaceBits(result, 30, 23, exponent);
- if (neg)
- result |= (1 << 31);
+ uint64_t result = bits(mantissa, mWidth-1, 0);
+ replaceBits(result, sBitPos-1, mWidth, exponent);
+ if (neg) {
+ result |= (((uint64_t) 1) << sBitPos);
+ }
+ return result;
+}
+
+double
+vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
+{
+ double junk = 0.0;
+ uint64_t result;
+
+ result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
return bitsToFp(result, junk);
}
-uint64_t
-vfpFpSToFixed(float val, bool isSigned, bool half,
- uint8_t imm, bool rzero)
+float
+vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
- int rmode = rzero ? FeRoundZero : fegetround();
- __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode));
- fesetround(FeRoundNearest);
- val = val * powf(2.0, imm);
- __asm__ __volatile__("" : "=m" (val) : "m" (val));
- fesetround(rmode);
- feclearexcept(FeAllExceptions);
- __asm__ __volatile__("" : "=m" (val) : "m" (val));
- float origVal = val;
- val = rintf(val);
- int fpType = std::fpclassify(val);
- if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
- if (fpType == FP_NAN) {
- feraiseexcept(FeInvalid);
- }
- val = 0.0;
- } else if (origVal != val) {
- switch (rmode) {
- case FeRoundNearest:
- if (origVal - val > 0.5)
- val += 1.0;
- else if (val - origVal > 0.5)
- val -= 1.0;
- break;
- case FeRoundDown:
- if (origVal < val)
- val -= 1.0;
- break;
- case FeRoundUpward:
- if (origVal > val)
- val += 1.0;
- break;
- }
- feraiseexcept(FeInexact);
- }
+ float junk = 0.0;
+ uint64_t result;
- if (isSigned) {
- if (half) {
- if ((double)val < (int16_t)(1 << 15)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return (int16_t)(1 << 15);
- }
- if ((double)val > (int16_t)mask(15)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return (int16_t)mask(15);
- }
- return (int16_t)val;
- } else {
- if ((double)val < (int32_t)(1 << 31)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return (int32_t)(1 << 31);
- }
- if ((double)val > (int32_t)mask(31)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return (int32_t)mask(31);
- }
- return (int32_t)val;
- }
- } else {
- if (half) {
- if ((double)val < 0) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return 0;
- }
- if ((double)val > (mask(16))) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return mask(16);
- }
- return (uint16_t)val;
- } else {
- if ((double)val < 0) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return 0;
- }
- if ((double)val > (mask(32))) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return mask(32);
- }
- return (uint32_t)val;
- }
- }
+ result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
+ return bitsToFp(result, junk);
}
float
vfpUFixedToFpS(bool flush, bool defaultNan,
- uint32_t val, bool half, uint8_t imm)
+ uint64_t val, uint8_t width, uint8_t imm)
{
fesetround(FeRoundNearest);
- if (half)
+ if (width == 16)
val = (uint16_t)val;
+ else if (width == 32)
+ val = (uint32_t)val;
+ else if (width != 64)
+ panic("Unsupported width %d", width);
float scale = powf(2.0, imm);
__asm__ __volatile__("" : "=m" (scale) : "m" (scale));
feclearexcept(FeAllExceptions);
@@ -646,11 +666,16 @@ vfpUFixedToFpS(bool flush, bool defaultNan,
float
vfpSFixedToFpS(bool flush, bool defaultNan,
- int32_t val, bool half, uint8_t imm)
+ int64_t val, uint8_t width, uint8_t imm)
{
fesetround(FeRoundNearest);
- if (half)
+ if (width == 16)
val = sext<16>(val & mask(16));
+ else if (width == 32)
+ val = sext<32>(val & mask(32));
+ else if (width != 64)
+ panic("Unsupported width %d", width);
+
float scale = powf(2.0, imm);
__asm__ __volatile__("" : "=m" (scale) : "m" (scale));
feclearexcept(FeAllExceptions);
@@ -658,106 +683,19 @@ vfpSFixedToFpS(bool flush, bool defaultNan,
return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}
-uint64_t
-vfpFpDToFixed(double val, bool isSigned, bool half,
- uint8_t imm, bool rzero)
-{
- int rmode = rzero ? FeRoundZero : fegetround();
- fesetround(FeRoundNearest);
- val = val * pow(2.0, imm);
- __asm__ __volatile__("" : "=m" (val) : "m" (val));
- fesetround(rmode);
- feclearexcept(FeAllExceptions);
- __asm__ __volatile__("" : "=m" (val) : "m" (val));
- double origVal = val;
- val = rint(val);
- int fpType = std::fpclassify(val);
- if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
- if (fpType == FP_NAN) {
- feraiseexcept(FeInvalid);
- }
- val = 0.0;
- } else if (origVal != val) {
- switch (rmode) {
- case FeRoundNearest:
- if (origVal - val > 0.5)
- val += 1.0;
- else if (val - origVal > 0.5)
- val -= 1.0;
- break;
- case FeRoundDown:
- if (origVal < val)
- val -= 1.0;
- break;
- case FeRoundUpward:
- if (origVal > val)
- val += 1.0;
- break;
- }
- feraiseexcept(FeInexact);
- }
- if (isSigned) {
- if (half) {
- if (val < (int16_t)(1 << 15)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return (int16_t)(1 << 15);
- }
- if (val > (int16_t)mask(15)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return (int16_t)mask(15);
- }
- return (int16_t)val;
- } else {
- if (val < (int32_t)(1 << 31)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return (int32_t)(1 << 31);
- }
- if (val > (int32_t)mask(31)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return (int32_t)mask(31);
- }
- return (int32_t)val;
- }
- } else {
- if (half) {
- if (val < 0) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return 0;
- }
- if (val > mask(16)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return mask(16);
- }
- return (uint16_t)val;
- } else {
- if (val < 0) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return 0;
- }
- if (val > mask(32)) {
- feraiseexcept(FeInvalid);
- feclearexcept(FeInexact);
- return mask(32);
- }
- return (uint32_t)val;
- }
- }
-}
double
vfpUFixedToFpD(bool flush, bool defaultNan,
- uint32_t val, bool half, uint8_t imm)
+ uint64_t val, uint8_t width, uint8_t imm)
{
fesetround(FeRoundNearest);
- if (half)
+ if (width == 16)
val = (uint16_t)val;
+ else if (width == 32)
+ val = (uint32_t)val;
+ else if (width != 64)
+ panic("Unsupported width %d", width);
+
double scale = pow(2.0, imm);
__asm__ __volatile__("" : "=m" (scale) : "m" (scale));
feclearexcept(FeAllExceptions);
@@ -767,11 +705,16 @@ vfpUFixedToFpD(bool flush, bool defaultNan,
double
vfpSFixedToFpD(bool flush, bool defaultNan,
- int32_t val, bool half, uint8_t imm)
+ int64_t val, uint8_t width, uint8_t imm)
{
fesetround(FeRoundNearest);
- if (half)
+ if (width == 16)
val = sext<16>(val & mask(16));
+ else if (width == 32)
+ val = sext<32>(val & mask(32));
+ else if (width != 64)
+ panic("Unsupported width %d", width);
+
double scale = pow(2.0, imm);
__asm__ __volatile__("" : "=m" (scale) : "m" (scale));
feclearexcept(FeAllExceptions);
@@ -976,6 +919,85 @@ template
double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
double op1, double op2) const;
+// @TODO remove this function when we've finished switching all FMA code to use the new FPLIB
+template <class fpType>
+fpType
+FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
+ fpType (*func)(fpType, fpType, fpType),
+ bool flush, bool defaultNan, uint32_t rMode) const
+{
+ const bool single = (sizeof(fpType) == sizeof(float));
+ fpType junk = 0.0;
+
+ if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
+ fpscr.idc = 1;
+ VfpSavedState state = prepFpState(rMode);
+ __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
+ : "m" (op1), "m" (op2), "m" (op3), "m" (state));
+ fpType dest = func(op1, op2, op3);
+ __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));
+
+ int fpClass = std::fpclassify(dest);
+ // Get NAN behavior right. This varies between x86 and ARM.
+ if (fpClass == FP_NAN) {
+ const uint64_t qnan =
+ single ? 0x7fc00000 : ULL(0x7ff8000000000000);
+ const bool nan1 = std::isnan(op1);
+ const bool nan2 = std::isnan(op2);
+ const bool nan3 = std::isnan(op3);
+ const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
+ const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
+ const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
+ if ((!nan1 && !nan2 && !nan3) || (defaultNan == 1)) {
+ dest = bitsToFp(qnan, junk);
+ } else if (signal1) {
+ dest = bitsToFp(fpToBits(op1) | qnan, junk);
+ } else if (signal2) {
+ dest = bitsToFp(fpToBits(op2) | qnan, junk);
+ } else if (signal3) {
+ dest = bitsToFp(fpToBits(op3) | qnan, junk);
+ } else if (nan1) {
+ dest = op1;
+ } else if (nan2) {
+ dest = op2;
+ } else if (nan3) {
+ dest = op3;
+ }
+ } else if (flush && flushToZero(dest)) {
+ feraiseexcept(FeUnderflow);
+ } else if ((
+ (single && (dest == bitsToFp(0x00800000, junk) ||
+ dest == bitsToFp(0x80800000, junk))) ||
+ (!single &&
+ (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
+ dest == bitsToFp(ULL(0x8010000000000000), junk)))
+ ) && rMode != VfpRoundZero) {
+ /*
+ * Correct for the fact that underflow is detected -before- rounding
+ * in ARM and -after- rounding in x86.
+ */
+ fesetround(FeRoundZero);
+ __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
+ : "m" (op1), "m" (op2), "m" (op3));
+ fpType temp = func(op1, op2, op2);
+ __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
+ if (flush && flushToZero(temp)) {
+ dest = temp;
+ }
+ }
+ finishVfp(fpscr, state, flush);
+ return dest;
+}
+
+template
+float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
+ float (*func)(float, float, float),
+ bool flush, bool defaultNan, uint32_t rMode) const;
+template
+double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
+ double (*func)(double, double, double),
+ bool flush, bool defaultNan, uint32_t rMode) const;
+
template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
diff --git a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh
index 9babaae04..f17f90973 100644
--- a/src/arch/arm/insts/vfp.hh
+++ b/src/arch/arm/insts/vfp.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010-2013 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
@@ -104,7 +104,8 @@ enum VfpRoundingMode
VfpRoundNearest = 0,
VfpRoundUpward = 1,
VfpRoundDown = 2,
- VfpRoundZero = 3
+ VfpRoundZero = 3,
+ VfpRoundAway = 4
};
static inline float bitsToFp(uint64_t, float);
@@ -212,7 +213,7 @@ isSnan(fpType val)
typedef int VfpSavedState;
VfpSavedState prepFpState(uint32_t rMode);
-void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush);
+void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask = FpscrExcMask);
template <class fpType>
fpType fixDest(FPSCR fpscr, fpType val, fpType op1);
@@ -228,7 +229,11 @@ double fixFpSFpDDest(FPSCR fpscr, float val);
uint16_t vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
uint32_t rMode, bool ahp, float op);
-float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
+uint16_t vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
+ uint32_t rMode, bool ahp, double op);
+
+float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
+double vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
static inline double
makeDouble(uint32_t low, uint32_t high)
@@ -249,19 +254,192 @@ highFromDouble(double val)
return fpToBits(val) >> 32;
}
-uint64_t vfpFpSToFixed(float val, bool isSigned, bool half,
- uint8_t imm, bool rzero = true);
+static inline void
+setFPExceptions(int exceptions) {
+ feclearexcept(FeAllExceptions);
+ feraiseexcept(exceptions);
+}
+
+template <typename T>
+uint64_t
+vfpFpToFixed(T val, bool isSigned, uint8_t width, uint8_t imm, bool
+ useRmode = true, VfpRoundingMode roundMode = VfpRoundZero,
+ bool aarch64 = false)
+{
+ int rmode;
+ bool roundAwayFix = false;
+
+ if (!useRmode) {
+ rmode = fegetround();
+ } else {
+ switch (roundMode)
+ {
+ case VfpRoundNearest:
+ rmode = FeRoundNearest;
+ break;
+ case VfpRoundUpward:
+ rmode = FeRoundUpward;
+ break;
+ case VfpRoundDown:
+ rmode = FeRoundDown;
+ break;
+ case VfpRoundZero:
+ rmode = FeRoundZero;
+ break;
+ case VfpRoundAway:
+ // There is no equivalent rounding mode, use round down and we'll
+ // fix it later
+ rmode = FeRoundDown;
+ roundAwayFix = true;
+ break;
+ default:
+ panic("Unsupported roundMode %d\n", roundMode);
+ }
+ }
+ __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode));
+ fesetround(FeRoundNearest);
+ val = val * pow(2.0, imm);
+ __asm__ __volatile__("" : "=m" (val) : "m" (val));
+ fesetround(rmode);
+ feclearexcept(FeAllExceptions);
+ __asm__ __volatile__("" : "=m" (val) : "m" (val));
+ T origVal = val;
+ val = rint(val);
+ __asm__ __volatile__("" : "=m" (val) : "m" (val));
+
+ int exceptions = fetestexcept(FeAllExceptions);
+
+ int fpType = std::fpclassify(val);
+ if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
+ if (fpType == FP_NAN) {
+ exceptions |= FeInvalid;
+ }
+ val = 0.0;
+ } else if (origVal != val) {
+ switch (rmode) {
+ case FeRoundNearest:
+ if (origVal - val > 0.5)
+ val += 1.0;
+ else if (val - origVal > 0.5)
+ val -= 1.0;
+ break;
+ case FeRoundDown:
+ if (roundAwayFix) {
+ // The ordering on the subtraction looks a bit odd in that we
+ // don't do the obvious origVal - val, instead we do
+ // -(val - origVal). This is required to get the correct bit
+ // exact behaviour when very close to the 0.5 threshold.
+ volatile T error = val;
+ error -= origVal;
+ error = -error;
+ if ( (error > 0.5) ||
+ ((error == 0.5) && (val >= 0)) )
+ val += 1.0;
+ } else {
+ if (origVal < val)
+ val -= 1.0;
+ }
+ break;
+ case FeRoundUpward:
+ if (origVal > val)
+ val += 1.0;
+ break;
+ }
+ exceptions |= FeInexact;
+ }
+
+ __asm__ __volatile__("" : "=m" (val) : "m" (val));
+
+ if (isSigned) {
+ bool outOfRange = false;
+ int64_t result = (int64_t) val;
+ uint64_t finalVal;
+
+ if (!aarch64) {
+ if (width == 16) {
+ finalVal = (int16_t)val;
+ } else if (width == 32) {
+ finalVal =(int32_t)val;
+ } else if (width == 64) {
+ finalVal = result;
+ } else {
+ panic("Unsupported width %d\n", width);
+ }
+
+ // check if value is in range
+ int64_t minVal = ~mask(width-1);
+ if ((double)val < minVal) {
+ outOfRange = true;
+ finalVal = minVal;
+ }
+ int64_t maxVal = mask(width-1);
+ if ((double)val > maxVal) {
+ outOfRange = true;
+ finalVal = maxVal;
+ }
+ } else {
+ bool isNeg = val < 0;
+ finalVal = result & mask(width);
+ // If the result is supposed to be less than 64 bits check that the
+ // upper bits that got thrown away are just sign extension bits
+ if (width != 64) {
+ outOfRange = ((uint64_t) result >> (width - 1)) !=
+ (isNeg ? mask(64-width+1) : 0);
+ }
+ // If the original floating point value doesn't match the integer
+ // version, we are also out of range, so create a saturated
+ // result.
+ if (isNeg) {
+ outOfRange |= val < result;
+ if (outOfRange) {
+ finalVal = 1LL << (width-1);
+ }
+ } else {
+ outOfRange |= val > result;
+ if (outOfRange) {
+ finalVal = mask(width-1);
+ }
+ }
+ }
+
+ // Raise an exception if the value was out of range
+ if (outOfRange) {
+ exceptions |= FeInvalid;
+ exceptions &= ~FeInexact;
+ }
+ setFPExceptions(exceptions);
+ return finalVal;
+ } else {
+ if ((double)val < 0) {
+ exceptions |= FeInvalid;
+ exceptions &= ~FeInexact;
+ setFPExceptions(exceptions);
+ return 0;
+ }
+
+ uint64_t result = ((uint64_t) val) & mask(width);
+ if (val > result) {
+ exceptions |= FeInvalid;
+ exceptions &= ~FeInexact;
+ setFPExceptions(exceptions);
+ return mask(width);
+ }
+
+ setFPExceptions(exceptions);
+ return result;
+ }
+};
+
+
float vfpUFixedToFpS(bool flush, bool defaultNan,
- uint32_t val, bool half, uint8_t imm);
+ uint64_t val, uint8_t width, uint8_t imm);
float vfpSFixedToFpS(bool flush, bool defaultNan,
- int32_t val, bool half, uint8_t imm);
+ int64_t val, uint8_t width, uint8_t imm);
-uint64_t vfpFpDToFixed(double val, bool isSigned, bool half,
- uint8_t imm, bool rzero = true);
double vfpUFixedToFpD(bool flush, bool defaultNan,
- uint32_t val, bool half, uint8_t imm);
+ uint64_t val, uint8_t width, uint8_t imm);
double vfpSFixedToFpD(bool flush, bool defaultNan,
- int32_t val, bool half, uint8_t imm);
+ int64_t val, uint8_t width, uint8_t imm);
float fprSqrtEstimate(FPSCR &fpscr, float op);
uint32_t unsignedRSqrtEstimate(uint32_t op);
@@ -292,6 +470,20 @@ class VfpMacroOp : public PredMacroOp
void nextIdxs(IntRegIndex &dest);
};
+template <typename T>
+static inline T
+fpAdd(T a, T b)
+{
+ return a + b;
+};
+
+template <typename T>
+static inline T
+fpSub(T a, T b)
+{
+ return a - b;
+};
+
static inline float
fpAddS(float a, float b)
{
@@ -328,6 +520,54 @@ fpDivD(double a, double b)
return a / b;
}
+template <typename T>
+static inline T
+fpDiv(T a, T b)
+{
+ return a / b;
+};
+
+template <typename T>
+static inline T
+fpMulX(T a, T b)
+{
+ uint64_t opData;
+ uint32_t sign1;
+ uint32_t sign2;
+ const bool single = (sizeof(T) == sizeof(float));
+ if (single) {
+ opData = (fpToBits(a));
+ sign1 = opData>>31;
+ opData = (fpToBits(b));
+ sign2 = opData>>31;
+ } else {
+ opData = (fpToBits(a));
+ sign1 = opData>>63;
+ opData = (fpToBits(b));
+ sign2 = opData>>63;
+ }
+ bool inf1 = (std::fpclassify(a) == FP_INFINITE);
+ bool inf2 = (std::fpclassify(b) == FP_INFINITE);
+ bool zero1 = (std::fpclassify(a) == FP_ZERO);
+ bool zero2 = (std::fpclassify(b) == FP_ZERO);
+ if ((inf1 && zero2) || (zero1 && inf2)) {
+ if(sign1 ^ sign2)
+ return (T)(-2.0);
+ else
+ return (T)(2.0);
+ } else {
+ return (a * b);
+ }
+};
+
+
+template <typename T>
+static inline T
+fpMul(T a, T b)
+{
+ return a * b;
+};
+
static inline float
fpMulS(float a, float b)
{
@@ -340,23 +580,140 @@ fpMulD(double a, double b)
return a * b;
}
-static inline float
-fpMaxS(float a, float b)
+template <typename T>
+static inline T
+// @todo remove this when all calls to it have been replaced with the new fplib implementation
+fpMulAdd(T op1, T op2, T addend)
+{
+ T result;
+
+ if (sizeof(T) == sizeof(float))
+ result = fmaf(op1, op2, addend);
+ else
+ result = fma(op1, op2, addend);
+
+ // ARM doesn't generate signed NaNs from this operation, so fix up the result
+ if (std::isnan(result) && !std::isnan(op1) &&
+ !std::isnan(op2) && !std::isnan(addend))
+ {
+ uint64_t bitMask = ULL(0x1) << ((sizeof(T) * 8) - 1);
+ result = bitsToFp(fpToBits(result) & ~bitMask, op1);
+ }
+ return result;
+}
+
+template <typename T>
+static inline T
+fpRIntX(T a, FPSCR &fpscr)
+{
+ T rVal;
+
+ rVal = rint(a);
+ if (rVal != a && !std::isnan(a))
+ fpscr.ixc = 1;
+ return (rVal);
+};
+
+template <typename T>
+static inline T
+fpMaxNum(T a, T b)
{
+ const bool single = (sizeof(T) == sizeof(float));
+ const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
+
+ if (std::isnan(a))
+ return ((fpToBits(a) & qnan) == qnan) ? b : a;
+ if (std::isnan(b))
+ return ((fpToBits(b) & qnan) == qnan) ? a : b;
// Handle comparisons of +0 and -0.
if (!std::signbit(a) && std::signbit(b))
return a;
- return fmaxf(a, b);
-}
+ return fmax(a, b);
+};
-static inline float
-fpMinS(float a, float b)
+template <typename T>
+static inline T
+fpMax(T a, T b)
{
+ if (std::isnan(a))
+ return a;
+ if (std::isnan(b))
+ return b;
+ return fpMaxNum<T>(a, b);
+};
+
+template <typename T>
+static inline T
+fpMinNum(T a, T b)
+{
+ const bool single = (sizeof(T) == sizeof(float));
+ const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
+
+ if (std::isnan(a))
+ return ((fpToBits(a) & qnan) == qnan) ? b : a;
+ if (std::isnan(b))
+ return ((fpToBits(b) & qnan) == qnan) ? a : b;
// Handle comparisons of +0 and -0.
if (std::signbit(a) && !std::signbit(b))
return a;
- return fminf(a, b);
-}
+ return fmin(a, b);
+};
+
+template <typename T>
+static inline T
+fpMin(T a, T b)
+{
+ if (std::isnan(a))
+ return a;
+ if (std::isnan(b))
+ return b;
+ return fpMinNum<T>(a, b);
+};
+
+template <typename T>
+static inline T
+fpRSqrts(T a, T b)
+{
+ int fpClassA = std::fpclassify(a);
+ int fpClassB = std::fpclassify(b);
+ T aXb;
+ int fpClassAxB;
+
+ if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) ||
+ (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) {
+ return 1.5;
+ }
+ aXb = a*b;
+ fpClassAxB = std::fpclassify(aXb);
+ if(fpClassAxB == FP_SUBNORMAL) {
+ feraiseexcept(FeUnderflow);
+ return 1.5;
+ }
+ return (3.0 - (a * b)) / 2.0;
+};
+
+template <typename T>
+static inline T
+fpRecps(T a, T b)
+{
+ int fpClassA = std::fpclassify(a);
+ int fpClassB = std::fpclassify(b);
+ T aXb;
+ int fpClassAxB;
+
+ if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) ||
+ (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) {
+ return 2.0;
+ }
+ aXb = a*b;
+ fpClassAxB = std::fpclassify(aXb);
+ if(fpClassAxB == FP_SUBNORMAL) {
+ feraiseexcept(FeUnderflow);
+ return 2.0;
+ }
+ return 2.0 - (a * b);
+};
+
static inline float
fpRSqrtsS(float a, float b)
@@ -400,6 +757,23 @@ fpRecpsS(float a, float b)
return 2.0 - (a * b);
}
+template <typename T>
+static inline T
+roundNEven(T a) {
+ T val;
+
+ val = round(a);
+ if (a - val == 0.5) {
+ if ( (((int) a) & 1) == 0 ) val += 1.0;
+ }
+ else if (a - val == -0.5) {
+ if ( (((int) a) & 1) == 0 ) val -= 1.0;
+ }
+ return val;
+}
+
+
+
class FpOp : public PredOp
{
protected:
@@ -457,6 +831,12 @@ class FpOp : public PredOp
template <class fpType>
fpType
+ ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
+ fpType (*func)(fpType, fpType, fpType),
+ bool flush, bool defaultNan, uint32_t rMode) const;
+
+ template <class fpType>
+ fpType
binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
fpType (*func)(fpType, fpType),
bool flush, bool defaultNan, uint32_t rMode) const;
@@ -478,6 +858,55 @@ class FpOp : public PredOp
pcState.advance();
}
}
+
+ float
+ fpSqrt (FPSCR fpscr,float x) const
+ {
+
+ return unaryOp(fpscr,x,sqrtf,fpscr.fz,fpscr.rMode);
+
+ }
+
+ double
+ fpSqrt (FPSCR fpscr,double x) const
+ {
+
+ return unaryOp(fpscr,x,sqrt,fpscr.fz,fpscr.rMode);
+
+ }
+};
+
+class FpCondCompRegOp : public FpOp
+{
+ protected:
+ IntRegIndex op1, op2;
+ ConditionCode condCode;
+ uint8_t defCc;
+
+ FpCondCompRegOp(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, IntRegIndex _op1, IntRegIndex _op2,
+ ConditionCode _condCode, uint8_t _defCc) :
+ FpOp(mnem, _machInst, __opClass),
+ op1(_op1), op2(_op2), condCode(_condCode), defCc(_defCc)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class FpCondSelOp : public FpOp
+{
+ protected:
+ IntRegIndex dest, op1, op2;
+ ConditionCode condCode;
+
+ FpCondSelOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+ ConditionCode _condCode) :
+ FpOp(mnem, _machInst, __opClass),
+ dest(_dest), op1(_op1), op2(_op2), condCode(_condCode)
+ {}
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
};
class FpRegRegOp : public FpOp
@@ -550,6 +979,26 @@ class FpRegRegRegOp : public FpOp
std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
};
+class FpRegRegRegRegOp : public FpOp
+{
+ protected:
+ IntRegIndex dest;
+ IntRegIndex op1;
+ IntRegIndex op2;
+ IntRegIndex op3;
+
+ FpRegRegRegRegOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+ IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+ IntRegIndex _op3, VfpMicroMode mode = VfpNotAMicroop) :
+ FpOp(mnem, _machInst, __opClass), dest(_dest), op1(_op1), op2(_op2),
+ op3(_op3)
+ {
+ setVfpMicroFlags(mode, flags);
+ }
+
+ std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
class FpRegRegRegImmOp : public FpOp
{
protected: