From 612f8f074fa1099cf70faf495d46cc647762a031 Mon Sep 17 00:00:00 2001
From: ARM gem5 Developers
Date: Fri, 24 Jan 2014 15:29:34 -0600
Subject: arm: Add support for ARMv8 (AArch64 & AArch32)

Note: AArch64 and AArch32 interworking is not supported. If you use an
AArch64 kernel you are restricted to AArch64 user-mode binaries. This
will be addressed in a later patch.

Note: Virtualization is only supported in AArch32 mode. This will also
be fixed in a later patch.

Contributors:
Giacomo Gabrielli (TrustZone, LPAE, system-level AArch64, AArch64 NEON, validation)
Thomas Grocutt (AArch32 Virtualization, AArch64 FP, validation)
Mbou Eyole (AArch64 NEON, validation)
Ali Saidi (AArch64 Linux support, code integration, validation)
Edmund Grimley-Evans (AArch64 FP)
William Wang (AArch64 Linux support)
Rene De Jong (AArch64 Linux support, performance opt.)
Matt Horsnell (AArch64 MP, validation)
Matt Evans (device models, code integration, validation)
Chris Adeniyi-Jones (AArch64 syscall-emulation)
Prakash Ramrakhyani (validation)
Dam Sunwoo (validation)
Chander Sudanthi (validation)
Stephan Diestelhorst (validation)
Andreas Hansson (code integration, performance opt.)
Eric Van Hensbergen (performance opt.)
Gabe Black
---
 src/arch/arm/insts/vfp.hh | 489 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 469 insertions(+), 20 deletions(-)

(limited to 'src/arch/arm/insts/vfp.hh')

diff --git a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh
index 9babaae04..f17f90973 100644
--- a/src/arch/arm/insts/vfp.hh
+++ b/src/arch/arm/insts/vfp.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010-2013 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -104,7 +104,8 @@ enum VfpRoundingMode
     VfpRoundNearest = 0,
     VfpRoundUpward = 1,
     VfpRoundDown = 2,
-    VfpRoundZero = 3
+    VfpRoundZero = 3,
+    VfpRoundAway = 4
 };
 
 static inline float bitsToFp(uint64_t, float);
@@ -212,7 +213,7 @@ isSnan(fpType val)
 typedef int VfpSavedState;
 
 VfpSavedState prepFpState(uint32_t rMode);
-void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush);
+void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask = FpscrExcMask);
 
 template <class fpType>
 fpType fixDest(FPSCR fpscr, fpType val, fpType op1);
@@ -228,7 +229,11 @@ double fixFpSFpDDest(FPSCR fpscr, float val);
 
 uint16_t vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
         uint32_t rMode, bool ahp, float op);
-float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
+uint16_t vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
+        uint32_t rMode, bool ahp, double op);
+
+float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
+double vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
 
 static inline double
 makeDouble(uint32_t low, uint32_t high)
@@ -249,19 +254,192 @@ highFromDouble(double val)
     return fpToBits(val) >> 32;
 }
 
-uint64_t vfpFpSToFixed(float val, bool isSigned, bool half,
-                       uint8_t imm, bool rzero = true);
+static inline void
+setFPExceptions(int exceptions) {
+    feclearexcept(FeAllExceptions);
+    feraiseexcept(exceptions);
+}
+
+template <typename T>
+uint64_t
+vfpFpToFixed(T val, bool isSigned, uint8_t width, uint8_t imm, bool
+             useRmode = true, VfpRoundingMode roundMode = VfpRoundZero,
+             bool aarch64 = false)
+{
+    int rmode;
+    bool roundAwayFix = false;
+
+    if (!useRmode) {
+        rmode = fegetround();
+    } else {
+        switch (roundMode)
+        {
+          case VfpRoundNearest:
+            rmode = FeRoundNearest;
+            break;
+          case VfpRoundUpward:
+            rmode = FeRoundUpward;
+            break;
+          case VfpRoundDown:
+            rmode = FeRoundDown;
+            break;
+          case VfpRoundZero:
+            rmode = FeRoundZero;
+            break;
+          case VfpRoundAway:
+            // There is no equivalent rounding mode, use round down and we'll
+            // fix it later
+            rmode = FeRoundDown;
+            roundAwayFix = true;
+            break;
+          default:
+            panic("Unsupported roundMode %d\n", roundMode);
+        }
+    }
+    __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode));
+    fesetround(FeRoundNearest);
+    val = val * pow(2.0, imm);
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+    fesetround(rmode);
+    feclearexcept(FeAllExceptions);
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+    T origVal = val;
+    val = rint(val);
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+
+    int exceptions = fetestexcept(FeAllExceptions);
+
+    int fpType = std::fpclassify(val);
+    if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
+        if (fpType == FP_NAN) {
+            exceptions |= FeInvalid;
+        }
+        val = 0.0;
+    } else if (origVal != val) {
+        switch (rmode) {
+          case FeRoundNearest:
+            if (origVal - val > 0.5)
+                val += 1.0;
+            else if (val - origVal > 0.5)
+                val -= 1.0;
+            break;
+          case FeRoundDown:
+            if (roundAwayFix) {
+                // The ordering on the subtraction looks a bit odd in that we
+                // don't do the obvious origVal - val, instead we do
+                // -(val - origVal). This is required to get the correct
+                // bit-exact behaviour when very close to the 0.5 threshold.
+                volatile T error = val;
+                error -= origVal;
+                error = -error;
+                if ( (error > 0.5) ||
+                    ((error == 0.5) && (val >= 0)) )
+                    val += 1.0;
+            } else {
+                if (origVal < val)
+                    val -= 1.0;
+            }
+            break;
+          case FeRoundUpward:
+            if (origVal > val)
+                val += 1.0;
+            break;
+        }
+        exceptions |= FeInexact;
+    }
+
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+
+    if (isSigned) {
+        bool outOfRange = false;
+        int64_t result = (int64_t) val;
+        uint64_t finalVal;
+
+        if (!aarch64) {
+            if (width == 16) {
+                finalVal = (int16_t)val;
+            } else if (width == 32) {
+                finalVal = (int32_t)val;
+            } else if (width == 64) {
+                finalVal = result;
+            } else {
+                panic("Unsupported width %d\n", width);
+            }
+
+            // check if value is in range
+            int64_t minVal = ~mask(width-1);
+            if ((double)val < minVal) {
+                outOfRange = true;
+                finalVal = minVal;
+            }
+            int64_t maxVal = mask(width-1);
+            if ((double)val > maxVal) {
+                outOfRange = true;
+                finalVal = maxVal;
+            }
+        } else {
+            bool isNeg = val < 0;
+            finalVal = result & mask(width);
+            // If the result is supposed to be less than 64 bits check that the
+            // upper bits that got thrown away are just sign extension bits
+            if (width != 64) {
+                outOfRange = ((uint64_t) result >> (width - 1)) !=
+                             (isNeg ? mask(64-width+1) : 0);
+            }
+            // If the original floating point value doesn't match the
+            // integer version, we are also out of range, so create a
+            // saturated result.
+            if (isNeg) {
+                outOfRange |= val < result;
+                if (outOfRange) {
+                    finalVal = 1LL << (width-1);
+                }
+            } else {
+                outOfRange |= val > result;
+                if (outOfRange) {
+                    finalVal = mask(width-1);
+                }
+            }
+        }
+
+        // Raise an exception if the value was out of range
+        if (outOfRange) {
+            exceptions |= FeInvalid;
+            exceptions &= ~FeInexact;
+        }
+        setFPExceptions(exceptions);
+        return finalVal;
+    } else {
+        if ((double)val < 0) {
+            exceptions |= FeInvalid;
+            exceptions &= ~FeInexact;
+            setFPExceptions(exceptions);
+            return 0;
+        }
+
+        uint64_t result = ((uint64_t) val) & mask(width);
+        if (val > result) {
+            exceptions |= FeInvalid;
+            exceptions &= ~FeInexact;
+            setFPExceptions(exceptions);
+            return mask(width);
+        }
+
+        setFPExceptions(exceptions);
+        return result;
+    }
+};
+
+
 float vfpUFixedToFpS(bool flush, bool defaultNan,
-        uint32_t val, bool half, uint8_t imm);
+        uint64_t val, uint8_t width, uint8_t imm);
 float vfpSFixedToFpS(bool flush, bool defaultNan,
-        int32_t val, bool half, uint8_t imm);
+        int64_t val, uint8_t width, uint8_t imm);
 
-uint64_t vfpFpDToFixed(double val, bool isSigned, bool half,
-                       uint8_t imm, bool rzero = true);
 double vfpUFixedToFpD(bool flush, bool defaultNan,
-        uint32_t val, bool half, uint8_t imm);
+        uint64_t val, uint8_t width, uint8_t imm);
 double vfpSFixedToFpD(bool flush, bool defaultNan,
-        int32_t val, bool half, uint8_t imm);
+        int64_t val, uint8_t width, uint8_t imm);
 
 float fprSqrtEstimate(FPSCR &fpscr, float op);
 uint32_t unsignedRSqrtEstimate(uint32_t op);
@@ -292,6 +470,20 @@ class VfpMacroOp : public PredMacroOp
     void nextIdxs(IntRegIndex &dest);
 };
 
+template <typename T>
+static inline T
+fpAdd(T a, T b)
+{
+    return a + b;
+};
+
+template <typename T>
+static inline T
+fpSub(T a, T b)
+{
+    return a - b;
+};
+
 static inline float
 fpAddS(float a, float b)
 {
@@ -328,6 +520,54 @@ fpDivD(double a, double b)
     return a / b;
 }
 
+template <typename T>
+static inline T
+fpDiv(T a, T b)
+{
+    return a / b;
+};
+
+template <typename T>
+static inline T
+fpMulX(T a, T b)
+{
+    uint64_t opData;
+    uint32_t sign1;
+    uint32_t sign2;
+    const bool single = (sizeof(T) == sizeof(float));
+    if (single) {
+        opData = (fpToBits(a));
+        sign1 = opData>>31;
+        opData = (fpToBits(b));
+        sign2 = opData>>31;
+    } else {
+        opData = (fpToBits(a));
+        sign1 = opData>>63;
+        opData = (fpToBits(b));
+        sign2 = opData>>63;
+    }
+    bool inf1 = (std::fpclassify(a) == FP_INFINITE);
+    bool inf2 = (std::fpclassify(b) == FP_INFINITE);
+    bool zero1 = (std::fpclassify(a) == FP_ZERO);
+    bool zero2 = (std::fpclassify(b) == FP_ZERO);
+    if ((inf1 && zero2) || (zero1 && inf2)) {
+        if(sign1 ^ sign2)
+            return (T)(-2.0);
+        else
+            return (T)(2.0);
+    } else {
+        return (a * b);
+    }
+};
+
+
+template <typename T>
+static inline T
+fpMul(T a, T b)
+{
+    return a * b;
+};
+
 static inline float
 fpMulS(float a, float b)
 {
@@ -340,23 +580,140 @@ fpMulD(double a, double b)
     return a * b;
 }
 
-static inline float
-fpMaxS(float a, float b)
+template <typename T>
+static inline T
+// @todo remove this when all calls to it have been replaced with the new fplib implementation
+fpMulAdd(T op1, T op2, T addend)
+{
+    T result;
+
+    if (sizeof(T) == sizeof(float))
+        result = fmaf(op1, op2, addend);
+    else
+        result = fma(op1, op2, addend);
+
+    // ARM doesn't generate signed NaNs from this operation, so fix up the result
+    if (std::isnan(result) && !std::isnan(op1) &&
+        !std::isnan(op2) && !std::isnan(addend))
+    {
+        uint64_t bitMask = ULL(0x1) << ((sizeof(T) * 8) - 1);
+        result = bitsToFp(fpToBits(result) & ~bitMask, op1);
+    }
+    return result;
+}
+
+template <typename T>
+static inline T
+fpRIntX(T a, FPSCR &fpscr)
+{
+    T rVal;
+
+    rVal = rint(a);
+    if (rVal != a && !std::isnan(a))
+        fpscr.ixc = 1;
+    return (rVal);
+};
+
+template <typename T>
+static inline T
+fpMaxNum(T a, T b)
 {
+    const bool single = (sizeof(T) == sizeof(float));
+    const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
+
+    if (std::isnan(a))
+        return ((fpToBits(a) & qnan) == qnan) ? b : a;
+    if (std::isnan(b))
+        return ((fpToBits(b) & qnan) == qnan) ? a : b;
     // Handle comparisons of +0 and -0.
     if (!std::signbit(a) && std::signbit(b))
         return a;
-    return fmaxf(a, b);
-}
+    return fmax(a, b);
+};
 
-static inline float
-fpMinS(float a, float b)
+template <typename T>
+static inline T
+fpMax(T a, T b)
 {
+    if (std::isnan(a))
+        return a;
+    if (std::isnan(b))
+        return b;
+    return fpMaxNum<T>(a, b);
+};
+
+template <typename T>
+static inline T
+fpMinNum(T a, T b)
+{
+    const bool single = (sizeof(T) == sizeof(float));
+    const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
+
+    if (std::isnan(a))
+        return ((fpToBits(a) & qnan) == qnan) ? b : a;
+    if (std::isnan(b))
+        return ((fpToBits(b) & qnan) == qnan) ? a : b;
     // Handle comparisons of +0 and -0.
     if (std::signbit(a) && !std::signbit(b))
         return a;
-    return fminf(a, b);
-}
+    return fmin(a, b);
+};
+
+template <typename T>
+static inline T
+fpMin(T a, T b)
+{
+    if (std::isnan(a))
+        return a;
+    if (std::isnan(b))
+        return b;
+    return fpMinNum<T>(a, b);
+};
+
+template <typename T>
+static inline T
+fpRSqrts(T a, T b)
+{
+    int fpClassA = std::fpclassify(a);
+    int fpClassB = std::fpclassify(b);
+    T aXb;
+    int fpClassAxB;
+
+    if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) ||
+        (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) {
+        return 1.5;
+    }
+    aXb = a*b;
+    fpClassAxB = std::fpclassify(aXb);
+    if(fpClassAxB == FP_SUBNORMAL) {
+        feraiseexcept(FeUnderflow);
+        return 1.5;
+    }
+    return (3.0 - (a * b)) / 2.0;
+};
+
+template <typename T>
+static inline T
+fpRecps(T a, T b)
+{
+    int fpClassA = std::fpclassify(a);
+    int fpClassB = std::fpclassify(b);
+    T aXb;
+    int fpClassAxB;
+
+    if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) ||
+        (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) {
+        return 2.0;
+    }
+    aXb = a*b;
+    fpClassAxB = std::fpclassify(aXb);
+    if(fpClassAxB == FP_SUBNORMAL) {
+        feraiseexcept(FeUnderflow);
+        return 2.0;
+    }
+    return 2.0 - (a * b);
+};
+
 
 static inline float
 fpRSqrtsS(float a, float b)
@@ -400,6 +757,23 @@ fpRecpsS(float a, float b)
     return 2.0 - (a * b);
 }
 
+template <typename T>
+static inline T
+roundNEven(T a) {
+    T val;
+
+    val = round(a);
+    if (a - val == 0.5) {
+        if ( (((int) a) & 1) == 0 ) val += 1.0;
+    }
+    else if (a - val == -0.5) {
+        if ( (((int) a) & 1) == 0 ) val -= 1.0;
+    }
+    return val;
+}
+
+
+
 class FpOp : public PredOp
 {
   protected:
@@ -455,6 +829,12 @@ class FpOp : public PredOp
     processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                 fpType op1, fpType op2) const;
 
+    template <class fpType>
+    fpType
+    ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
+              fpType (*func)(fpType, fpType, fpType),
+              bool flush, bool defaultNan, uint32_t rMode) const;
+
     template <class fpType>
     fpType
     binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
@@ -478,6 +858,55 @@ class FpOp : public PredOp
             pcState.advance();
         }
     }
+
+    float
+    fpSqrt (FPSCR fpscr,float x) const
+    {
+
+        return unaryOp(fpscr,x,sqrtf,fpscr.fz,fpscr.rMode);
+
+    }
+
+    double
+    fpSqrt (FPSCR fpscr,double x) const
+    {
+
+        return unaryOp(fpscr,x,sqrt,fpscr.fz,fpscr.rMode);
+
+    }
+};
+
+class FpCondCompRegOp : public FpOp
+{
+  protected:
+    IntRegIndex op1, op2;
+    ConditionCode condCode;
+    uint8_t defCc;
+
+    FpCondCompRegOp(const char *mnem, ExtMachInst _machInst,
+                    OpClass __opClass, IntRegIndex _op1, IntRegIndex _op2,
+                    ConditionCode _condCode, uint8_t _defCc) :
+        FpOp(mnem, _machInst, __opClass),
+        op1(_op1), op2(_op2), condCode(_condCode), defCc(_defCc)
+    {}
+
+    std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class FpCondSelOp : public FpOp
+{
+  protected:
+    IntRegIndex dest, op1, op2;
+    ConditionCode condCode;
+
+    FpCondSelOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+                ConditionCode _condCode) :
+        FpOp(mnem, _machInst, __opClass),
+        dest(_dest), op1(_op1), op2(_op2), condCode(_condCode)
+    {}
+
+    std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
 };
 
 class FpRegRegOp : public FpOp
@@ -550,6 +979,26 @@ class FpRegRegRegOp : public FpOp
     std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
 };
 
+class FpRegRegRegRegOp : public FpOp
+{
+  protected:
+    IntRegIndex dest;
+    IntRegIndex op1;
+    IntRegIndex op2;
+    IntRegIndex op3;
+
+    FpRegRegRegRegOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                     IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+                     IntRegIndex _op3, VfpMicroMode mode = VfpNotAMicroop) :
+        FpOp(mnem, _machInst, __opClass), dest(_dest), op1(_op1), op2(_op2),
+        op3(_op3)
+    {
+        setVfpMicroFlags(mode, flags);
+    }
+
+    std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
 class FpRegRegRegImmOp : public FpOp
 {
   protected:
-- 
cgit v1.2.3
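
The VfpRoundAway case in vfpFpToFixed() above emulates ARM's round-to-nearest,
ties-away-from-zero mode on hosts whose FPU only exposes the four C99 rounding
modes: round toward minus infinity first, then nudge the result up whenever the
discarded fraction reaches one half. A minimal self-contained sketch of that
fixup (the helper name roundAway and the test values are invented for
illustration, not part of the patch):

    // Illustrative only: round-to-nearest, ties away from zero, built
    // from FE_DOWNWARD plus a correction, as the VfpRoundAway path does.
    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    static double roundAway(double orig)
    {
        std::fesetround(FE_DOWNWARD);
        double val = std::rint(orig);   // round toward minus infinity
        double error = -(val - orig);   // discarded fraction, in [0, 1)
        if (error > 0.5 || (error == 0.5 && val >= 0))
            val += 1.0;                 // push ties away from zero
        return val;
    }

    int main()
    {
        // Halfway cases move away from zero: prints 1 -1 3 -3.
        std::printf("%g %g %g %g\n", roundAway(0.5), roundAway(-0.5),
                    roundAway(2.5), roundAway(-2.5));
        return 0;
    }

The (error == 0.5 && val >= 0) guard leaves negative halfway cases on the
rounded-down side, which for them is already away from zero.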
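fpMulX() above implements the extra rule that AArch64's FMULX adds to an
ordinary multiply: infinity times zero yields +/-2.0, with the sign taken from
the XOR of the operand signs, instead of the IEEE default NaN. A self-contained
sketch of the same rule, substituting std::isinf and std::signbit for the
patch's explicit sign-bit extraction (the name fmulxLike is made up for this
sketch):

    // Illustrative only: the FMULX special case implemented by fpMulX().
    #include <cmath>
    #include <cstdio>

    template <typename T>
    static T fmulxLike(T a, T b)
    {
        bool infZero = (std::isinf(a) && b == T(0)) ||
                       (a == T(0) && std::isinf(b));
        if (infZero) // IEEE multiply would give NaN; FMULX defines +/-2.0
            return (std::signbit(a) != std::signbit(b)) ? T(-2.0) : T(2.0);
        return a * b;
    }

    int main()
    {
        double inf = INFINITY;
        std::printf("%g %g %g\n",
                    fmulxLike(inf, 0.0),    // 2
                    fmulxLike(-inf, 0.0),   // -2
                    fmulxLike(3.0, 4.0));   // 12
        return 0;
    }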
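roundNEven() above builds round-to-nearest-even out of round(), whose ties go
away from zero: exact halfway cases are nudged back to the even neighbour
using the parity of the truncated input. The same tie-breaking logic as a
runnable check (the test harness is invented for illustration):

    // Illustrative only: the roundNEven() tie-breaking logic.
    #include <cmath>
    #include <cassert>

    static double roundNEven(double a)
    {
        double val = std::round(a);   // ties away from zero
        if (a - val == 0.5) {         // negative tie, rounded down by round()
            if ((((int) a) & 1) == 0) val += 1.0;
        } else if (a - val == -0.5) { // positive tie, rounded up by round()
            if ((((int) a) & 1) == 0) val -= 1.0;
        }
        return val;
    }

    int main()
    {
        assert(roundNEven(0.5) == 0.0);    // down to even 0
        assert(roundNEven(1.5) == 2.0);    // up to even 2
        assert(roundNEven(2.5) == 2.0);    // down to even 2
        assert(roundNEven(-0.5) == 0.0);   // up to even 0
        return 0;
    }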