From 612f8f074fa1099cf70faf495d46cc647762a031 Mon Sep 17 00:00:00 2001
From: ARM gem5 Developers
Date: Fri, 24 Jan 2014 15:29:34 -0600
Subject: arm: Add support for ARMv8 (AArch64 & AArch32)

Note: AArch64 and AArch32 interworking is not supported. If you use an
AArch64 kernel you are restricted to AArch64 user-mode binaries. This
will be addressed in a later patch.

Note: Virtualization is only supported in AArch32 mode. This will also
be fixed in a later patch.

Contributors:
Giacomo Gabrielli (TrustZone, LPAE, system-level AArch64, AArch64 NEON, validation)
Thomas Grocutt (AArch32 Virtualization, AArch64 FP, validation)
Mbou Eyole (AArch64 NEON, validation)
Ali Saidi (AArch64 Linux support, code integration, validation)
Edmund Grimley-Evans (AArch64 FP)
William Wang (AArch64 Linux support)
Rene De Jong (AArch64 Linux support, performance opt.)
Matt Horsnell (AArch64 MP, validation)
Matt Evans (device models, code integration, validation)
Chris Adeniyi-Jones (AArch64 syscall-emulation)
Prakash Ramrakhyani (validation)
Dam Sunwoo (validation)
Chander Sudanthi (validation)
Stephan Diestelhorst (validation)
Andreas Hansson (code integration, performance opt.)
Eric Van Hensbergen (performance opt.)
Gabe Black
---
 src/arch/arm/insts/vfp.hh | 489 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 469 insertions(+), 20 deletions(-)

(limited to 'src/arch/arm/insts/vfp.hh')

diff --git a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh
index 9babaae04..f17f90973 100644
--- a/src/arch/arm/insts/vfp.hh
+++ b/src/arch/arm/insts/vfp.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010 ARM Limited
+ * Copyright (c) 2010-2013 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -104,7 +104,8 @@ enum VfpRoundingMode
     VfpRoundNearest = 0,
     VfpRoundUpward = 1,
     VfpRoundDown = 2,
-    VfpRoundZero = 3
+    VfpRoundZero = 3,
+    VfpRoundAway = 4
 };
 
 static inline float bitsToFp(uint64_t, float);
@@ -212,7 +213,7 @@ isSnan(fpType val)
 typedef int VfpSavedState;
 
 VfpSavedState prepFpState(uint32_t rMode);
-void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush);
+void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask = FpscrExcMask);
 
 template <class fpType>
 fpType fixDest(FPSCR fpscr, fpType val, fpType op1);
@@ -228,7 +229,11 @@ double fixFpSFpDDest(FPSCR fpscr, float val);
 
 uint16_t vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
         uint32_t rMode, bool ahp, float op);
-float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
+uint16_t vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
+        uint32_t rMode, bool ahp, double op);
+
+float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
+double vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op);
 
 static inline double
 makeDouble(uint32_t low, uint32_t high)
@@ -249,19 +254,192 @@ highFromDouble(double val)
     return fpToBits(val) >> 32;
 }
 
-uint64_t vfpFpSToFixed(float val, bool isSigned, bool half,
-                       uint8_t imm, bool rzero = true);
+static inline void
+setFPExceptions(int exceptions) {
+    feclearexcept(FeAllExceptions);
+    feraiseexcept(exceptions);
+}
+
+template <typename T>
+uint64_t
+vfpFpToFixed(T val, bool isSigned, uint8_t width, uint8_t imm, bool
+             useRmode = true, VfpRoundingMode roundMode = VfpRoundZero,
+             bool aarch64 = false)
+{
+    int rmode;
+    bool roundAwayFix = false;
+
+    if (!useRmode) {
+        rmode = fegetround();
+    } else {
+        switch (roundMode)
+        {
+          case VfpRoundNearest:
+            rmode = FeRoundNearest;
+            break;
+          case VfpRoundUpward:
+            rmode = FeRoundUpward;
+            break;
+          case VfpRoundDown:
+            rmode = FeRoundDown;
+            break;
+          case VfpRoundZero:
+            rmode = FeRoundZero;
+            break;
+          case VfpRoundAway:
+            // There is no equivalent rounding mode, use round down and we'll
+            // fix it later
+            rmode = FeRoundDown;
+            roundAwayFix = true;
+            break;
+          default:
+            panic("Unsupported roundMode %d\n", roundMode);
+        }
+    }
+    __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode));
+    fesetround(FeRoundNearest);
+    val = val * pow(2.0, imm);
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+    fesetround(rmode);
+    feclearexcept(FeAllExceptions);
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+    T origVal = val;
+    val = rint(val);
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+
+    int exceptions = fetestexcept(FeAllExceptions);
+
+    int fpType = std::fpclassify(val);
+    if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
+        if (fpType == FP_NAN) {
+            exceptions |= FeInvalid;
+        }
+        val = 0.0;
+    } else if (origVal != val) {
+        switch (rmode) {
+          case FeRoundNearest:
+            if (origVal - val > 0.5)
+                val += 1.0;
+            else if (val - origVal > 0.5)
+                val -= 1.0;
+            break;
+          case FeRoundDown:
+            if (roundAwayFix) {
+                // The ordering on the subtraction looks a bit odd in that we
+                // don't do the obvious origVal - val, instead we do
+                // -(val - origVal). This is required to get the correct
+                // bit-exact behaviour when very close to the 0.5 threshold.
+                volatile T error = val;
+                error -= origVal;
+                error = -error;
+                if ( (error > 0.5) ||
+                    ((error == 0.5) && (val >= 0)) )
+                    val += 1.0;
+            } else {
+                if (origVal < val)
+                    val -= 1.0;
+            }
+            break;
+          case FeRoundUpward:
+            if (origVal > val)
+                val += 1.0;
+            break;
+        }
+        exceptions |= FeInexact;
+    }
+
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+
+    if (isSigned) {
+        bool outOfRange = false;
+        int64_t result = (int64_t) val;
+        uint64_t finalVal;
+
+        if (!aarch64) {
+            if (width == 16) {
+                finalVal = (int16_t)val;
+            } else if (width == 32) {
+                finalVal = (int32_t)val;
+            } else if (width == 64) {
+                finalVal = result;
+            } else {
+                panic("Unsupported width %d\n", width);
+            }
+
+            // check if value is in range
+            int64_t minVal = ~mask(width-1);
+            if ((double)val < minVal) {
+                outOfRange = true;
+                finalVal = minVal;
+            }
+            int64_t maxVal = mask(width-1);
+            if ((double)val > maxVal) {
+                outOfRange = true;
+                finalVal = maxVal;
+            }
+        } else {
+            bool isNeg = val < 0;
+            finalVal = result & mask(width);
+            // If the result is supposed to be less than 64 bits check that the
+            // upper bits that got thrown away are just sign extension bits
+            if (width != 64) {
+                outOfRange = ((uint64_t) result >> (width - 1)) !=
+                             (isNeg ? mask(64-width+1) : 0);
+            }
+            // If the original floating point value doesn't match the
+            // integer version, we are also out of range, so create a
+            // saturated result.
+            if (isNeg) {
+                outOfRange |= val < result;
+                if (outOfRange) {
+                    finalVal = 1LL << (width-1);
+                }
+            } else {
+                outOfRange |= val > result;
+                if (outOfRange) {
+                    finalVal = mask(width-1);
+                }
+            }
+        }
+
+        // Raise an exception if the value was out of range
+        if (outOfRange) {
+            exceptions |= FeInvalid;
+            exceptions &= ~FeInexact;
+        }
+        setFPExceptions(exceptions);
+        return finalVal;
+    } else {
+        if ((double)val < 0) {
+            exceptions |= FeInvalid;
+            exceptions &= ~FeInexact;
+            setFPExceptions(exceptions);
+            return 0;
+        }
+
+        uint64_t result = ((uint64_t) val) & mask(width);
+        if (val > result) {
+            exceptions |= FeInvalid;
+            exceptions &= ~FeInexact;
+            setFPExceptions(exceptions);
+            return mask(width);
+        }
+
+        setFPExceptions(exceptions);
+        return result;
+    }
+};
+
+
 float vfpUFixedToFpS(bool flush, bool defaultNan,
-        uint32_t val, bool half, uint8_t imm);
+        uint64_t val, uint8_t width, uint8_t imm);
 float vfpSFixedToFpS(bool flush, bool defaultNan,
-        int32_t val, bool half, uint8_t imm);
+        int64_t val, uint8_t width, uint8_t imm);
 
-uint64_t vfpFpDToFixed(double val, bool isSigned, bool half,
-                       uint8_t imm, bool rzero = true);
 double vfpUFixedToFpD(bool flush, bool defaultNan,
-        uint32_t val, bool half, uint8_t imm);
+        uint64_t val, uint8_t width, uint8_t imm);
 double vfpSFixedToFpD(bool flush, bool defaultNan,
-        int32_t val, bool half, uint8_t imm);
+        int64_t val, uint8_t width, uint8_t imm);
 
 float fprSqrtEstimate(FPSCR &fpscr, float op);
 uint32_t unsignedRSqrtEstimate(uint32_t op);
@@ -292,6 +470,20 @@ class VfpMacroOp : public PredMacroOp
     void nextIdxs(IntRegIndex &dest);
 };
 
+template <typename T>
+static inline T
+fpAdd(T a, T b)
+{
+    return a + b;
+};
+
+template <typename T>
+static inline T
+fpSub(T a, T b)
+{
+    return a - b;
+};
+
 static inline float
 fpAddS(float a, float b)
 {
@@ -328,6 +520,54 @@ fpDivD(double a, double b)
     return a / b;
 }
 
+template <typename T>
+static inline T
+fpDiv(T a, T b)
+{
+    return a / b;
+};
+
+template <typename T>
+static inline T
+fpMulX(T a, T b)
+{
+    uint64_t opData;
+    uint32_t sign1;
+    uint32_t sign2;
+    const bool single = (sizeof(T) == sizeof(float));
+    if (single) {
+        opData = (fpToBits(a));
+        sign1 = opData>>31;
+        opData = (fpToBits(b));
+        sign2 = opData>>31;
+    } else {
+        opData = (fpToBits(a));
+        sign1 = opData>>63;
+        opData = (fpToBits(b));
+        sign2 = opData>>63;
+    }
+    bool inf1 = (std::fpclassify(a) == FP_INFINITE);
+    bool inf2 = (std::fpclassify(b) == FP_INFINITE);
+    bool zero1 = (std::fpclassify(a) == FP_ZERO);
+    bool zero2 = (std::fpclassify(b) == FP_ZERO);
+    if ((inf1 && zero2) || (zero1 && inf2)) {
+        if(sign1 ^ sign2)
+            return (T)(-2.0);
+        else
+            return (T)(2.0);
+    } else {
+        return (a * b);
+    }
+};
+
+
+template <typename T>
+static inline T
+fpMul(T a, T b)
+{
+    return a * b;
+};
+
 static inline float
 fpMulS(float a, float b)
 {
@@ -340,23 +580,140 @@ fpMulD(double a, double b)
     return a * b;
 }
 
-static inline float
-fpMaxS(float a, float b)
+template <typename T>
+static inline T
+// @todo remove this when all calls to it have been replaced with the new fplib implementation
+fpMulAdd(T op1, T op2, T addend)
+{
+    T result;
+
+    if (sizeof(T) == sizeof(float))
+        result = fmaf(op1, op2, addend);
+    else
+        result = fma(op1, op2, addend);
+
+    // ARM doesn't generate signed NaNs from this operation, so fix up the result
+    if (std::isnan(result) && !std::isnan(op1) &&
+        !std::isnan(op2) && !std::isnan(addend))
+    {
+        uint64_t bitMask = ULL(0x1) << ((sizeof(T) * 8) - 1);
+        result = bitsToFp(fpToBits(result) & ~bitMask, op1);
+    }
+    return result;
+}
+
+template <typename T>
+static inline T
+fpRIntX(T a, FPSCR &fpscr)
+{
+    T rVal;
+
+    rVal = rint(a);
+    if (rVal != a && !std::isnan(a))
+        fpscr.ixc = 1;
+    return (rVal);
+};
+
+template <typename T>
+static inline T
+fpMaxNum(T a, T b)
 {
+    const bool single = (sizeof(T) == sizeof(float));
+    const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
+
+    if (std::isnan(a))
+        return ((fpToBits(a) & qnan) == qnan) ? b : a;
+    if (std::isnan(b))
+        return ((fpToBits(b) & qnan) == qnan) ? a : b;
     // Handle comparisons of +0 and -0.
     if (!std::signbit(a) && std::signbit(b))
         return a;
-    return fmaxf(a, b);
-}
+    return fmax(a, b);
+};
 
-static inline float
-fpMinS(float a, float b)
+template <typename T>
+static inline T
+fpMax(T a, T b)
 {
+    if (std::isnan(a))
+        return a;
+    if (std::isnan(b))
+        return b;
+    return fpMaxNum<T>(a, b);
+};
+
+template <typename T>
+static inline T
+fpMinNum(T a, T b)
+{
+    const bool single = (sizeof(T) == sizeof(float));
+    const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
+
+    if (std::isnan(a))
+        return ((fpToBits(a) & qnan) == qnan) ? b : a;
+    if (std::isnan(b))
+        return ((fpToBits(b) & qnan) == qnan) ? a : b;
     // Handle comparisons of +0 and -0.
     if (std::signbit(a) && !std::signbit(b))
         return a;
-    return fminf(a, b);
-}
+    return fmin(a, b);
+};
+
+template <typename T>
+static inline T
+fpMin(T a, T b)
+{
+    if (std::isnan(a))
+        return a;
+    if (std::isnan(b))
+        return b;
+    return fpMinNum<T>(a, b);
+};
+
+template <typename T>
+static inline T
+fpRSqrts(T a, T b)
+{
+    int fpClassA = std::fpclassify(a);
+    int fpClassB = std::fpclassify(b);
+    T aXb;
+    int fpClassAxB;
+
+    if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) ||
+        (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) {
+        return 1.5;
+    }
+    aXb = a*b;
+    fpClassAxB = std::fpclassify(aXb);
+    if(fpClassAxB == FP_SUBNORMAL) {
+        feraiseexcept(FeUnderflow);
+        return 1.5;
+    }
+    return (3.0 - (a * b)) / 2.0;
+};
+
+template <typename T>
+static inline T
+fpRecps(T a, T b)
+{
+    int fpClassA = std::fpclassify(a);
+    int fpClassB = std::fpclassify(b);
+    T aXb;
+    int fpClassAxB;
+
+    if ((fpClassA == FP_ZERO && fpClassB == FP_INFINITE) ||
+        (fpClassA == FP_INFINITE && fpClassB == FP_ZERO)) {
+        return 2.0;
+    }
+    aXb = a*b;
+    fpClassAxB = std::fpclassify(aXb);
+    if(fpClassAxB == FP_SUBNORMAL) {
+        feraiseexcept(FeUnderflow);
+        return 2.0;
+    }
+    return 2.0 - (a * b);
+};
+
 
 static inline float
 fpRSqrtsS(float a, float b)
@@ -400,6 +757,23 @@ fpRecpsS(float a, float b)
     return 2.0 - (a * b);
 }
 
+template <typename T>
+static inline T
+roundNEven(T a) {
+    T val;
+
+    val = round(a);
+    if (a - val == 0.5) {
+        if ( (((int) a) & 1) == 0 ) val += 1.0;
+    }
+    else if (a - val == -0.5) {
+        if ( (((int) a) & 1) == 0 ) val -= 1.0;
+    }
+    return val;
+}
+
+
+
 class FpOp : public PredOp
 {
   protected:
@@ -455,6 +829,12 @@ class FpOp : public PredOp
     processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                 fpType op1, fpType op2) const;
 
+    template <class fpType>
+    fpType
+    ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
+              fpType (*func)(fpType, fpType, fpType),
+              bool flush, bool defaultNan, uint32_t rMode) const;
+
     template <class fpType>
     fpType
     binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
@@ -478,6 +858,55 @@ class FpOp : public PredOp
             pcState.advance();
         }
     }
+
+    float
+    fpSqrt (FPSCR fpscr,float x) const
+    {
+
+        return unaryOp(fpscr,x,sqrtf,fpscr.fz,fpscr.rMode);
+
+    }
+
+    double
+    fpSqrt (FPSCR fpscr,double x) const
+    {
+
+        return unaryOp(fpscr,x,sqrt,fpscr.fz,fpscr.rMode);
+
+    }
+};
+
+class FpCondCompRegOp : public FpOp
+{
+  protected:
+    IntRegIndex op1, op2;
+    ConditionCode condCode;
+    uint8_t defCc;
+
+    FpCondCompRegOp(const char *mnem, ExtMachInst _machInst,
+                    OpClass __opClass, IntRegIndex _op1, IntRegIndex _op2,
+                    ConditionCode _condCode, uint8_t _defCc) :
+        FpOp(mnem, _machInst, __opClass),
+        op1(_op1), op2(_op2), condCode(_condCode), defCc(_defCc)
+    {}
+
+    std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
+class FpCondSelOp : public FpOp
+{
+  protected:
+    IntRegIndex dest, op1, op2;
+    ConditionCode condCode;
+
+    FpCondSelOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+                ConditionCode _condCode) :
+        FpOp(mnem, _machInst, __opClass),
+        dest(_dest), op1(_op1), op2(_op2), condCode(_condCode)
+    {}
+
+    std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
 };
 
 class FpRegRegOp : public FpOp
@@ -550,6 +979,26 @@ class FpRegRegRegOp : public FpOp
     std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
 };
 
+class FpRegRegRegRegOp : public FpOp
+{
+  protected:
+    IntRegIndex dest;
+    IntRegIndex op1;
+    IntRegIndex op2;
+    IntRegIndex op3;
+
+    FpRegRegRegRegOp(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                     IntRegIndex _dest, IntRegIndex _op1, IntRegIndex _op2,
+                     IntRegIndex _op3, VfpMicroMode mode = VfpNotAMicroop) :
+        FpOp(mnem, _machInst, __opClass), dest(_dest), op1(_op1), op2(_op2),
+        op3(_op3)
+    {
+        setVfpMicroFlags(mode, flags);
+    }
+
+    std::string generateDisassembly(Addr pc, const SymbolTable *symtab) const;
+};
+
 class FpRegRegRegImmOp : public FpOp
 {
   protected:
-- 
cgit v1.2.3
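
The VfpRoundAway case in vfpFpToFixed() above emulates ARM's round-to-nearest,
ties-away-from-zero mode on hosts whose FPU only exposes the four C99 rounding
modes: round toward minus infinity first, then nudge the result up whenever the
discarded fraction reaches one half. A minimal self-contained sketch of that
fixup (the helper name roundAway and the test values are invented for
illustration, not part of the patch):

    // Illustrative only: round-to-nearest, ties away from zero, built
    // from FE_DOWNWARD plus a correction, as the VfpRoundAway path does.
    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    static double roundAway(double orig)
    {
        std::fesetround(FE_DOWNWARD);
        double val = std::rint(orig);   // round toward minus infinity
        double error = -(val - orig);   // discarded fraction, in [0, 1)
        if (error > 0.5 || (error == 0.5 && val >= 0))
            val += 1.0;                 // push ties away from zero
        return val;
    }

    int main()
    {
        // Halfway cases move away from zero: prints 1 -1 3 -3.
        std::printf("%g %g %g %g\n", roundAway(0.5), roundAway(-0.5),
                    roundAway(2.5), roundAway(-2.5));
        return 0;
    }

The (error == 0.5 && val >= 0) guard leaves negative halfway cases on the
rounded-down side, which for them is already away from zero.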
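fpMulX() above implements the extra rule that AArch64's FMULX adds to an
ordinary multiply: infinity times zero yields +/-2.0, with the sign taken from
the XOR of the operand signs, instead of the IEEE default NaN. A self-contained
sketch of the same rule, substituting std::isinf and std::signbit for the
patch's explicit sign-bit extraction (the name fmulxLike is made up for this
sketch):

    // Illustrative only: the FMULX special case implemented by fpMulX().
    #include <cmath>
    #include <cstdio>

    template <typename T>
    static T fmulxLike(T a, T b)
    {
        bool infZero = (std::isinf(a) && b == T(0)) ||
                       (a == T(0) && std::isinf(b));
        if (infZero) // IEEE multiply would give NaN; FMULX defines +/-2.0
            return (std::signbit(a) != std::signbit(b)) ? T(-2.0) : T(2.0);
        return a * b;
    }

    int main()
    {
        double inf = INFINITY;
        std::printf("%g %g %g\n",
                    fmulxLike(inf, 0.0),    // 2
                    fmulxLike(-inf, 0.0),   // -2
                    fmulxLike(3.0, 4.0));   // 12
        return 0;
    }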
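roundNEven() above builds round-to-nearest-even out of round(), whose ties go
away from zero: exact halfway cases are nudged back to the even neighbour
using the parity of the truncated input. The same tie-breaking logic as a
runnable check (the test harness is invented for illustration):

    // Illustrative only: the roundNEven() tie-breaking logic.
    #include <cmath>
    #include <cassert>

    static double roundNEven(double a)
    {
        double val = std::round(a);   // ties away from zero
        if (a - val == 0.5) {         // negative tie, rounded down by round()
            if ((((int) a) & 1) == 0) val += 1.0;
        } else if (a - val == -0.5) { // positive tie, rounded up by round()
            if ((((int) a) & 1) == 0) val -= 1.0;
        }
        return val;
    }

    int main()
    {
        assert(roundNEven(0.5) == 0.0);    // down to even 0
        assert(roundNEven(1.5) == 2.0);    // up to even 2
        assert(roundNEven(2.5) == 2.0);    // down to even 2
        assert(roundNEven(-0.5) == 0.0);   // up to even 0
        return 0;
    }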