diff options
author | ARM gem5 Developers <none@none> | 2014-01-24 15:29:34 -0600 |
---|---|---|
committer | ARM gem5 Developers <none@none> | 2014-01-24 15:29:34 -0600 |
commit | 612f8f074fa1099cf70faf495d46cc647762a031 (patch) | |
tree | bd1e99c43bf15292395eadd4b7ae3f5c823545c3 /src/arch/arm/insts/vfp.cc | |
parent | f3585c841e964c98911784a187fc4f081a02a0a6 (diff) | |
download | gem5-612f8f074fa1099cf70faf495d46cc647762a031.tar.xz |
arm: Add support for ARMv8 (AArch64 & AArch32)
Note: AArch64 and AArch32 interworking is not supported. If you use an AArch64
kernel you are restricted to AArch64 user-mode binaries. This will be addressed
in a later patch.
Note: Virtualization is only supported in AArch32 mode. This will also be fixed
in a later patch.
Contributors:
Giacomo Gabrielli (TrustZone, LPAE, system-level AArch64, AArch64 NEON, validation)
Thomas Grocutt (AArch32 Virtualization, AArch64 FP, validation)
Mbou Eyole (AArch64 NEON, validation)
Ali Saidi (AArch64 Linux support, code integration, validation)
Edmund Grimley-Evans (AArch64 FP)
William Wang (AArch64 Linux support)
Rene De Jong (AArch64 Linux support, performance opt.)
Matt Horsnell (AArch64 MP, validation)
Matt Evans (device models, code integration, validation)
Chris Adeniyi-Jones (AArch64 syscall-emulation)
Prakash Ramrakhyani (validation)
Dam Sunwoo (validation)
Chander Sudanthi (validation)
Stephan Diestelhorst (validation)
Andreas Hansson (code integration, performance opt.)
Eric Van Hensbergen (performance opt.)
Gabe Black
Diffstat (limited to 'src/arch/arm/insts/vfp.cc')
-rw-r--r-- | src/arch/arm/insts/vfp.cc | 484 |
1 files changed, 253 insertions, 231 deletions
diff --git a/src/arch/arm/insts/vfp.cc b/src/arch/arm/insts/vfp.cc index ca0f58226..03fdc83fa 100644 --- a/src/arch/arm/insts/vfp.cc +++ b/src/arch/arm/insts/vfp.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 ARM Limited + * Copyright (c) 2010-2013 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -46,6 +46,37 @@ */ std::string +FpCondCompRegOp::generateDisassembly( + Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss, "", false); + printReg(ss, op1); + ccprintf(ss, ", "); + printReg(ss, op2); + ccprintf(ss, ", #%d", defCc); + ccprintf(ss, ", "); + printCondition(ss, condCode, true); + return ss.str(); +} + +std::string +FpCondSelOp::generateDisassembly( + Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss, "", false); + printReg(ss, dest); + ccprintf(ss, ", "); + printReg(ss, op1); + ccprintf(ss, ", "); + printReg(ss, op2); + ccprintf(ss, ", "); + printCondition(ss, condCode, true); + return ss.str(); +} + +std::string FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; @@ -92,6 +123,21 @@ FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const } std::string +FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const +{ + std::stringstream ss; + printMnemonic(ss); + printReg(ss, dest + FP_Reg_Base); + ss << ", "; + printReg(ss, op1 + FP_Reg_Base); + ss << ", "; + printReg(ss, op2 + FP_Reg_Base); + ss << ", "; + printReg(ss, op3 + FP_Reg_Base); + return ss.str(); +} + +std::string FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; @@ -131,24 +177,25 @@ prepFpState(uint32_t rMode) } void -finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush) +finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask) { int exceptions = fetestexcept(FeAllExceptions); bool underflow = false; - if (exceptions & FeInvalid) { + if ((exceptions & FeInvalid) && mask.ioc) { fpscr.ioc = 1; } - if (exceptions & FeDivByZero) { + if ((exceptions & FeDivByZero) && mask.dzc) { fpscr.dzc = 1; } - if (exceptions & FeOverflow) { + if ((exceptions & FeOverflow) && mask.ofc) { fpscr.ofc = 1; } if (exceptions & FeUnderflow) { underflow = true; - fpscr.ufc = 1; + if (mask.ufc) + fpscr.ufc = 1; } - if ((exceptions & FeInexact) && !(underflow && flush)) { + if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) { fpscr.ixc = 1; } fesetround(state); @@ -329,19 +376,33 @@ fixFpSFpDDest(FPSCR fpscr, float val) return mid; } -uint16_t -vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, - uint32_t rMode, bool ahp, float op) +static inline uint16_t +vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan, + uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble) { - uint32_t opBits = fpToBits(op); + uint32_t mWidth; + uint32_t eWidth; + uint32_t eHalfRange; + uint32_t sBitPos; + + if (isDouble) { + mWidth = 52; + eWidth = 11; + } else { + mWidth = 23; + eWidth = 8; + } + sBitPos = eWidth + mWidth; + eHalfRange = (1 << (eWidth-1)) - 1; + // Extract the operand. - bool neg = bits(opBits, 31); - uint32_t exponent = bits(opBits, 30, 23); - uint32_t oldMantissa = bits(opBits, 22, 0); - uint32_t mantissa = oldMantissa >> (23 - 10); + bool neg = bits(opBits, sBitPos); + uint32_t exponent = bits(opBits, sBitPos-1, mWidth); + uint64_t oldMantissa = bits(opBits, mWidth-1, 0); + uint32_t mantissa = oldMantissa >> (mWidth - 10); // Do the conversion. - uint32_t extra = oldMantissa & mask(23 - 10); - if (exponent == 0xff) { + uint64_t extra = oldMantissa & mask(mWidth - 10); + if (exponent == mask(eWidth)) { if (oldMantissa != 0) { // Nans. if (bits(mantissa, 9) == 0) { @@ -379,7 +440,6 @@ vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, if (exponent == 0) { // Denormalized. - // If flush to zero is on, this shouldn't happen. assert(!flush); @@ -407,13 +467,13 @@ vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, // We need to track the dropped bits differently since // more can be dropped by denormalizing. - bool topOne = bits(extra, 12); - bool restZeros = bits(extra, 11, 0) == 0; + bool topOne = bits(extra, mWidth - 10 - 1); + bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0; - if (exponent <= (127 - 15)) { + if (exponent <= (eHalfRange - 15)) { // The result is too small. Denormalize. mantissa |= (1 << 10); - while (mantissa && exponent <= (127 - 15)) { + while (mantissa && exponent <= (eHalfRange - 15)) { restZeros = restZeros && !topOne; topOne = bits(mantissa, 0); mantissa = mantissa >> 1; @@ -424,7 +484,7 @@ vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, exponent = 0; } else { // Change bias. - exponent -= (127 - 15); + exponent -= (eHalfRange - 15); } if (exponent == 0 && (inexact || fpscr.ufe)) { @@ -488,155 +548,115 @@ vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, return result; } -float -vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) +uint16_t +vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan, + uint32_t rMode, bool ahp, float op) { - float junk = 0.0; + uint64_t opBits = fpToBits(op); + return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false); +} + +uint16_t +vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan, + uint32_t rMode, bool ahp, double op) +{ + uint64_t opBits = fpToBits(op); + return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true); +} + +static inline uint64_t +vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble) +{ + uint32_t mWidth; + uint32_t eWidth; + uint32_t eHalfRange; + uint32_t sBitPos; + + if (isDouble) { + mWidth = 52; + eWidth = 11; + } else { + mWidth = 23; + eWidth = 8; + } + sBitPos = eWidth + mWidth; + eHalfRange = (1 << (eWidth-1)) - 1; + // Extract the bitfields. bool neg = bits(op, 15); uint32_t exponent = bits(op, 14, 10); - uint32_t mantissa = bits(op, 9, 0); + uint64_t mantissa = bits(op, 9, 0); // Do the conversion. if (exponent == 0) { if (mantissa != 0) { // Normalize the value. - exponent = exponent + (127 - 15) + 1; + exponent = exponent + (eHalfRange - 15) + 1; while (mantissa < (1 << 10)) { mantissa = mantissa << 1; exponent--; } } - mantissa = mantissa << (23 - 10); + mantissa = mantissa << (mWidth - 10); } else if (exponent == 0x1f && !ahp) { // Infinities and nans. - exponent = 0xff; + exponent = mask(eWidth); if (mantissa != 0) { // Nans. - mantissa = mantissa << (23 - 10); - if (bits(mantissa, 22) == 0) { + mantissa = mantissa << (mWidth - 10); + if (bits(mantissa, mWidth-1) == 0) { // Signalling nan. fpscr.ioc = 1; - mantissa |= (1 << 22); + mantissa |= (((uint64_t) 1) << (mWidth-1)); } if (defaultNan) { - mantissa &= ~mask(22); + mantissa &= ~mask(mWidth-1); neg = false; } } } else { - exponent = exponent + (127 - 15); - mantissa = mantissa << (23 - 10); + exponent = exponent + (eHalfRange - 15); + mantissa = mantissa << (mWidth - 10); } // Reassemble the result. - uint32_t result = bits(mantissa, 22, 0); - replaceBits(result, 30, 23, exponent); - if (neg) - result |= (1 << 31); + uint64_t result = bits(mantissa, mWidth-1, 0); + replaceBits(result, sBitPos-1, mWidth, exponent); + if (neg) { + result |= (((uint64_t) 1) << sBitPos); + } + return result; +} + +double +vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) +{ + double junk = 0.0; + uint64_t result; + + result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true); return bitsToFp(result, junk); } -uint64_t -vfpFpSToFixed(float val, bool isSigned, bool half, - uint8_t imm, bool rzero) +float +vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) { - int rmode = rzero ? FeRoundZero : fegetround(); - __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode)); - fesetround(FeRoundNearest); - val = val * powf(2.0, imm); - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - fesetround(rmode); - feclearexcept(FeAllExceptions); - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - float origVal = val; - val = rintf(val); - int fpType = std::fpclassify(val); - if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { - if (fpType == FP_NAN) { - feraiseexcept(FeInvalid); - } - val = 0.0; - } else if (origVal != val) { - switch (rmode) { - case FeRoundNearest: - if (origVal - val > 0.5) - val += 1.0; - else if (val - origVal > 0.5) - val -= 1.0; - break; - case FeRoundDown: - if (origVal < val) - val -= 1.0; - break; - case FeRoundUpward: - if (origVal > val) - val += 1.0; - break; - } - feraiseexcept(FeInexact); - } + float junk = 0.0; + uint64_t result; - if (isSigned) { - if (half) { - if ((double)val < (int16_t)(1 << 15)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int16_t)(1 << 15); - } - if ((double)val > (int16_t)mask(15)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int16_t)mask(15); - } - return (int16_t)val; - } else { - if ((double)val < (int32_t)(1 << 31)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int32_t)(1 << 31); - } - if ((double)val > (int32_t)mask(31)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int32_t)mask(31); - } - return (int32_t)val; - } - } else { - if (half) { - if ((double)val < 0) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return 0; - } - if ((double)val > (mask(16))) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return mask(16); - } - return (uint16_t)val; - } else { - if ((double)val < 0) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return 0; - } - if ((double)val > (mask(32))) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return mask(32); - } - return (uint32_t)val; - } - } + result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false); + return bitsToFp(result, junk); } float vfpUFixedToFpS(bool flush, bool defaultNan, - uint32_t val, bool half, uint8_t imm) + uint64_t val, uint8_t width, uint8_t imm) { fesetround(FeRoundNearest); - if (half) + if (width == 16) val = (uint16_t)val; + else if (width == 32) + val = (uint32_t)val; + else if (width != 64) + panic("Unsupported width %d", width); float scale = powf(2.0, imm); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); @@ -646,11 +666,16 @@ vfpUFixedToFpS(bool flush, bool defaultNan, float vfpSFixedToFpS(bool flush, bool defaultNan, - int32_t val, bool half, uint8_t imm) + int64_t val, uint8_t width, uint8_t imm) { fesetround(FeRoundNearest); - if (half) + if (width == 16) val = sext<16>(val & mask(16)); + else if (width == 32) + val = sext<32>(val & mask(32)); + else if (width != 64) + panic("Unsupported width %d", width); + float scale = powf(2.0, imm); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); @@ -658,106 +683,19 @@ vfpSFixedToFpS(bool flush, bool defaultNan, return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); } -uint64_t -vfpFpDToFixed(double val, bool isSigned, bool half, - uint8_t imm, bool rzero) -{ - int rmode = rzero ? FeRoundZero : fegetround(); - fesetround(FeRoundNearest); - val = val * pow(2.0, imm); - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - fesetround(rmode); - feclearexcept(FeAllExceptions); - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - double origVal = val; - val = rint(val); - int fpType = std::fpclassify(val); - if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { - if (fpType == FP_NAN) { - feraiseexcept(FeInvalid); - } - val = 0.0; - } else if (origVal != val) { - switch (rmode) { - case FeRoundNearest: - if (origVal - val > 0.5) - val += 1.0; - else if (val - origVal > 0.5) - val -= 1.0; - break; - case FeRoundDown: - if (origVal < val) - val -= 1.0; - break; - case FeRoundUpward: - if (origVal > val) - val += 1.0; - break; - } - feraiseexcept(FeInexact); - } - if (isSigned) { - if (half) { - if (val < (int16_t)(1 << 15)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int16_t)(1 << 15); - } - if (val > (int16_t)mask(15)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int16_t)mask(15); - } - return (int16_t)val; - } else { - if (val < (int32_t)(1 << 31)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int32_t)(1 << 31); - } - if (val > (int32_t)mask(31)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int32_t)mask(31); - } - return (int32_t)val; - } - } else { - if (half) { - if (val < 0) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return 0; - } - if (val > mask(16)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return mask(16); - } - return (uint16_t)val; - } else { - if (val < 0) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return 0; - } - if (val > mask(32)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return mask(32); - } - return (uint32_t)val; - } - } -} double vfpUFixedToFpD(bool flush, bool defaultNan, - uint32_t val, bool half, uint8_t imm) + uint64_t val, uint8_t width, uint8_t imm) { fesetround(FeRoundNearest); - if (half) + if (width == 16) val = (uint16_t)val; + else if (width == 32) + val = (uint32_t)val; + else if (width != 64) + panic("Unsupported width %d", width); + double scale = pow(2.0, imm); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); @@ -767,11 +705,16 @@ vfpUFixedToFpD(bool flush, bool defaultNan, double vfpSFixedToFpD(bool flush, bool defaultNan, - int32_t val, bool half, uint8_t imm) + int64_t val, uint8_t width, uint8_t imm) { fesetround(FeRoundNearest); - if (half) + if (width == 16) val = sext<16>(val & mask(16)); + else if (width == 32) + val = sext<32>(val & mask(32)); + else if (width != 64) + panic("Unsupported width %d", width); + double scale = pow(2.0, imm); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); @@ -976,6 +919,85 @@ template double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, double op1, double op2) const; +// @TODO remove this function when we've finished switching all FMA code to use the new FPLIB +template <class fpType> +fpType +FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3, + fpType (*func)(fpType, fpType, fpType), + bool flush, bool defaultNan, uint32_t rMode) const +{ + const bool single = (sizeof(fpType) == sizeof(float)); + fpType junk = 0.0; + + if (flush && (flushToZero(op1, op2) || flushToZero(op3))) + fpscr.idc = 1; + VfpSavedState state = prepFpState(rMode); + __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state) + : "m" (op1), "m" (op2), "m" (op3), "m" (state)); + fpType dest = func(op1, op2, op3); + __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); + + int fpClass = std::fpclassify(dest); + // Get NAN behavior right. This varies between x86 and ARM. + if (fpClass == FP_NAN) { + const uint64_t qnan = + single ? 0x7fc00000 : ULL(0x7ff8000000000000); + const bool nan1 = std::isnan(op1); + const bool nan2 = std::isnan(op2); + const bool nan3 = std::isnan(op3); + const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); + const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); + const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan); + if ((!nan1 && !nan2 && !nan3) || (defaultNan == 1)) { + dest = bitsToFp(qnan, junk); + } else if (signal1) { + dest = bitsToFp(fpToBits(op1) | qnan, junk); + } else if (signal2) { + dest = bitsToFp(fpToBits(op2) | qnan, junk); + } else if (signal3) { + dest = bitsToFp(fpToBits(op3) | qnan, junk); + } else if (nan1) { + dest = op1; + } else if (nan2) { + dest = op2; + } else if (nan3) { + dest = op3; + } + } else if (flush && flushToZero(dest)) { + feraiseexcept(FeUnderflow); + } else if (( + (single && (dest == bitsToFp(0x00800000, junk) || + dest == bitsToFp(0x80800000, junk))) || + (!single && + (dest == bitsToFp(ULL(0x0010000000000000), junk) || + dest == bitsToFp(ULL(0x8010000000000000), junk))) + ) && rMode != VfpRoundZero) { + /* + * Correct for the fact that underflow is detected -before- rounding + * in ARM and -after- rounding in x86. + */ + fesetround(FeRoundZero); + __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3) + : "m" (op1), "m" (op2), "m" (op3)); + fpType temp = func(op1, op2, op2); + __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); + if (flush && flushToZero(temp)) { + dest = temp; + } + } + finishVfp(fpscr, state, flush); + return dest; +} + +template +float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3, + float (*func)(float, float, float), + bool flush, bool defaultNan, uint32_t rMode) const; +template +double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3, + double (*func)(double, double, double), + bool flush, bool defaultNan, uint32_t rMode) const; + template <class fpType> fpType FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, |