diff options
-rw-r--r-- | src/arch/arm/insts/vfp.cc | 856 | ||||
-rw-r--r-- | src/arch/arm/insts/vfp.hh | 822 |
2 files changed, 879 insertions, 799 deletions
diff --git a/src/arch/arm/insts/vfp.cc b/src/arch/arm/insts/vfp.cc index a87d57925..163779715 100644 --- a/src/arch/arm/insts/vfp.cc +++ b/src/arch/arm/insts/vfp.cc @@ -84,3 +84,859 @@ FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const printReg(ss, op2 + FP_Base_DepTag); return ss.str(); } + +namespace ArmISA +{ + +VfpSavedState +prepFpState(uint32_t rMode) +{ + int roundingMode = fegetround(); + feclearexcept(FeAllExceptions); + switch (rMode) { + case VfpRoundNearest: + fesetround(FeRoundNearest); + break; + case VfpRoundUpward: + fesetround(FeRoundUpward); + break; + case VfpRoundDown: + fesetround(FeRoundDown); + break; + case VfpRoundZero: + fesetround(FeRoundZero); + break; + } + return roundingMode; +} + +void +finishVfp(FPSCR &fpscr, VfpSavedState state) +{ + int exceptions = fetestexcept(FeAllExceptions); + bool underflow = false; + if (exceptions & FeInvalid) { + fpscr.ioc = 1; + } + if (exceptions & FeDivByZero) { + fpscr.dzc = 1; + } + if (exceptions & FeOverflow) { + fpscr.ofc = 1; + } + if (exceptions & FeUnderflow) { + underflow = true; + fpscr.ufc = 1; + } + if ((exceptions & FeInexact) && !(underflow && fpscr.fz)) { + fpscr.ixc = 1; + } + fesetround(state); +} + +template <class fpType> +fpType +fixDest(FPSCR fpscr, fpType val, fpType op1) +{ + int fpClass = std::fpclassify(val); + fpType junk = 0.0; + if (fpClass == FP_NAN) { + const bool single = (sizeof(val) == sizeof(float)); + const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000); + const bool nan = std::isnan(op1); + if (!nan || (fpscr.dn == 1)) { + val = bitsToFp(qnan, junk); + } else if (nan) { + val = bitsToFp(fpToBits(op1) | qnan, junk); + } + } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { + // Turn val into a zero with the correct sign; + uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); + val = bitsToFp(fpToBits(val) & bitMask, junk); + feclearexcept(FeInexact); + feraiseexcept(FeUnderflow); + } + return val; +} + +template +float fixDest<float>(FPSCR fpscr, float val, float op1); +template +double fixDest<double>(FPSCR fpscr, double val, double op1); + +template <class fpType> +fpType +fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) +{ + int fpClass = std::fpclassify(val); + fpType junk = 0.0; + if (fpClass == FP_NAN) { + const bool single = (sizeof(val) == sizeof(float)); + const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000); + const bool nan1 = std::isnan(op1); + const bool nan2 = std::isnan(op2); + const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); + const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); + if ((!nan1 && !nan2) || (fpscr.dn == 1)) { + val = bitsToFp(qnan, junk); + } else if (signal1) { + val = bitsToFp(fpToBits(op1) | qnan, junk); + } else if (signal2) { + val = bitsToFp(fpToBits(op2) | qnan, junk); + } else if (nan1) { + val = op1; + } else if (nan2) { + val = op2; + } + } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { + // Turn val into a zero with the correct sign; + uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); + val = bitsToFp(fpToBits(val) & bitMask, junk); + feclearexcept(FeInexact); + feraiseexcept(FeUnderflow); + } + return val; +} + +template +float fixDest<float>(FPSCR fpscr, float val, float op1, float op2); +template +double fixDest<double>(FPSCR fpscr, double val, double op1, double op2); + +template <class fpType> +fpType +fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) +{ + fpType mid = fixDest(fpscr, val, op1, op2); + const bool single = (sizeof(fpType) == sizeof(float)); + const fpType junk = 0.0; + if ((single && (val == bitsToFp(0x00800000, junk) || + val == bitsToFp(0x80800000, junk))) || + (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) || + val == bitsToFp(ULL(0x8010000000000000), junk))) + ) { + __asm__ __volatile__("" : "=m" (op1) : "m" (op1)); + fesetround(FeRoundZero); + fpType temp = 0.0; + __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); + temp = op1 / op2; + if (flushToZero(temp)) { + feraiseexcept(FeUnderflow); + if (fpscr.fz) { + feclearexcept(FeInexact); + mid = temp; + } + } + __asm__ __volatile__("" :: "m" (temp)); + } + return mid; +} + +template +float fixDivDest<float>(FPSCR fpscr, float val, float op1, float op2); +template +double fixDivDest<double>(FPSCR fpscr, double val, double op1, double op2); + +float +fixFpDFpSDest(FPSCR fpscr, double val) +{ + const float junk = 0.0; + float op1 = 0.0; + if (std::isnan(val)) { + uint64_t valBits = fpToBits(val); + uint32_t op1Bits = bits(valBits, 50, 29) | + (mask(9) << 22) | + (bits(valBits, 63) << 31); + op1 = bitsToFp(op1Bits, junk); + } + float mid = fixDest(fpscr, (float)val, op1); + if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) == + (FeUnderflow | FeInexact)) { + feclearexcept(FeInexact); + } + if (mid == bitsToFp(0x00800000, junk) || + mid == bitsToFp(0x80800000, junk)) { + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + fesetround(FeRoundZero); + float temp = 0.0; + __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); + temp = val; + if (flushToZero(temp)) { + feraiseexcept(FeUnderflow); + if (fpscr.fz) { + feclearexcept(FeInexact); + mid = temp; + } + } + __asm__ __volatile__("" :: "m" (temp)); + } + return mid; +} + +double +fixFpSFpDDest(FPSCR fpscr, float val) +{ + const double junk = 0.0; + double op1 = 0.0; + if (std::isnan(val)) { + uint32_t valBits = fpToBits(val); + uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) | + (mask(12) << 51) | + ((uint64_t)bits(valBits, 31) << 63); + op1 = bitsToFp(op1Bits, junk); + } + double mid = fixDest(fpscr, (double)val, op1); + if (mid == bitsToFp(ULL(0x0010000000000000), junk) || + mid == bitsToFp(ULL(0x8010000000000000), junk)) { + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + fesetround(FeRoundZero); + double temp = 0.0; + __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); + temp = val; + if (flushToZero(temp)) { + feraiseexcept(FeUnderflow); + if (fpscr.fz) { + feclearexcept(FeInexact); + mid = temp; + } + } + __asm__ __volatile__("" :: "m" (temp)); + } + return mid; +} + +float +vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) +{ + float junk = 0.0; + uint32_t destBits = fpToBits(dest); + uint32_t opBits = fpToBits(op); + // Extract the operand. + bool neg = bits(opBits, 31); + uint32_t exponent = bits(opBits, 30, 23); + uint32_t oldMantissa = bits(opBits, 22, 0); + uint32_t mantissa = oldMantissa >> (23 - 10); + // Do the conversion. + uint32_t extra = oldMantissa & mask(23 - 10); + if (exponent == 0xff) { + if (oldMantissa != 0) { + // Nans. + if (bits(mantissa, 9) == 0) { + // Signalling nan. + fpscr.ioc = 1; + } + if (fpscr.ahp) { + mantissa = 0; + exponent = 0; + fpscr.ioc = 1; + } else if (fpscr.dn) { + mantissa = (1 << 9); + exponent = 0x1f; + neg = false; + } else { + exponent = 0x1f; + mantissa |= (1 << 9); + } + } else { + // Infinities. + exponent = 0x1F; + if (fpscr.ahp) { + fpscr.ioc = 1; + mantissa = 0x3ff; + } else { + mantissa = 0; + } + } + } else if (exponent == 0 && oldMantissa == 0) { + // Zero, don't need to do anything. + } else { + // Normalized or denormalized numbers. + + bool inexact = (extra != 0); + + if (exponent == 0) { + // Denormalized. + + // If flush to zero is on, this shouldn't happen. + assert(fpscr.fz == 0); + + // Check for underflow + if (inexact || fpscr.ufe) + fpscr.ufc = 1; + + // Handle rounding. + unsigned mode = fpscr.rMode; + if ((mode == VfpRoundUpward && !neg && extra) || + (mode == VfpRoundDown && neg && extra) || + (mode == VfpRoundNearest && + (extra > (1 << 9) || + (extra == (1 << 9) && bits(mantissa, 0))))) { + mantissa++; + } + + // See if the number became normalized after rounding. + if (mantissa == (1 << 10)) { + mantissa = 0; + exponent = 1; + } + } else { + // Normalized. + + // We need to track the dropped bits differently since + // more can be dropped by denormalizing. + bool topOne = bits(extra, 12); + bool restZeros = bits(extra, 11, 0) == 0; + + if (exponent <= (127 - 15)) { + // The result is too small. Denormalize. + mantissa |= (1 << 10); + while (mantissa && exponent <= (127 - 15)) { + restZeros = restZeros && !topOne; + topOne = bits(mantissa, 0); + mantissa = mantissa >> 1; + exponent++; + } + if (topOne || !restZeros) + inexact = true; + exponent = 0; + } else { + // Change bias. + exponent -= (127 - 15); + } + + if (exponent == 0 && (inexact || fpscr.ufe)) { + // Underflow + fpscr.ufc = 1; + } + + // Handle rounding. + unsigned mode = fpscr.rMode; + bool nonZero = topOne || !restZeros; + if ((mode == VfpRoundUpward && !neg && nonZero) || + (mode == VfpRoundDown && neg && nonZero) || + (mode == VfpRoundNearest && topOne && + (!restZeros || bits(mantissa, 0)))) { + mantissa++; + } + + // See if we rounded up and need to bump the exponent. + if (mantissa == (1 << 10)) { + mantissa = 0; + exponent++; + } + + // Deal with overflow + if (fpscr.ahp) { + if (exponent >= 0x20) { + exponent = 0x1f; + mantissa = 0x3ff; + fpscr.ioc = 1; + // Supress inexact exception. + inexact = false; + } + } else { + if (exponent >= 0x1f) { + if ((mode == VfpRoundNearest) || + (mode == VfpRoundUpward && !neg) || + (mode == VfpRoundDown && neg)) { + // Overflow to infinity. + exponent = 0x1f; + mantissa = 0; + } else { + // Overflow to max normal. + exponent = 0x1e; + mantissa = 0x3ff; + } + fpscr.ofc = 1; + inexact = true; + } + } + } + + if (inexact) { + fpscr.ixc = 1; + } + } + // Reassemble and install the result. + uint32_t result = bits(mantissa, 9, 0); + replaceBits(result, 14, 10, exponent); + if (neg) + result |= (1 << 15); + if (top) + replaceBits(destBits, 31, 16, result); + else + replaceBits(destBits, 15, 0, result); + return bitsToFp(destBits, junk); +} + +float +vcvtFpHFpS(FPSCR &fpscr, float op, bool top) +{ + float junk = 0.0; + uint32_t opBits = fpToBits(op); + // Extract the operand. + if (top) + opBits = bits(opBits, 31, 16); + else + opBits = bits(opBits, 15, 0); + // Extract the bitfields. + bool neg = bits(opBits, 15); + uint32_t exponent = bits(opBits, 14, 10); + uint32_t mantissa = bits(opBits, 9, 0); + // Do the conversion. + if (exponent == 0) { + if (mantissa != 0) { + // Normalize the value. + exponent = exponent + (127 - 15) + 1; + while (mantissa < (1 << 10)) { + mantissa = mantissa << 1; + exponent--; + } + } + mantissa = mantissa << (23 - 10); + } else if (exponent == 0x1f && !fpscr.ahp) { + // Infinities and nans. + exponent = 0xff; + if (mantissa != 0) { + // Nans. + mantissa = mantissa << (23 - 10); + if (bits(mantissa, 22) == 0) { + // Signalling nan. + fpscr.ioc = 1; + mantissa |= (1 << 22); + } + if (fpscr.dn) { + mantissa &= ~mask(22); + neg = false; + } + } + } else { + exponent = exponent + (127 - 15); + mantissa = mantissa << (23 - 10); + } + // Reassemble the result. + uint32_t result = bits(mantissa, 22, 0); + replaceBits(result, 30, 23, exponent); + if (neg) + result |= (1 << 31); + return bitsToFp(result, junk); +} + +uint64_t +vfpFpSToFixed(float val, bool isSigned, bool half, + uint8_t imm, bool rzero) +{ + int rmode = rzero ? FeRoundZero : fegetround(); + __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode)); + fesetround(FeRoundNearest); + val = val * powf(2.0, imm); + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + fesetround(rmode); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + float origVal = val; + val = rintf(val); + int fpType = std::fpclassify(val); + if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { + if (fpType == FP_NAN) { + feraiseexcept(FeInvalid); + } + val = 0.0; + } else if (origVal != val) { + switch (rmode) { + case FeRoundNearest: + if (origVal - val > 0.5) + val += 1.0; + else if (val - origVal > 0.5) + val -= 1.0; + break; + case FeRoundDown: + if (origVal < val) + val -= 1.0; + break; + case FeRoundUpward: + if (origVal > val) + val += 1.0; + break; + } + feraiseexcept(FeInexact); + } + + if (isSigned) { + if (half) { + if ((double)val < (int16_t)(1 << 15)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return (int16_t)(1 << 15); + } + if ((double)val > (int16_t)mask(15)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return (int16_t)mask(15); + } + return (int16_t)val; + } else { + if ((double)val < (int32_t)(1 << 31)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return (int32_t)(1 << 31); + } + if ((double)val > (int32_t)mask(31)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return (int32_t)mask(31); + } + return (int32_t)val; + } + } else { + if (half) { + if ((double)val < 0) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return 0; + } + if ((double)val > (mask(16))) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return mask(16); + } + return (uint16_t)val; + } else { + if ((double)val < 0) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return 0; + } + if ((double)val > (mask(32))) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return mask(32); + } + return (uint32_t)val; + } + } +} + +float +vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) +{ + fesetround(FeRoundNearest); + if (half) + val = (uint16_t)val; + float scale = powf(2.0, imm); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + return fixDivDest(fpscr, val / scale, (float)val, scale); +} + +float +vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm) +{ + fesetround(FeRoundNearest); + if (half) + val = sext<16>(val & mask(16)); + float scale = powf(2.0, imm); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + return fixDivDest(fpscr, val / scale, (float)val, scale); +} + +uint64_t +vfpFpDToFixed(double val, bool isSigned, bool half, + uint8_t imm, bool rzero) +{ + int rmode = rzero ? FeRoundZero : fegetround(); + fesetround(FeRoundNearest); + val = val * pow(2.0, imm); + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + fesetround(rmode); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + double origVal = val; + val = rint(val); + int fpType = std::fpclassify(val); + if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { + if (fpType == FP_NAN) { + feraiseexcept(FeInvalid); + } + val = 0.0; + } else if (origVal != val) { + switch (rmode) { + case FeRoundNearest: + if (origVal - val > 0.5) + val += 1.0; + else if (val - origVal > 0.5) + val -= 1.0; + break; + case FeRoundDown: + if (origVal < val) + val -= 1.0; + break; + case FeRoundUpward: + if (origVal > val) + val += 1.0; + break; + } + feraiseexcept(FeInexact); + } + if (isSigned) { + if (half) { + if (val < (int16_t)(1 << 15)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return (int16_t)(1 << 15); + } + if (val > (int16_t)mask(15)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return (int16_t)mask(15); + } + return (int16_t)val; + } else { + if (val < (int32_t)(1 << 31)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return (int32_t)(1 << 31); + } + if (val > (int32_t)mask(31)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return (int32_t)mask(31); + } + return (int32_t)val; + } + } else { + if (half) { + if (val < 0) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return 0; + } + if (val > mask(16)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return mask(16); + } + return (uint16_t)val; + } else { + if (val < 0) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return 0; + } + if (val > mask(32)) { + feraiseexcept(FeInvalid); + feclearexcept(FeInexact); + return mask(32); + } + return (uint32_t)val; + } + } +} + +double +vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) +{ + fesetround(FeRoundNearest); + if (half) + val = (uint16_t)val; + double scale = pow(2.0, imm); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + return fixDivDest(fpscr, val / scale, (double)val, scale); +} + +double +vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm) +{ + fesetround(FeRoundNearest); + if (half) + val = sext<16>(val & mask(16)); + double scale = pow(2.0, imm); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + return fixDivDest(fpscr, val / scale, (double)val, scale); +} + +template <class fpType> +fpType +FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, + fpType (*func)(fpType, fpType), + bool flush, uint32_t rMode) const +{ + const bool single = (sizeof(fpType) == sizeof(float)); + fpType junk = 0.0; + + if (flush && flushToZero(op1, op2)) + fpscr.idc = 1; + VfpSavedState state = prepFpState(rMode); + __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state) + : "m" (op1), "m" (op2), "m" (state)); + fpType dest = func(op1, op2); + __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); + + int fpClass = std::fpclassify(dest); + // Get NAN behavior right. This varies between x86 and ARM. + if (fpClass == FP_NAN) { + const bool single = (sizeof(fpType) == sizeof(float)); + const uint64_t qnan = + single ? 0x7fc00000 : ULL(0x7ff8000000000000); + const bool nan1 = std::isnan(op1); + const bool nan2 = std::isnan(op2); + const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); + const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); + if ((!nan1 && !nan2) || (fpscr.dn == 1)) { + dest = bitsToFp(qnan, junk); + } else if (signal1) { + dest = bitsToFp(fpToBits(op1) | qnan, junk); + } else if (signal2) { + dest = bitsToFp(fpToBits(op2) | qnan, junk); + } else if (nan1) { + dest = op1; + } else if (nan2) { + dest = op2; + } + } else if (flush && flushToZero(dest)) { + feraiseexcept(FeUnderflow); + } else if (( + (single && (dest == bitsToFp(0x00800000, junk) || + dest == bitsToFp(0x80800000, junk))) || + (!single && + (dest == bitsToFp(ULL(0x0010000000000000), junk) || + dest == bitsToFp(ULL(0x8010000000000000), junk))) + ) && rMode != VfpRoundZero) { + /* + * Correct for the fact that underflow is detected -before- rounding + * in ARM and -after- rounding in x86. + */ + fesetround(FeRoundZero); + __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2) + : "m" (op1), "m" (op2)); + fpType temp = func(op1, op2); + __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); + if (flush && flushToZero(temp)) { + dest = temp; + } + } + finishVfp(fpscr, state); + return dest; +} + +template +float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2, + float (*func)(float, float), + bool flush, uint32_t rMode) const; +template +double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2, + double (*func)(double, double), + bool flush, uint32_t rMode) const; + +template <class fpType> +fpType +FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType), + bool flush, uint32_t rMode) const +{ + const bool single = (sizeof(fpType) == sizeof(float)); + fpType junk = 0.0; + + if (flush && flushToZero(op1)) + fpscr.idc = 1; + VfpSavedState state = prepFpState(rMode); + __asm__ __volatile__ ("" : "=m" (op1), "=m" (state) + : "m" (op1), "m" (state)); + fpType dest = func(op1); + __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); + + int fpClass = std::fpclassify(dest); + // Get NAN behavior right. This varies between x86 and ARM. + if (fpClass == FP_NAN) { + const bool single = (sizeof(fpType) == sizeof(float)); + const uint64_t qnan = + single ? 0x7fc00000 : ULL(0x7ff8000000000000); + const bool nan = std::isnan(op1); + if (!nan || fpscr.dn == 1) { + dest = bitsToFp(qnan, junk); + } else if (nan) { + dest = bitsToFp(fpToBits(op1) | qnan, junk); + } + } else if (flush && flushToZero(dest)) { + feraiseexcept(FeUnderflow); + } else if (( + (single && (dest == bitsToFp(0x00800000, junk) || + dest == bitsToFp(0x80800000, junk))) || + (!single && + (dest == bitsToFp(ULL(0x0010000000000000), junk) || + dest == bitsToFp(ULL(0x8010000000000000), junk))) + ) && rMode != VfpRoundZero) { + /* + * Correct for the fact that underflow is detected -before- rounding + * in ARM and -after- rounding in x86. + */ + fesetround(FeRoundZero); + __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1)); + fpType temp = func(op1); + __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); + if (flush && flushToZero(temp)) { + dest = temp; + } + } + finishVfp(fpscr, state); + return dest; +} + +template +float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float), + bool flush, uint32_t rMode) const; +template +double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double), + bool flush, uint32_t rMode) const; + +IntRegIndex +VfpMacroOp::addStride(IntRegIndex idx, unsigned stride) +{ + if (wide) { + stride *= 2; + } + unsigned offset = idx % 8; + idx = (IntRegIndex)(idx - offset); + offset += stride; + idx = (IntRegIndex)(idx + (offset % 8)); + return idx; +} + +void +VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2) +{ + unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; + assert(!inScalarBank(dest)); + dest = addStride(dest, stride); + op1 = addStride(op1, stride); + if (!inScalarBank(op2)) { + op2 = addStride(op2, stride); + } +} + +void +VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1) +{ + unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; + assert(!inScalarBank(dest)); + dest = addStride(dest, stride); + if (!inScalarBank(op1)) { + op1 = addStride(op1, stride); + } +} + +void +VfpMacroOp::nextIdxs(IntRegIndex &dest) +{ + unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; + assert(!inScalarBank(dest)); + dest = addStride(dest, stride); +} + +} diff --git a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh index 37553a5dc..57636bbfc 100644 --- a/src/arch/arm/insts/vfp.hh +++ b/src/arch/arm/insts/vfp.hh @@ -194,424 +194,23 @@ bitsToFp(uint64_t bits, double junk) typedef int VfpSavedState; -static inline VfpSavedState -prepFpState(uint32_t rMode) -{ - int roundingMode = fegetround(); - feclearexcept(FeAllExceptions); - switch (rMode) { - case VfpRoundNearest: - fesetround(FeRoundNearest); - break; - case VfpRoundUpward: - fesetround(FeRoundUpward); - break; - case VfpRoundDown: - fesetround(FeRoundDown); - break; - case VfpRoundZero: - fesetround(FeRoundZero); - break; - } - return roundingMode; -} - -static inline void -finishVfp(FPSCR &fpscr, VfpSavedState state) -{ - int exceptions = fetestexcept(FeAllExceptions); - bool underflow = false; - if (exceptions & FeInvalid) { - fpscr.ioc = 1; - } - if (exceptions & FeDivByZero) { - fpscr.dzc = 1; - } - if (exceptions & FeOverflow) { - fpscr.ofc = 1; - } - if (exceptions & FeUnderflow) { - underflow = true; - fpscr.ufc = 1; - } - if ((exceptions & FeInexact) && !(underflow && fpscr.fz)) { - fpscr.ixc = 1; - } - fesetround(state); -} +VfpSavedState prepFpState(uint32_t rMode); +void finishVfp(FPSCR &fpscr, VfpSavedState state); template <class fpType> -static inline fpType -fixDest(FPSCR fpscr, fpType val, fpType op1) -{ - int fpClass = std::fpclassify(val); - fpType junk = 0.0; - if (fpClass == FP_NAN) { - const bool single = (sizeof(val) == sizeof(float)); - const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000); - const bool nan = std::isnan(op1); - if (!nan || (fpscr.dn == 1)) { - val = bitsToFp(qnan, junk); - } else if (nan) { - val = bitsToFp(fpToBits(op1) | qnan, junk); - } - } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { - // Turn val into a zero with the correct sign; - uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); - val = bitsToFp(fpToBits(val) & bitMask, junk); - feclearexcept(FeInexact); - feraiseexcept(FeUnderflow); - } - return val; -} +fpType fixDest(FPSCR fpscr, fpType val, fpType op1); template <class fpType> -static inline fpType -fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) -{ - int fpClass = std::fpclassify(val); - fpType junk = 0.0; - if (fpClass == FP_NAN) { - const bool single = (sizeof(val) == sizeof(float)); - const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000); - const bool nan1 = std::isnan(op1); - const bool nan2 = std::isnan(op2); - const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); - const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); - if ((!nan1 && !nan2) || (fpscr.dn == 1)) { - val = bitsToFp(qnan, junk); - } else if (signal1) { - val = bitsToFp(fpToBits(op1) | qnan, junk); - } else if (signal2) { - val = bitsToFp(fpToBits(op2) | qnan, junk); - } else if (nan1) { - val = op1; - } else if (nan2) { - val = op2; - } - } else if (fpClass == FP_SUBNORMAL && fpscr.fz == 1) { - // Turn val into a zero with the correct sign; - uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); - val = bitsToFp(fpToBits(val) & bitMask, junk); - feclearexcept(FeInexact); - feraiseexcept(FeUnderflow); - } - return val; -} +fpType fixDest(FPSCR fpscr, fpType val, fpType op1, fpType op2); template <class fpType> -static inline fpType -fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2) -{ - fpType mid = fixDest(fpscr, val, op1, op2); - const bool single = (sizeof(fpType) == sizeof(float)); - const fpType junk = 0.0; - if ((single && (val == bitsToFp(0x00800000, junk) || - val == bitsToFp(0x80800000, junk))) || - (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) || - val == bitsToFp(ULL(0x8010000000000000), junk))) - ) { - __asm__ __volatile__("" : "=m" (op1) : "m" (op1)); - fesetround(FeRoundZero); - fpType temp = 0.0; - __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); - temp = op1 / op2; - if (flushToZero(temp)) { - feraiseexcept(FeUnderflow); - if (fpscr.fz) { - feclearexcept(FeInexact); - mid = temp; - } - } - __asm__ __volatile__("" :: "m" (temp)); - } - return mid; -} +fpType fixDivDest(FPSCR fpscr, fpType val, fpType op1, fpType op2); -static inline float -fixFpDFpSDest(FPSCR fpscr, double val) -{ - const float junk = 0.0; - float op1 = 0.0; - if (std::isnan(val)) { - uint64_t valBits = fpToBits(val); - uint32_t op1Bits = bits(valBits, 50, 29) | - (mask(9) << 22) | - (bits(valBits, 63) << 31); - op1 = bitsToFp(op1Bits, junk); - } - float mid = fixDest(fpscr, (float)val, op1); - if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) == - (FeUnderflow | FeInexact)) { - feclearexcept(FeInexact); - } - if (mid == bitsToFp(0x00800000, junk) || - mid == bitsToFp(0x80800000, junk)) { - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - fesetround(FeRoundZero); - float temp = 0.0; - __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); - temp = val; - if (flushToZero(temp)) { - feraiseexcept(FeUnderflow); - if (fpscr.fz) { - feclearexcept(FeInexact); - mid = temp; - } - } - __asm__ __volatile__("" :: "m" (temp)); - } - return mid; -} - -static inline double -fixFpSFpDDest(FPSCR fpscr, float val) -{ - const double junk = 0.0; - double op1 = 0.0; - if (std::isnan(val)) { - uint32_t valBits = fpToBits(val); - uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) | - (mask(12) << 51) | - ((uint64_t)bits(valBits, 31) << 63); - op1 = bitsToFp(op1Bits, junk); - } - double mid = fixDest(fpscr, (double)val, op1); - if (mid == bitsToFp(ULL(0x0010000000000000), junk) || - mid == bitsToFp(ULL(0x8010000000000000), junk)) { - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - fesetround(FeRoundZero); - double temp = 0.0; - __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); - temp = val; - if (flushToZero(temp)) { - feraiseexcept(FeUnderflow); - if (fpscr.fz) { - feclearexcept(FeInexact); - mid = temp; - } - } - __asm__ __volatile__("" :: "m" (temp)); - } - return mid; -} - -static inline float -vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top) -{ - float junk = 0.0; - uint32_t destBits = fpToBits(dest); - uint32_t opBits = fpToBits(op); - // Extract the operand. - bool neg = bits(opBits, 31); - uint32_t exponent = bits(opBits, 30, 23); - uint32_t oldMantissa = bits(opBits, 22, 0); - uint32_t mantissa = oldMantissa >> (23 - 10); - // Do the conversion. - uint32_t extra = oldMantissa & mask(23 - 10); - if (exponent == 0xff) { - if (oldMantissa != 0) { - // Nans. - if (bits(mantissa, 9) == 0) { - // Signalling nan. - fpscr.ioc = 1; - } - if (fpscr.ahp) { - mantissa = 0; - exponent = 0; - fpscr.ioc = 1; - } else if (fpscr.dn) { - mantissa = (1 << 9); - exponent = 0x1f; - neg = false; - } else { - exponent = 0x1f; - mantissa |= (1 << 9); - } - } else { - // Infinities. - exponent = 0x1F; - if (fpscr.ahp) { - fpscr.ioc = 1; - mantissa = 0x3ff; - } else { - mantissa = 0; - } - } - } else if (exponent == 0 && oldMantissa == 0) { - // Zero, don't need to do anything. - } else { - // Normalized or denormalized numbers. - - bool inexact = (extra != 0); - - if (exponent == 0) { - // Denormalized. - - // If flush to zero is on, this shouldn't happen. - assert(fpscr.fz == 0); - - // Check for underflow - if (inexact || fpscr.ufe) - fpscr.ufc = 1; - - // Handle rounding. - unsigned mode = fpscr.rMode; - if ((mode == VfpRoundUpward && !neg && extra) || - (mode == VfpRoundDown && neg && extra) || - (mode == VfpRoundNearest && - (extra > (1 << 9) || - (extra == (1 << 9) && bits(mantissa, 0))))) { - mantissa++; - } - - // See if the number became normalized after rounding. - if (mantissa == (1 << 10)) { - mantissa = 0; - exponent = 1; - } - } else { - // Normalized. - - // We need to track the dropped bits differently since - // more can be dropped by denormalizing. - bool topOne = bits(extra, 12); - bool restZeros = bits(extra, 11, 0) == 0; - - if (exponent <= (127 - 15)) { - // The result is too small. Denormalize. - mantissa |= (1 << 10); - while (mantissa && exponent <= (127 - 15)) { - restZeros = restZeros && !topOne; - topOne = bits(mantissa, 0); - mantissa = mantissa >> 1; - exponent++; - } - if (topOne || !restZeros) - inexact = true; - exponent = 0; - } else { - // Change bias. - exponent -= (127 - 15); - } - - if (exponent == 0 && (inexact || fpscr.ufe)) { - // Underflow - fpscr.ufc = 1; - } - - // Handle rounding. - unsigned mode = fpscr.rMode; - bool nonZero = topOne || !restZeros; - if ((mode == VfpRoundUpward && !neg && nonZero) || - (mode == VfpRoundDown && neg && nonZero) || - (mode == VfpRoundNearest && topOne && - (!restZeros || bits(mantissa, 0)))) { - mantissa++; - } - - // See if we rounded up and need to bump the exponent. - if (mantissa == (1 << 10)) { - mantissa = 0; - exponent++; - } - - // Deal with overflow - if (fpscr.ahp) { - if (exponent >= 0x20) { - exponent = 0x1f; - mantissa = 0x3ff; - fpscr.ioc = 1; - // Supress inexact exception. - inexact = false; - } - } else { - if (exponent >= 0x1f) { - if ((mode == VfpRoundNearest) || - (mode == VfpRoundUpward && !neg) || - (mode == VfpRoundDown && neg)) { - // Overflow to infinity. - exponent = 0x1f; - mantissa = 0; - } else { - // Overflow to max normal. - exponent = 0x1e; - mantissa = 0x3ff; - } - fpscr.ofc = 1; - inexact = true; - } - } - } - - if (inexact) { - fpscr.ixc = 1; - } - } - // Reassemble and install the result. - uint32_t result = bits(mantissa, 9, 0); - replaceBits(result, 14, 10, exponent); - if (neg) - result |= (1 << 15); - if (top) - replaceBits(destBits, 31, 16, result); - else - replaceBits(destBits, 15, 0, result); - return bitsToFp(destBits, junk); -} +float fixFpDFpSDest(FPSCR fpscr, double val); +double fixFpSFpDDest(FPSCR fpscr, float val); -static inline float -vcvtFpHFpS(FPSCR &fpscr, float op, bool top) -{ - float junk = 0.0; - uint32_t opBits = fpToBits(op); - // Extract the operand. - if (top) - opBits = bits(opBits, 31, 16); - else - opBits = bits(opBits, 15, 0); - // Extract the bitfields. - bool neg = bits(opBits, 15); - uint32_t exponent = bits(opBits, 14, 10); - uint32_t mantissa = bits(opBits, 9, 0); - // Do the conversion. - if (exponent == 0) { - if (mantissa != 0) { - // Normalize the value. - exponent = exponent + (127 - 15) + 1; - while (mantissa < (1 << 10)) { - mantissa = mantissa << 1; - exponent--; - } - } - mantissa = mantissa << (23 - 10); - } else if (exponent == 0x1f && !fpscr.ahp) { - // Infinities and nans. - exponent = 0xff; - if (mantissa != 0) { - // Nans. - mantissa = mantissa << (23 - 10); - if (bits(mantissa, 22) == 0) { - // Signalling nan. - fpscr.ioc = 1; - mantissa |= (1 << 22); - } - if (fpscr.dn) { - mantissa &= ~mask(22); - neg = false; - } - } - } else { - exponent = exponent + (127 - 15); - mantissa = mantissa << (23 - 10); - } - // Reassemble the result. - uint32_t result = bits(mantissa, 22, 0); - replaceBits(result, 30, 23, exponent); - if (neg) - result |= (1 << 31); - return bitsToFp(result, junk); -} +float vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top); +float vcvtFpHFpS(FPSCR &fpscr, float op, bool top); static inline double makeDouble(uint32_t low, uint32_t high) @@ -632,245 +231,15 @@ highFromDouble(double val) return fpToBits(val) >> 32; } -static inline uint64_t -vfpFpSToFixed(float val, bool isSigned, bool half, - uint8_t imm, bool rzero = true) -{ - int rmode = rzero ? FeRoundZero : fegetround(); - __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode)); - fesetround(FeRoundNearest); - val = val * powf(2.0, imm); - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - fesetround(rmode); - feclearexcept(FeAllExceptions); - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - float origVal = val; - val = rintf(val); - int fpType = std::fpclassify(val); - if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { - if (fpType == FP_NAN) { - feraiseexcept(FeInvalid); - } - val = 0.0; - } else if (origVal != val) { - switch (rmode) { - case FeRoundNearest: - if (origVal - val > 0.5) - val += 1.0; - else if (val - origVal > 0.5) - val -= 1.0; - break; - case FeRoundDown: - if (origVal < val) - val -= 1.0; - break; - case FeRoundUpward: - if (origVal > val) - val += 1.0; - break; - } - feraiseexcept(FeInexact); - } - - if (isSigned) { - if (half) { - if ((double)val < (int16_t)(1 << 15)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int16_t)(1 << 15); - } - if ((double)val > (int16_t)mask(15)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int16_t)mask(15); - } - return (int16_t)val; - } else { - if ((double)val < (int32_t)(1 << 31)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int32_t)(1 << 31); - } - if ((double)val > (int32_t)mask(31)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int32_t)mask(31); - } - return (int32_t)val; - } - } else { - if (half) { - if ((double)val < 0) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return 0; - } - if ((double)val > (mask(16))) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return mask(16); - } - return (uint16_t)val; - } else { - if ((double)val < 0) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return 0; - } - if ((double)val > (mask(32))) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return mask(32); - } - return (uint32_t)val; - } - } -} - -static inline float -vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) -{ - fesetround(FeRoundNearest); - if (half) - val = (uint16_t)val; - float scale = powf(2.0, imm); - __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - feclearexcept(FeAllExceptions); - __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (float)val, scale); -} - -static inline float -vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm) -{ - fesetround(FeRoundNearest); - if (half) - val = sext<16>(val & mask(16)); - float scale = powf(2.0, imm); - __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - feclearexcept(FeAllExceptions); - __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (float)val, scale); -} - -static inline uint64_t -vfpFpDToFixed(double val, bool isSigned, bool half, - uint8_t imm, bool rzero = true) -{ - int rmode = rzero ? FeRoundZero : fegetround(); - fesetround(FeRoundNearest); - val = val * pow(2.0, imm); - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - fesetround(rmode); - feclearexcept(FeAllExceptions); - __asm__ __volatile__("" : "=m" (val) : "m" (val)); - double origVal = val; - val = rint(val); - int fpType = std::fpclassify(val); - if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { - if (fpType == FP_NAN) { - feraiseexcept(FeInvalid); - } - val = 0.0; - } else if (origVal != val) { - switch (rmode) { - case FeRoundNearest: - if (origVal - val > 0.5) - val += 1.0; - else if (val - origVal > 0.5) - val -= 1.0; - break; - case FeRoundDown: - if (origVal < val) - val -= 1.0; - break; - case FeRoundUpward: - if (origVal > val) - val += 1.0; - break; - } - feraiseexcept(FeInexact); - } - if (isSigned) { - if (half) { - if (val < (int16_t)(1 << 15)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int16_t)(1 << 15); - } - if (val > (int16_t)mask(15)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int16_t)mask(15); - } - return (int16_t)val; - } else { - if (val < (int32_t)(1 << 31)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int32_t)(1 << 31); - } - if (val > (int32_t)mask(31)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return (int32_t)mask(31); - } - return (int32_t)val; - } - } else { - if (half) { - if (val < 0) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return 0; - } - if (val > mask(16)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return mask(16); - } - return (uint16_t)val; - } else { - if (val < 0) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return 0; - } - if (val > mask(32)) { - feraiseexcept(FeInvalid); - feclearexcept(FeInexact); - return mask(32); - } - return (uint32_t)val; - } - } -} - -static inline double -vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm) -{ - fesetround(FeRoundNearest); - if (half) - val = (uint16_t)val; - double scale = pow(2.0, imm); - __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - feclearexcept(FeAllExceptions); - __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (double)val, scale); -} +uint64_t vfpFpSToFixed(float val, bool isSigned, bool half, + uint8_t imm, bool rzero = true); +float vfpUFixedToFpS(FPSCR fpscr, uint32_t val, bool half, uint8_t imm); +float vfpSFixedToFpS(FPSCR fpscr, int32_t val, bool half, uint8_t imm); -static inline double -vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm) -{ - fesetround(FeRoundNearest); - if (half) - val = sext<16>(val & mask(16)); - double scale = pow(2.0, imm); - __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - feclearexcept(FeAllExceptions); - __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); - return fixDivDest(fpscr, val / scale, (double)val, scale); -} +uint64_t vfpFpDToFixed(double val, bool isSigned, bool half, + uint8_t imm, bool rzero = true); +double vfpUFixedToFpD(FPSCR fpscr, uint32_t val, bool half, uint8_t imm); +double vfpSFixedToFpD(FPSCR fpscr, int32_t val, bool half, uint8_t imm); class VfpMacroOp : public PredMacroOp { @@ -889,49 +258,10 @@ class VfpMacroOp : public PredMacroOp PredMacroOp(mnem, _machInst, __opClass), wide(_wide) {} - IntRegIndex - addStride(IntRegIndex idx, unsigned stride) - { - if (wide) { - stride *= 2; - } - unsigned offset = idx % 8; - idx = (IntRegIndex)(idx - offset); - offset += stride; - idx = (IntRegIndex)(idx + (offset % 8)); - return idx; - } - - void - nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2) - { - unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; - assert(!inScalarBank(dest)); - dest = addStride(dest, stride); - op1 = addStride(op1, stride); - if (!inScalarBank(op2)) { - op2 = addStride(op2, stride); - } - } - - void - nextIdxs(IntRegIndex &dest, IntRegIndex &op1) - { - unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; - assert(!inScalarBank(dest)); - dest = addStride(dest, stride); - if (!inScalarBank(op1)) { - op1 = addStride(op1, stride); - } - } - - void - nextIdxs(IntRegIndex &dest) - { - unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; - assert(!inScalarBank(dest)); - dest = addStride(dest, stride); - } + IntRegIndex addStride(IntRegIndex idx, unsigned stride); + void nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2); + void nextIdxs(IntRegIndex &dest, IntRegIndex &op1); + void nextIdxs(IntRegIndex &dest); }; static inline float @@ -1036,119 +366,13 @@ class FpOp : public PredOp fpType binaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType (*func)(fpType, fpType), - bool flush, uint32_t rMode) const - { - const bool single = (sizeof(fpType) == sizeof(float)); - fpType junk = 0.0; - - if (flush && flushToZero(op1, op2)) - fpscr.idc = 1; - VfpSavedState state = prepFpState(rMode); - __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state) - : "m" (op1), "m" (op2), "m" (state)); - fpType dest = func(op1, op2); - __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); - - int fpClass = std::fpclassify(dest); - // Get NAN behavior right. This varies between x86 and ARM. - if (fpClass == FP_NAN) { - const bool single = (sizeof(fpType) == sizeof(float)); - const uint64_t qnan = - single ? 0x7fc00000 : ULL(0x7ff8000000000000); - const bool nan1 = std::isnan(op1); - const bool nan2 = std::isnan(op2); - const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); - const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); - if ((!nan1 && !nan2) || (fpscr.dn == 1)) { - dest = bitsToFp(qnan, junk); - } else if (signal1) { - dest = bitsToFp(fpToBits(op1) | qnan, junk); - } else if (signal2) { - dest = bitsToFp(fpToBits(op2) | qnan, junk); - } else if (nan1) { - dest = op1; - } else if (nan2) { - dest = op2; - } - } else if (flush && flushToZero(dest)) { - feraiseexcept(FeUnderflow); - } else if (( - (single && (dest == bitsToFp(0x00800000, junk) || - dest == bitsToFp(0x80800000, junk))) || - (!single && - (dest == bitsToFp(ULL(0x0010000000000000), junk) || - dest == bitsToFp(ULL(0x8010000000000000), junk))) - ) && rMode != VfpRoundZero) { - /* - * Correct for the fact that underflow is detected -before- rounding - * in ARM and -after- rounding in x86. - */ - fesetround(FeRoundZero); - __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2) - : "m" (op1), "m" (op2)); - fpType temp = func(op1, op2); - __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); - if (flush && flushToZero(temp)) { - dest = temp; - } - } - finishVfp(fpscr, state); - return dest; - } + bool flush, uint32_t rMode) const; template <class fpType> fpType unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType), - bool flush, uint32_t rMode) const - { - const bool single = (sizeof(fpType) == sizeof(float)); - fpType junk = 0.0; - - if (flush && flushToZero(op1)) - fpscr.idc = 1; - VfpSavedState state = prepFpState(rMode); - __asm__ __volatile__ ("" : "=m" (op1), "=m" (state) - : "m" (op1), "m" (state)); - fpType dest = func(op1); - __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); - - int fpClass = std::fpclassify(dest); - // Get NAN behavior right. This varies between x86 and ARM. - if (fpClass == FP_NAN) { - const bool single = (sizeof(fpType) == sizeof(float)); - const uint64_t qnan = - single ? 0x7fc00000 : ULL(0x7ff8000000000000); - const bool nan = std::isnan(op1); - if (!nan || fpscr.dn == 1) { - dest = bitsToFp(qnan, junk); - } else if (nan) { - dest = bitsToFp(fpToBits(op1) | qnan, junk); - } - } else if (flush && flushToZero(dest)) { - feraiseexcept(FeUnderflow); - } else if (( - (single && (dest == bitsToFp(0x00800000, junk) || - dest == bitsToFp(0x80800000, junk))) || - (!single && - (dest == bitsToFp(ULL(0x0010000000000000), junk) || - dest == bitsToFp(ULL(0x8010000000000000), junk))) - ) && rMode != VfpRoundZero) { - /* - * Correct for the fact that underflow is detected -before- rounding - * in ARM and -after- rounding in x86. - */ - fesetround(FeRoundZero); - __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1)); - fpType temp = func(op1); - __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); - if (flush && flushToZero(temp)) { - dest = temp; - } - } - finishVfp(fpscr, state); - return dest; - } + bool flush, uint32_t rMode) const; }; class FpRegRegOp : public FpOp |