/*
 * Copyright (c) 2010-2013 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder. You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Gabe Black
 */

#include "arch/arm/insts/vfp.hh"

/*
 * The asm statements below are to keep gcc from reordering code. Otherwise
 * the rounding mode might be set after the operation it was intended for, the
 * exception bits read before it, etc.
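 *
 * A typical barrier in this file looks like
 *     __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
 * which makes the compiler assume op1 is read and written through memory at
 * that point, so the floating point operation using op1 cannot be reordered
 * with the surrounding fesetround()/feclearexcept()/fetestexcept() calls.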
*/ std::string FpCondCompRegOp::generateDisassembly( Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss, "", false); printIntReg(ss, op1); ccprintf(ss, ", "); printIntReg(ss, op2); ccprintf(ss, ", #%d", defCc); ccprintf(ss, ", "); printCondition(ss, condCode, true); return ss.str(); } std::string FpCondSelOp::generateDisassembly( Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss, "", false); printIntReg(ss, dest); ccprintf(ss, ", "); printIntReg(ss, op1); ccprintf(ss, ", "); printIntReg(ss, op2); ccprintf(ss, ", "); printCondition(ss, condCode, true); return ss.str(); } std::string FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); printFloatReg(ss, dest); ss << ", "; printFloatReg(ss, op1); return ss.str(); } std::string FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); printFloatReg(ss, dest); ccprintf(ss, ", #%d", imm); return ss.str(); } std::string FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); printFloatReg(ss, dest); ss << ", "; printFloatReg(ss, op1); ccprintf(ss, ", #%d", imm); return ss.str(); } std::string FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); printFloatReg(ss, dest); ss << ", "; printFloatReg(ss, op1); ss << ", "; printFloatReg(ss, op2); return ss.str(); } std::string FpRegRegRegCondOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); printCondition(ss, cond); printFloatReg(ss, dest); ss << ", "; printFloatReg(ss, op1); ss << ", "; printFloatReg(ss, op2); return ss.str(); } std::string FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); printFloatReg(ss, dest); ss << ", "; printFloatReg(ss, op1); ss << ", "; printFloatReg(ss, op2); ss << ", "; printFloatReg(ss, op3); return ss.str(); } std::string FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const { std::stringstream ss; printMnemonic(ss); printFloatReg(ss, dest); ss << ", "; printFloatReg(ss, op1); ss << ", "; printFloatReg(ss, op2); ccprintf(ss, ", #%d", imm); return ss.str(); } namespace ArmISA { VfpSavedState prepFpState(uint32_t rMode) { int roundingMode = fegetround(); feclearexcept(FeAllExceptions); switch (rMode) { case VfpRoundNearest: fesetround(FeRoundNearest); break; case VfpRoundUpward: fesetround(FeRoundUpward); break; case VfpRoundDown: fesetround(FeRoundDown); break; case VfpRoundZero: fesetround(FeRoundZero); break; } return roundingMode; } void finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask) { int exceptions = fetestexcept(FeAllExceptions); bool underflow = false; if ((exceptions & FeInvalid) && mask.ioc) { fpscr.ioc = 1; } if ((exceptions & FeDivByZero) && mask.dzc) { fpscr.dzc = 1; } if ((exceptions & FeOverflow) && mask.ofc) { fpscr.ofc = 1; } if (exceptions & FeUnderflow) { underflow = true; if (mask.ufc) fpscr.ufc = 1; } if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) { fpscr.ixc = 1; } fesetround(state); } template fpType fixDest(bool flush, bool defaultNan, fpType val, fpType op1) { int fpClass = std::fpclassify(val); fpType junk = 0.0; if (fpClass == FP_NAN) { const bool single = (sizeof(val) == sizeof(float)); const uint64_t qnan = single ? 
0x7fc00000 : ULL(0x7ff8000000000000); const bool nan = std::isnan(op1); if (!nan || defaultNan) { val = bitsToFp(qnan, junk); } else if (nan) { val = bitsToFp(fpToBits(op1) | qnan, junk); } } else if (fpClass == FP_SUBNORMAL && flush == 1) { // Turn val into a zero with the correct sign; uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); val = bitsToFp(fpToBits(val) & bitMask, junk); feclearexcept(FeInexact); feraiseexcept(FeUnderflow); } return val; } template float fixDest(bool flush, bool defaultNan, float val, float op1); template double fixDest(bool flush, bool defaultNan, double val, double op1); template fpType fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) { int fpClass = std::fpclassify(val); fpType junk = 0.0; if (fpClass == FP_NAN) { const bool single = (sizeof(val) == sizeof(float)); const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000); const bool nan1 = std::isnan(op1); const bool nan2 = std::isnan(op2); const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); if ((!nan1 && !nan2) || defaultNan) { val = bitsToFp(qnan, junk); } else if (signal1) { val = bitsToFp(fpToBits(op1) | qnan, junk); } else if (signal2) { val = bitsToFp(fpToBits(op2) | qnan, junk); } else if (nan1) { val = op1; } else if (nan2) { val = op2; } } else if (fpClass == FP_SUBNORMAL && flush) { // Turn val into a zero with the correct sign; uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1); val = bitsToFp(fpToBits(val) & bitMask, junk); feclearexcept(FeInexact); feraiseexcept(FeUnderflow); } return val; } template float fixDest(bool flush, bool defaultNan, float val, float op1, float op2); template double fixDest(bool flush, bool defaultNan, double val, double op1, double op2); template fpType fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2) { fpType mid = fixDest(flush, defaultNan, val, op1, op2); const bool single = (sizeof(fpType) == sizeof(float)); const fpType junk = 0.0; if ((single && (val == bitsToFp(0x00800000, junk) || val == bitsToFp(0x80800000, junk))) || (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) || val == bitsToFp(ULL(0x8010000000000000), junk))) ) { __asm__ __volatile__("" : "=m" (op1) : "m" (op1)); fesetround(FeRoundZero); fpType temp = 0.0; __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); temp = op1 / op2; if (flushToZero(temp)) { feraiseexcept(FeUnderflow); if (flush) { feclearexcept(FeInexact); mid = temp; } } __asm__ __volatile__("" :: "m" (temp)); } return mid; } template float fixDivDest(bool flush, bool defaultNan, float val, float op1, float op2); template double fixDivDest(bool flush, bool defaultNan, double val, double op1, double op2); float fixFpDFpSDest(FPSCR fpscr, double val) { const float junk = 0.0; float op1 = 0.0; if (std::isnan(val)) { uint64_t valBits = fpToBits(val); uint32_t op1Bits = bits(valBits, 50, 29) | (mask(9) << 22) | (bits(valBits, 63) << 31); op1 = bitsToFp(op1Bits, junk); } float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1); if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) == (FeUnderflow | FeInexact)) { feclearexcept(FeInexact); } if (mid == bitsToFp(0x00800000, junk) || mid == bitsToFp(0x80800000, junk)) { __asm__ __volatile__("" : "=m" (val) : "m" (val)); fesetround(FeRoundZero); float temp = 0.0; __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); temp = val; if (flushToZero(temp)) { feraiseexcept(FeUnderflow); if (fpscr.fz) { feclearexcept(FeInexact); mid = temp; } } __asm__ 
__volatile__("" :: "m" (temp)); } return mid; } double fixFpSFpDDest(FPSCR fpscr, float val) { const double junk = 0.0; double op1 = 0.0; if (std::isnan(val)) { uint32_t valBits = fpToBits(val); uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) | (mask(12) << 51) | ((uint64_t)bits(valBits, 31) << 63); op1 = bitsToFp(op1Bits, junk); } double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1); if (mid == bitsToFp(ULL(0x0010000000000000), junk) || mid == bitsToFp(ULL(0x8010000000000000), junk)) { __asm__ __volatile__("" : "=m" (val) : "m" (val)); fesetround(FeRoundZero); double temp = 0.0; __asm__ __volatile__("" : "=m" (temp) : "m" (temp)); temp = val; if (flushToZero(temp)) { feraiseexcept(FeUnderflow); if (fpscr.fz) { feclearexcept(FeInexact); mid = temp; } } __asm__ __volatile__("" :: "m" (temp)); } return mid; } static inline uint16_t vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan, uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble) { uint32_t mWidth; uint32_t eWidth; uint32_t eHalfRange; uint32_t sBitPos; if (isDouble) { mWidth = 52; eWidth = 11; } else { mWidth = 23; eWidth = 8; } sBitPos = eWidth + mWidth; eHalfRange = (1 << (eWidth-1)) - 1; // Extract the operand. bool neg = bits(opBits, sBitPos); uint32_t exponent = bits(opBits, sBitPos-1, mWidth); uint64_t oldMantissa = bits(opBits, mWidth-1, 0); uint32_t mantissa = oldMantissa >> (mWidth - 10); // Do the conversion. uint64_t extra = oldMantissa & mask(mWidth - 10); if (exponent == mask(eWidth)) { if (oldMantissa != 0) { // Nans. if (bits(mantissa, 9) == 0) { // Signalling nan. fpscr.ioc = 1; } if (ahp) { mantissa = 0; exponent = 0; fpscr.ioc = 1; } else if (defaultNan) { mantissa = (1 << 9); exponent = 0x1f; neg = false; } else { exponent = 0x1f; mantissa |= (1 << 9); } } else { // Infinities. exponent = 0x1F; if (ahp) { fpscr.ioc = 1; mantissa = 0x3ff; } else { mantissa = 0; } } } else if (exponent == 0 && oldMantissa == 0) { // Zero, don't need to do anything. } else { // Normalized or denormalized numbers. bool inexact = (extra != 0); if (exponent == 0) { // Denormalized. // If flush to zero is on, this shouldn't happen. assert(!flush); // Check for underflow if (inexact || fpscr.ufe) fpscr.ufc = 1; // Handle rounding. unsigned mode = rMode; if ((mode == VfpRoundUpward && !neg && extra) || (mode == VfpRoundDown && neg && extra) || (mode == VfpRoundNearest && (extra > (1 << 9) || (extra == (1 << 9) && bits(mantissa, 0))))) { mantissa++; } // See if the number became normalized after rounding. if (mantissa == (1 << 10)) { mantissa = 0; exponent = 1; } } else { // Normalized. // We need to track the dropped bits differently since // more can be dropped by denormalizing. bool topOne = bits(extra, mWidth - 10 - 1); bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0; if (exponent <= (eHalfRange - 15)) { // The result is too small. Denormalize. mantissa |= (1 << 10); while (mantissa && exponent <= (eHalfRange - 15)) { restZeros = restZeros && !topOne; topOne = bits(mantissa, 0); mantissa = mantissa >> 1; exponent++; } if (topOne || !restZeros) inexact = true; exponent = 0; } else { // Change bias. exponent -= (eHalfRange - 15); } if (exponent == 0 && (inexact || fpscr.ufe)) { // Underflow fpscr.ufc = 1; } // Handle rounding. 
            unsigned mode = rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow
            if (ahp) {
                if (exponent >= 0x20) {
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }
        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and install the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    return result;
}

uint16_t
vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, float op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
}

uint16_t
vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, double op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
}

static inline uint64_t
vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth-1)) - 1;

    // Extract the bitfields.
    bool neg = bits(op, 15);
    uint32_t exponent = bits(op, 14, 10);
    uint64_t mantissa = bits(op, 9, 0);

    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value.
            exponent = exponent + (eHalfRange - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (mWidth - 10);
    } else if (exponent == 0x1f && !ahp) {
        // Infinities and nans.
        exponent = mask(eWidth);
        if (mantissa != 0) {
            // Nans.
            mantissa = mantissa << (mWidth - 10);
            if (bits(mantissa, mWidth-1) == 0) {
                // Signalling nan.
                fpscr.ioc = 1;
                mantissa |= (((uint64_t) 1) << (mWidth-1));
            }
            if (defaultNan) {
                mantissa &= ~mask(mWidth-1);
                neg = false;
            }
        }
    } else {
        exponent = exponent + (eHalfRange - 15);
        mantissa = mantissa << (mWidth - 10);
    }
    // Reassemble the result.
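    // The mantissa occupies the low mWidth bits, the exponent sits just
    // above it at [sBitPos-1:mWidth], and the sign goes in bit sBitPos,
    // giving the standard single or double precision layout.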
uint64_t result = bits(mantissa, mWidth-1, 0); replaceBits(result, sBitPos-1, mWidth, exponent); if (neg) { result |= (((uint64_t) 1) << sBitPos); } return result; } double vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) { double junk = 0.0; uint64_t result; result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true); return bitsToFp(result, junk); } float vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op) { float junk = 0.0; uint64_t result; result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false); return bitsToFp(result, junk); } float vfpUFixedToFpS(bool flush, bool defaultNan, uint64_t val, uint8_t width, uint8_t imm) { fesetround(FeRoundNearest); if (width == 16) val = (uint16_t)val; else if (width == 32) val = (uint32_t)val; else if (width != 64) panic("Unsupported width %d", width); float scale = powf(2.0, imm); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); } float vfpSFixedToFpS(bool flush, bool defaultNan, int64_t val, uint8_t width, uint8_t imm) { fesetround(FeRoundNearest); if (width == 16) val = sext<16>(val & mask(16)); else if (width == 32) val = sext<32>(val & mask(32)); else if (width != 64) panic("Unsupported width %d", width); float scale = powf(2.0, imm); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); return fixDivDest(flush, defaultNan, val / scale, (float)val, scale); } double vfpUFixedToFpD(bool flush, bool defaultNan, uint64_t val, uint8_t width, uint8_t imm) { fesetround(FeRoundNearest); if (width == 16) val = (uint16_t)val; else if (width == 32) val = (uint32_t)val; else if (width != 64) panic("Unsupported width %d", width); double scale = pow(2.0, imm); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); } double vfpSFixedToFpD(bool flush, bool defaultNan, int64_t val, uint8_t width, uint8_t imm) { fesetround(FeRoundNearest); if (width == 16) val = sext<16>(val & mask(16)); else if (width == 32) val = sext<32>(val & mask(32)); else if (width != 64) panic("Unsupported width %d", width); double scale = pow(2.0, imm); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); feclearexcept(FeAllExceptions); __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); return fixDivDest(flush, defaultNan, val / scale, (double)val, scale); } // This function implements a magic formula taken from the architecture // reference manual. It was originally called recip_sqrt_estimate. static double recipSqrtEstimate(double a) { int64_t q0, q1, s; double r; if (a < 0.5) { q0 = (int64_t)(a * 512.0); r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0); } else { q1 = (int64_t)(a * 256.0); r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0); } s = (int64_t)(256.0 * r + 0.5); return (double)s / 256.0; } // This function is only intended for use in Neon instructions because // it ignores certain bits in the FPSCR. float fprSqrtEstimate(FPSCR &fpscr, float op) { const uint32_t qnan = 0x7fc00000; float junk = 0.0; int fpClass = std::fpclassify(op); if (fpClass == FP_NAN) { if ((fpToBits(op) & qnan) != qnan) fpscr.ioc = 1; return bitsToFp(qnan, junk); } else if (fpClass == FP_ZERO) { fpscr.dzc = 1; // Return infinity with the same sign as the operand. 
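        // An all-ones exponent (0xFF) with a zero mantissa is the single
        // precision encoding of infinity; only the sign bit is taken from
        // the operand.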
return bitsToFp((std::signbit(op) << 31) | (0xFF << 23) | (0 << 0), junk); } else if (std::signbit(op)) { // Set invalid op bit. fpscr.ioc = 1; return bitsToFp(qnan, junk); } else if (fpClass == FP_INFINITE) { return 0.0; } else { uint64_t opBits = fpToBits(op); double scaled; if (bits(opBits, 23)) { scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63), (double)0.0); } else { scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63), (double)0.0); } uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2; uint64_t estimate = fpToBits(recipSqrtEstimate(scaled)); return bitsToFp((bits(estimate, 63) << 31) | (bits(resultExp, 7, 0) << 23) | (bits(estimate, 51, 29) << 0), junk); } } uint32_t unsignedRSqrtEstimate(uint32_t op) { if (bits(op, 31, 30) == 0) { return -1; } else { double dpOp; if (bits(op, 31)) { dpOp = bitsToFp((ULL(0) << 63) | (ULL(0x3fe) << 52) | (bits((uint64_t)op, 30, 0) << 21) | (0 << 0), (double)0.0); } else { dpOp = bitsToFp((ULL(0) << 63) | (ULL(0x3fd) << 52) | (bits((uint64_t)op, 29, 0) << 22) | (0 << 0), (double)0.0); } uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp)); return (1 << 31) | bits(estimate, 51, 21); } } // This function implements a magic formula taken from the architecture // reference manual. It was originally called recip_estimate. static double recipEstimate(double a) { int64_t q, s; double r; q = (int64_t)(a * 512.0); r = 1.0 / (((double)q + 0.5) / 512.0); s = (int64_t)(256.0 * r + 0.5); return (double)s / 256.0; } // This function is only intended for use in Neon instructions because // it ignores certain bits in the FPSCR. float fpRecipEstimate(FPSCR &fpscr, float op) { const uint32_t qnan = 0x7fc00000; float junk = 0.0; int fpClass = std::fpclassify(op); if (fpClass == FP_NAN) { if ((fpToBits(op) & qnan) != qnan) fpscr.ioc = 1; return bitsToFp(qnan, junk); } else if (fpClass == FP_INFINITE) { return bitsToFp(std::signbit(op) << 31, junk); } else if (fpClass == FP_ZERO) { fpscr.dzc = 1; // Return infinity with the same sign as the operand. return bitsToFp((std::signbit(op) << 31) | (0xFF << 23) | (0 << 0), junk); } else if (fabs(op) >= pow(2.0, 126)) { fpscr.ufc = 1; return bitsToFp(std::signbit(op) << 31, junk); } else { uint64_t opBits = fpToBits(op); double scaled; scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) | (ULL(0x3fe) << 52) | (ULL(0) << 63), (double)0.0); uint64_t resultExp = 253 - bits(opBits, 30, 23); uint64_t estimate = fpToBits(recipEstimate(scaled)); return bitsToFp((bits(opBits, 31) << 31) | (bits(resultExp, 7, 0) << 23) | (bits(estimate, 51, 29) << 0), junk); } } uint32_t unsignedRecipEstimate(uint32_t op) { if (bits(op, 31) == 0) { return -1; } else { double dpOp; dpOp = bitsToFp((ULL(0) << 63) | (ULL(0x3fe) << 52) | (bits((uint64_t)op, 30, 0) << 21) | (0 << 0), (double)0.0); uint64_t estimate = fpToBits(recipEstimate(dpOp)); return (1 << 31) | bits(estimate, 51, 21); } } template fpType FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, fpType op1, fpType op2) const { done = true; fpType junk = 0.0; fpType dest = 0.0; const bool single = (sizeof(fpType) == sizeof(float)); const uint64_t qnan = single ? 
0x7fc00000 : ULL(0x7ff8000000000000); const bool nan1 = std::isnan(op1); const bool nan2 = std::isnan(op2); const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); if (nan1 || nan2) { if (defaultNan) { dest = bitsToFp(qnan, junk); } else if (signal1) { dest = bitsToFp(fpToBits(op1) | qnan, junk); } else if (signal2) { dest = bitsToFp(fpToBits(op2) | qnan, junk); } else if (nan1) { dest = op1; } else if (nan2) { dest = op2; } if (signal1 || signal2) { fpscr.ioc = 1; } } else { done = false; } return dest; } template float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, float op1, float op2) const; template double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan, double op1, double op2) const; // @TODO remove this function when we've finished switching all FMA code to use the new FPLIB template fpType FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3, fpType (*func)(fpType, fpType, fpType), bool flush, bool defaultNan, uint32_t rMode) const { const bool single = (sizeof(fpType) == sizeof(float)); fpType junk = 0.0; if (flush && (flushToZero(op1, op2) || flushToZero(op3))) fpscr.idc = 1; VfpSavedState state = prepFpState(rMode); __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state) : "m" (op1), "m" (op2), "m" (op3), "m" (state)); fpType dest = func(op1, op2, op3); __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); int fpClass = std::fpclassify(dest); // Get NAN behavior right. This varies between x86 and ARM. if (fpClass == FP_NAN) { const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000); const bool nan1 = std::isnan(op1); const bool nan2 = std::isnan(op2); const bool nan3 = std::isnan(op3); const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan); if ((!nan1 && !nan2 && !nan3) || (defaultNan == 1)) { dest = bitsToFp(qnan, junk); } else if (signal1) { dest = bitsToFp(fpToBits(op1) | qnan, junk); } else if (signal2) { dest = bitsToFp(fpToBits(op2) | qnan, junk); } else if (signal3) { dest = bitsToFp(fpToBits(op3) | qnan, junk); } else if (nan1) { dest = op1; } else if (nan2) { dest = op2; } else if (nan3) { dest = op3; } } else if (flush && flushToZero(dest)) { feraiseexcept(FeUnderflow); } else if (( (single && (dest == bitsToFp(0x00800000, junk) || dest == bitsToFp(0x80800000, junk))) || (!single && (dest == bitsToFp(ULL(0x0010000000000000), junk) || dest == bitsToFp(ULL(0x8010000000000000), junk))) ) && rMode != VfpRoundZero) { /* * Correct for the fact that underflow is detected -before- rounding * in ARM and -after- rounding in x86. 
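         * To compensate, the operation is redone below with the rounding
         * mode forced to round-toward-zero; if that result flushes to zero,
         * the pre-rounding magnitude was below the minimum normal, which is
         * when the ARM definition requires underflow to be reported.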
*/ fesetround(FeRoundZero); __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3) : "m" (op1), "m" (op2), "m" (op3)); fpType temp = func(op1, op2, op2); __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); if (flush && flushToZero(temp)) { dest = temp; } } finishVfp(fpscr, state, flush); return dest; } template float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3, float (*func)(float, float, float), bool flush, bool defaultNan, uint32_t rMode) const; template double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3, double (*func)(double, double, double), bool flush, bool defaultNan, uint32_t rMode) const; template fpType FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType (*func)(fpType, fpType), bool flush, bool defaultNan, uint32_t rMode) const { const bool single = (sizeof(fpType) == sizeof(float)); fpType junk = 0.0; if (flush && flushToZero(op1, op2)) fpscr.idc = 1; VfpSavedState state = prepFpState(rMode); __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state) : "m" (op1), "m" (op2), "m" (state)); fpType dest = func(op1, op2); __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); // Get NAN behavior right. This varies between x86 and ARM. if (std::isnan(dest)) { const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000); const bool nan1 = std::isnan(op1); const bool nan2 = std::isnan(op2); const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan); const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan); if ((!nan1 && !nan2) || (defaultNan == 1)) { dest = bitsToFp(qnan, junk); } else if (signal1) { dest = bitsToFp(fpToBits(op1) | qnan, junk); } else if (signal2) { dest = bitsToFp(fpToBits(op2) | qnan, junk); } else if (nan1) { dest = op1; } else if (nan2) { dest = op2; } } else if (flush && flushToZero(dest)) { feraiseexcept(FeUnderflow); } else if (( (single && (dest == bitsToFp(0x00800000, junk) || dest == bitsToFp(0x80800000, junk))) || (!single && (dest == bitsToFp(ULL(0x0010000000000000), junk) || dest == bitsToFp(ULL(0x8010000000000000), junk))) ) && rMode != VfpRoundZero) { /* * Correct for the fact that underflow is detected -before- rounding * in ARM and -after- rounding in x86. */ fesetround(FeRoundZero); __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2) : "m" (op1), "m" (op2)); fpType temp = func(op1, op2); __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); if (flush && flushToZero(temp)) { dest = temp; } } finishVfp(fpscr, state, flush); return dest; } template float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2, float (*func)(float, float), bool flush, bool defaultNan, uint32_t rMode) const; template double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2, double (*func)(double, double), bool flush, bool defaultNan, uint32_t rMode) const; template fpType FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType), bool flush, uint32_t rMode) const { const bool single = (sizeof(fpType) == sizeof(float)); fpType junk = 0.0; if (flush && flushToZero(op1)) fpscr.idc = 1; VfpSavedState state = prepFpState(rMode); __asm__ __volatile__ ("" : "=m" (op1), "=m" (state) : "m" (op1), "m" (state)); fpType dest = func(op1); __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest)); // Get NAN behavior right. This varies between x86 and ARM. if (std::isnan(dest)) { const uint64_t qnan = single ? 
0x7fc00000 : ULL(0x7ff8000000000000); const bool nan = std::isnan(op1); if (!nan || fpscr.dn == 1) { dest = bitsToFp(qnan, junk); } else if (nan) { dest = bitsToFp(fpToBits(op1) | qnan, junk); } } else if (flush && flushToZero(dest)) { feraiseexcept(FeUnderflow); } else if (( (single && (dest == bitsToFp(0x00800000, junk) || dest == bitsToFp(0x80800000, junk))) || (!single && (dest == bitsToFp(ULL(0x0010000000000000), junk) || dest == bitsToFp(ULL(0x8010000000000000), junk))) ) && rMode != VfpRoundZero) { /* * Correct for the fact that underflow is detected -before- rounding * in ARM and -after- rounding in x86. */ fesetround(FeRoundZero); __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1)); fpType temp = func(op1); __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp)); if (flush && flushToZero(temp)) { dest = temp; } } finishVfp(fpscr, state, flush); return dest; } template float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float), bool flush, uint32_t rMode) const; template double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double), bool flush, uint32_t rMode) const; IntRegIndex VfpMacroOp::addStride(IntRegIndex idx, unsigned stride) { if (wide) { stride *= 2; } unsigned offset = idx % 8; idx = (IntRegIndex)(idx - offset); offset += stride; idx = (IntRegIndex)(idx + (offset % 8)); return idx; } void VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2) { unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; assert(!inScalarBank(dest)); dest = addStride(dest, stride); op1 = addStride(op1, stride); if (!inScalarBank(op2)) { op2 = addStride(op2, stride); } } void VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1) { unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; assert(!inScalarBank(dest)); dest = addStride(dest, stride); if (!inScalarBank(op1)) { op1 = addStride(op1, stride); } } void VfpMacroOp::nextIdxs(IntRegIndex &dest) { unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2; assert(!inScalarBank(dest)); dest = addStride(dest, stride); } }
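
// A minimal usage sketch (hypothetical, not part of this file): code built
// on the helpers above follows the pattern that FpOp::binaryOp() wraps up,
// i.e. save and set the host rounding mode, run the operation, fix up the
// result, then fold the host exception flags back into the guest FPSCR:
//
//     FPSCR fpscr = ...;                       // guest status register
//     VfpSavedState state = prepFpState(VfpRoundNearest);
//     float dest = op1 * op2;                  // the actual operation
//     dest = fixDest(fpscr.fz, fpscr.dn, dest, op1, op2);
//     finishVfp(fpscr, state, fpscr.fz);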