ARM: Implement conversion to/from half precision.

author: Gabe Black <gblack@eecs.umich.edu> 2010-06-02 12:58:16 -0500
committer: Gabe Black <gblack@eecs.umich.edu> 2010-06-02 12:58:16 -0500
commit: 237c0617a0c095e35169c3f4e48e93eaf4ada527 (patch)
tree: b4b3c805611a04dc8bcdc923e0133c071374a4b2 /src
parent: 04e196f4223b5dfd61782edaaac27166a2bfcf3c (diff)
download: gem5-237c0617a0c095e35169c3f4e48e93eaf4ada527.tar.xz
3 files changed, 303 insertions, 2 deletions
diff --git a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh
index 259bf9c11..37553a5dc 100644
--- a/src/arch/arm/insts/vfp.hh
+++ b/src/arch/arm/insts/vfp.hh
@@ -396,6 +396,223 @@ fixFpSFpDDest(FPSCR fpscr, float val)
     return mid;
 }
 
+static inline float
+vcvtFpSFpH(FPSCR &fpscr, float op, float dest, bool top)
+{
+    float junk = 0.0;
+    uint32_t destBits = fpToBits(dest);
+    uint32_t opBits = fpToBits(op);
+    // Extract the operand.
+    bool neg = bits(opBits, 31);
+    uint32_t exponent = bits(opBits, 30, 23);
+    uint32_t oldMantissa = bits(opBits, 22, 0);
+    uint32_t mantissa = oldMantissa >> (23 - 10);
+    // Do the conversion.
+    uint32_t extra = oldMantissa & mask(23 - 10);
+    if (exponent == 0xff) {
+        if (oldMantissa != 0) {
+            // Nans.
+            if (bits(mantissa, 9) == 0) {
+                // Signalling nan.
+                fpscr.ioc = 1;
+            }
+            if (fpscr.ahp) {
+                mantissa = 0;
+                exponent = 0;
+                fpscr.ioc = 1;
+            } else if (fpscr.dn) {
+                mantissa = (1 << 9);
+                exponent = 0x1f;
+                neg = false;
+            } else {
+                exponent = 0x1f;
+                mantissa |= (1 << 9);
+            }
+        } else {
+            // Infinities.
+            exponent = 0x1F;
+            if (fpscr.ahp) {
+                fpscr.ioc = 1;
+                mantissa = 0x3ff;
+            } else {
+                mantissa = 0;
+            }
+        }
+    } else if (exponent == 0 && oldMantissa == 0) {
+        // Zero, don't need to do anything.
+    } else {
+        // Normalized or denormalized numbers.
+
+        bool inexact = (extra != 0);
+
+        if (exponent == 0) {
+            // Denormalized.
+
+            // If flush to zero is on, this shouldn't happen.
+            assert(fpscr.fz == 0);
+
+            // Check for underflow
+            if (inexact || fpscr.ufe)
+                fpscr.ufc = 1;
+
+            // Handle rounding.
+            unsigned mode = fpscr.rMode;
+            if ((mode == VfpRoundUpward && !neg && extra) ||
+                (mode == VfpRoundDown && neg && extra) ||
+                (mode == VfpRoundNearest &&
+                 (extra > (1 << 9) ||
+                  (extra == (1 << 9) && bits(mantissa, 0))))) {
+                mantissa++;
+            }
+
+            // See if the number became normalized after rounding.
+            if (mantissa == (1 << 10)) {
+                mantissa = 0;
+                exponent = 1;
+            }
+        } else {
+            // Normalized.
+
+            // We need to track the dropped bits differently since
+            // more can be dropped by denormalizing.
+            bool topOne = bits(extra, 12);
+            bool restZeros = bits(extra, 11, 0) == 0;
+
+            if (exponent <= (127 - 15)) {
+                // The result is too small. Denormalize.
+                mantissa |= (1 << 10);
+                while (mantissa && exponent <= (127 - 15)) {
+                    restZeros = restZeros && !topOne;
+                    topOne = bits(mantissa, 0);
+                    mantissa = mantissa >> 1;
+                    exponent++;
+                }
+                if (topOne || !restZeros)
+                    inexact = true;
+                exponent = 0;
+            } else {
+                // Change bias.
+                exponent -= (127 - 15);
+            }
+
+            if (exponent == 0 && (inexact || fpscr.ufe)) {
+                // Underflow
+                fpscr.ufc = 1;
+            }
+
+            // Handle rounding.
+            unsigned mode = fpscr.rMode;
+            bool nonZero = topOne || !restZeros;
+            if ((mode == VfpRoundUpward && !neg && nonZero) ||
+                (mode == VfpRoundDown && neg && nonZero) ||
+                (mode == VfpRoundNearest && topOne &&
+                 (!restZeros || bits(mantissa, 0)))) {
+                mantissa++;
+            }
+
+            // See if we rounded up and need to bump the exponent.
+            if (mantissa == (1 << 10)) {
+                mantissa = 0;
+                exponent++;
+            }
+
+            // Deal with overflow
+            if (fpscr.ahp) {
+                if (exponent >= 0x20) {
+                    exponent = 0x1f;
+                    mantissa = 0x3ff;
+                    fpscr.ioc = 1;
+                    // Supress inexact exception.
+                    inexact = false;
+                }
+            } else {
+                if (exponent >= 0x1f) {
+                    if ((mode == VfpRoundNearest) ||
+                        (mode == VfpRoundUpward && !neg) ||
+                        (mode == VfpRoundDown && neg)) {
+                        // Overflow to infinity.
+                        exponent = 0x1f;
+                        mantissa = 0;
+                    } else {
+                        // Overflow to max normal.
+                        exponent = 0x1e;
+                        mantissa = 0x3ff;
+                    }
+                    fpscr.ofc = 1;
+                    inexact = true;
+                }
+            }
+        }
+
+        if (inexact) {
+            fpscr.ixc = 1;
+        }
+    }
+    // Reassemble and install the result.
+    uint32_t result = bits(mantissa, 9, 0);
+    replaceBits(result, 14, 10, exponent);
+    if (neg)
+        result |= (1 << 15);
+    if (top)
+        replaceBits(destBits, 31, 16, result);
+    else
+        replaceBits(destBits, 15, 0, result);
+    return bitsToFp(destBits, junk);
+}
+
+static inline float
+vcvtFpHFpS(FPSCR &fpscr, float op, bool top)
+{
+    float junk = 0.0;
+    uint32_t opBits = fpToBits(op);
+    // Extract the operand.
+    if (top)
+        opBits = bits(opBits, 31, 16);
+    else
+        opBits = bits(opBits, 15, 0);
+    // Extract the bitfields.
+    bool neg = bits(opBits, 15);
+    uint32_t exponent = bits(opBits, 14, 10);
+    uint32_t mantissa = bits(opBits, 9, 0);
+    // Do the conversion.
+    if (exponent == 0) {
+        if (mantissa != 0) {
+            // Normalize the value.
+            exponent = exponent + (127 - 15) + 1;
+            while (mantissa < (1 << 10)) {
+                mantissa = mantissa << 1;
+                exponent--;
+            }
+        }
+        mantissa = mantissa << (23 - 10);
+    } else if (exponent == 0x1f && !fpscr.ahp) {
+        // Infinities and nans.
+        exponent = 0xff;
+        if (mantissa != 0) {
+            // Nans.
+            mantissa = mantissa << (23 - 10);
+            if (bits(mantissa, 22) == 0) {
+                // Signalling nan.
+                fpscr.ioc = 1;
+                mantissa |= (1 << 22);
+            }
+            if (fpscr.dn) {
+                mantissa &= ~mask(22);
+                neg = false;
+            }
+        }
+    } else {
+        exponent = exponent + (127 - 15);
+        mantissa = mantissa << (23 - 10);
+    }
+    // Reassemble the result.
+    uint32_t result = bits(mantissa, 22, 0);
+    replaceBits(result, 30, 23, exponent);
+    if (neg)
+        result |= (1 << 31);
+    return bitsToFp(result, junk);
+}
+
 static inline double
 makeDouble(uint32_t low, uint32_t high)
 {
diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa
index d509fc28a..03e574648 100644
--- a/src/arch/arm/isa/formats/fp.isa
+++ b/src/arch/arm/isa/formats/fp.isa
@@ -655,8 +655,23 @@ let {{
                 }
               case 0x2:
               case 0x3:
-                // Between half and single precision.
-                return new WarnUnimplemented("vcvtb, vcvtt", machInst);
+                {
+                    const bool toHalf = bits(machInst, 16);
+                    const bool top = bits(machInst, 7);
+                    if (top) {
+                        if (toHalf) {
+                            return new VcvtFpSFpHT(machInst, vd, vm);
+                        } else {
+                            return new VcvtFpHTFpS(machInst, vd, vm);
+                        }
+                    } else {
+                        if (toHalf) {
+                            return new VcvtFpSFpHB(machInst, vd, vm);
+                        } else {
+                            return new VcvtFpHBFpS(machInst, vd, vm);
+                        }
+                    }
+                }
               case 0x4:
                 if (single) {
                     if (e) {
diff --git a/src/arch/arm/isa/insts/fp.isa b/src/arch/arm/isa/insts/fp.isa
index bee63d671..c4682b66c 100644
--- a/src/arch/arm/isa/insts/fp.isa
+++ b/src/arch/arm/isa/insts/fp.isa
@@ -912,6 +912,75 @@ let {{
     decoder_output += FpRegRegOpConstructor.subst(vcvtFpDFpSIop);
     exec_output += PredOpExecute.subst(vcvtFpDFpSIop);
 
+    vcvtFpHTFpSCode = '''
+        FPSCR fpscr = Fpscr;
+        vfpFlushToZero(fpscr, FpOp1);
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
+        FpDest = vcvtFpHFpS(fpscr, FpOp1, true);
+        __asm__ __volatile__("" :: "m" (FpDest));
+        finishVfp(fpscr, state);
+        Fpscr = fpscr;
+    '''
+    vcvtFpHTFpSIop = InstObjParams("vcvtt", "VcvtFpHTFpS", "FpRegRegOp",
+                                   { "code": vcvtFpHTFpSCode,
+                                     "predicate_test": predicateTest }, [])
+    header_output += FpRegRegOpDeclare.subst(vcvtFpHTFpSIop);
+    decoder_output += FpRegRegOpConstructor.subst(vcvtFpHTFpSIop);
+    exec_output += PredOpExecute.subst(vcvtFpHTFpSIop);
+
+    vcvtFpHBFpSCode = '''
+        FPSCR fpscr = Fpscr;
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
+        FpDest = vcvtFpHFpS(fpscr, FpOp1, false);
+        __asm__ __volatile__("" :: "m" (FpDest));
+        finishVfp(fpscr, state);
+        Fpscr = fpscr;
+    '''
+    vcvtFpHBFpSIop = InstObjParams("vcvtb", "VcvtFpHBFpS", "FpRegRegOp",
+                                   { "code": vcvtFpHBFpSCode,
+                                     "predicate_test": predicateTest }, [])
+    header_output += FpRegRegOpDeclare.subst(vcvtFpHBFpSIop);
+    decoder_output += FpRegRegOpConstructor.subst(vcvtFpHBFpSIop);
+    exec_output += PredOpExecute.subst(vcvtFpHBFpSIop);
+
+    vcvtFpSFpHTCode = '''
+        FPSCR fpscr = Fpscr;
+        vfpFlushToZero(fpscr, FpOp1);
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        __asm__ __volatile__("" : "=m" (FpOp1), "=m" (FpDest)
+                                : "m" (FpOp1), "m" (FpDest));
+        FpDest = vcvtFpSFpH(fpscr, FpOp1, FpDest, true);
+        __asm__ __volatile__("" :: "m" (FpDest));
+        finishVfp(fpscr, state);
+        Fpscr = fpscr;
+    '''
+    vcvtFpSFpHTIop = InstObjParams("vcvtt", "VcvtFpSFpHT", "FpRegRegOp",
+                                    { "code": vcvtFpHTFpSCode,
+                                      "predicate_test": predicateTest }, [])
+    header_output += FpRegRegOpDeclare.subst(vcvtFpSFpHTIop);
+    decoder_output += FpRegRegOpConstructor.subst(vcvtFpSFpHTIop);
+    exec_output += PredOpExecute.subst(vcvtFpSFpHTIop);
+
+    vcvtFpSFpHBCode = '''
+        FPSCR fpscr = Fpscr;
+        vfpFlushToZero(fpscr, FpOp1);
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        __asm__ __volatile__("" : "=m" (FpOp1), "=m" (FpDest)
+                                : "m" (FpOp1), "m" (FpDest));
+        FpDest = vcvtFpSFpH(fpscr, FpOp1, FpDest, false);
+        __asm__ __volatile__("" :: "m" (FpDest));
+        finishVfp(fpscr, state);
+        Fpscr = fpscr;
+    '''
+    vcvtFpSFpHBIop = InstObjParams("vcvtb", "VcvtFpSFpHB", "FpRegRegOp",
+                                   { "code": vcvtFpSFpHBCode,
+                                     "predicate_test": predicateTest }, [])
+    header_output += FpRegRegOpDeclare.subst(vcvtFpSFpHBIop);
+    decoder_output += FpRegRegOpConstructor.subst(vcvtFpSFpHBIop);
+    exec_output += PredOpExecute.subst(vcvtFpSFpHBIop);
+
     vcmpSCode = '''
         FPSCR fpscr = Fpscr;
         vfpFlushToZero(fpscr, FpDest, FpOp1);
author	Gabe Black <gblack@eecs.umich.edu>	2010-06-02 12:58:16 -0500
committer	Gabe Black <gblack@eecs.umich.edu>	2010-06-02 12:58:16 -0500
commit	237c0617a0c095e35169c3f4e48e93eaf4ada527 (patch)
tree	b4b3c805611a04dc8bcdc923e0133c071374a4b2 /src
parent	04e196f4223b5dfd61782edaaac27166a2bfcf3c (diff)
download	gem5-237c0617a0c095e35169c3f4e48e93eaf4ada527.tar.xz