x86: implement rcpps and rcpss SSE insts

These are the SSE single-precision approximate reciprocal operations: rcpps is the packed (vector) version and rcpss is the scalar version. This code was basically developed by copying the code for sqrtps and sqrtss. The mrcp micro-op was simplified relative to msqrt since there are no double-precision versions of this operation.
author: Steve Reinhardt <steve.reinhardt@amd.com> 2015-10-06 17:26:50 -0700
committer: Steve Reinhardt <steve.reinhardt@amd.com> 2015-10-06 17:26:50 -0700
commit: a2c875c746a7b9b5dcb94fd93d94ab70286dbbb4 (patch)
tree: 03ac1c0befec0a164e233b655759efac0f3207c0 /src/arch/x86/isa/microops/mediaop.isa
parent: 57b9f53afa5660152a77b7f3b7affb39f5b0e176 (diff)
download: gem5-a2c875c746a7b9b5dcb94fd93d94ab70286dbbb4.tar.xz
1 files changed, 38 insertions, 1 deletions
diff --git a/src/arch/x86/isa/microops/mediaop.isa b/src/arch/x86/isa/microops/mediaop.isa
index e382151ef..e5f04109f 100644
--- a/src/arch/x86/isa/microops/mediaop.isa
+++ b/src/arch/x86/isa/microops/mediaop.isa
@@ -1,4 +1,6 @@
-/// Copyright (c) 2009 The Regents of The University of Michigan
+// Copyright (c) 2009 The Regents of The University of Michigan
+// Copyright (c) 2015 Advanced Micro Devices, Inc.
+//
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -691,6 +693,41 @@ let {{
             FpDestReg_uqw = result;
         '''
 
+    # reciprocal (1/x) of each element, for rcpps/rcpss; single-precision only
+    class Mrcp(MediaOp):
+        def __init__(self, dest, src, \
+                size = None, destSize = None, srcSize = None, ext = None):
+            super(Mrcp, self).__init__(dest, src,\
+                    "InstRegIndex(0)", size, destSize, srcSize, ext)
+        code = '''
+            union floatInt
+            {
+                float f;
+                uint32_t i;
+            };
+
+            assert(srcSize == 4);  // ISA defines single-precision only
+            assert(srcSize == destSize);
+            const int size = 4;
+            const int sizeBits = size * 8;
+            int items = numItems(size);
+            uint64_t result = FpDestReg_uqw;
+
+            for (int i = 0; i < items; i++) {
+                int hiIndex = (i + 1) * sizeBits - 1;
+                int loIndex = (i + 0) * sizeBits;
+                uint64_t argBits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
+
+                floatInt fi;
+                fi.i = argBits;
+                // This is more accuracy than HW provides, but oh well
+                fi.f = 1.0 / fi.f;
+                argBits = fi.i;
+                result = insertBits(result, hiIndex, loIndex, argBits);
+            }
+            FpDestReg_uqw = result;
+        '''
+
     class Maddf(MediaOp):
         code = '''
             union floatInt
author	Steve Reinhardt <steve.reinhardt@amd.com>	2015-10-06 17:26:50 -0700
committer	Steve Reinhardt <steve.reinhardt@amd.com>	2015-10-06 17:26:50 -0700
commit	a2c875c746a7b9b5dcb94fd93d94ab70286dbbb4 (patch)
tree	03ac1c0befec0a164e233b655759efac0f3207c0 /src/arch/x86/isa/microops/mediaop.isa
parent	57b9f53afa5660152a77b7f3b7affb39f5b0e176 (diff)
download	gem5-a2c875c746a7b9b5dcb94fd93d94ab70286dbbb4.tar.xz