gpu: fix bugs with MemFence, Flat Instrs and Resource utilization

The "both" memory fence (OT_BOTH_MEMFENCE) is now flagged as a Global Memory operation only, to avoid oversubscribing resources. Flat instructions now also check whether the Shared Memory resources are busy, again to avoid oversubscribing resources. All WaitClass resources now use cycles (not ticks) to register the number of pipe stages between Scoreboard and Execute, consistent with the instruction scheduling logic, which has always used clock cycles.
author: John Kalamatianos <john.kalamatianos@amd.com> 2016-02-18 10:42:03 -0500
committer: John Kalamatianos <john.kalamatianos@amd.com> 2016-02-18 10:42:03 -0500
commit: a28a234069a0c38bb75ba051ef82e7b4400f133a (patch)
tree: fc4db3c35cf44a798357e444f757a7661be2ae9d /src
parent: 9a0f1be21fe5fc36d379387134498a76aaf88031 (diff)
download: gem5-a28a234069a0c38bb75ba051ef82e7b4400f133a.tar.xz
3 files changed, 9 insertions, 11 deletions
diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh
index 126cf6c50..6cd9bfe26 100644
--- a/src/gpu-compute/code_enums.hh
+++ b/src/gpu-compute/code_enums.hh
@@ -84,6 +84,7 @@
                     ||(a)==Enums::OT_PRIVATE_ATOMIC \
                     ||(a)==Enums::OT_SPILL_ATOMIC \
                     ||(a)==Enums::OT_READONLY_ATOMIC \
+                    ||(a)==Enums::OT_BOTH_MEMFENCE \
                     ||(a)==Enums::OT_FLAT_ATOMIC)
 
 #define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
@@ -93,8 +94,7 @@
                     ||(a)==Enums::OT_BOTH_MEMFENCE)
 
 #define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \
-                    ||(a)==Enums::OT_SHARED_MEMFENCE \
-                    ||(a)==Enums::OT_BOTH_MEMFENCE)
+                    ||(a)==Enums::OT_SHARED_MEMFENCE)
 
 #define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC)
 
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index d3622007a..63f3e8fb5 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -587,8 +587,8 @@ void
 ComputeUnit::init()
 {
     // Initialize CU Bus models
-    glbMemToVrfBus.init(&shader->tick_cnt, 1);
-    locMemToVrfBus.init(&shader->tick_cnt, 1);
+    glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
+    locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
     nextGlbMemBus = 0;
     nextLocMemBus = 0;
     fatal_if(numGlbMemUnits > 1,
@@ -596,7 +596,7 @@ ComputeUnit::init()
     vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
     for (int j = 0; j < numGlbMemUnits; ++j) {
         vrfToGlobalMemPipeBus[j] = WaitClass();
-        vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, 1);
+        vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
     }
 
     fatal_if(numLocMemUnits > 1,
@@ -604,7 +604,7 @@ ComputeUnit::init()
     vrfToLocalMemPipeBus.resize(numLocMemUnits);
     for (int j = 0; j < numLocMemUnits; ++j) {
         vrfToLocalMemPipeBus[j] = WaitClass();
-        vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, 1);
+        vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
     }
     vectorRegsReserved.resize(numSIMDs, 0);
     aluPipe.resize(numSIMDs);
@@ -612,12 +612,12 @@ ComputeUnit::init()
 
     for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
         wfWait[i] = WaitClass();
-        wfWait[i].init(&shader->tick_cnt, 1);
+        wfWait[i].init(&shader->tick_cnt, shader->ticks(1));
     }
 
     for (int i = 0; i < numSIMDs; ++i) {
         aluPipe[i] = WaitClass();
-        aluPipe[i].init(&shader->tick_cnt, 1);
+        aluPipe[i].init(&shader->tick_cnt, shader->ticks(1));
     }
 
     // Setup space for call args
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 0aa033db1..ed13b22c7 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -162,7 +162,6 @@ Wavefront::isGmInstruction(GPUDynInstPtr ii)
 
     if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
         IS_OT_ATOMIC_GM(ii->opType())) {
-
         return true;
     }
 
@@ -349,7 +348,7 @@ Wavefront::ready(itype_e type)
     }
     bool locMemBusRdy = false;
     bool locMemIssueRdy = false;
-    if (type == I_SHARED) {
+    if (type == I_SHARED || type == I_FLAT) {
         for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
             if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                 locMemBusRdy = true;
@@ -598,7 +597,6 @@ Wavefront::ready(itype_e type)
 
     DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
             simdId, wfSlotId, ii->disassemble());
-
     return 1;
 }
author	John Kalamatianos <john.kalamatianos@amd.com>	2016-02-18 10:42:03 -0500
committer	John Kalamatianos <john.kalamatianos@amd.com>	2016-02-18 10:42:03 -0500
commit	a28a234069a0c38bb75ba051ef82e7b4400f133a (patch)
tree	fc4db3c35cf44a798357e444f757a7661be2ae9d /src
parent	9a0f1be21fe5fc36d379387134498a76aaf88031 (diff)
download	gem5-a28a234069a0c38bb75ba051ef82e7b4400f133a.tar.xz