summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/gpu-compute/compute_unit.cc141
-rw-r--r--src/gpu-compute/compute_unit.hh25
-rw-r--r--src/gpu-compute/wavefront.cc4
3 files changed, 170 insertions, 0 deletions
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index abf8ff2c5..f05ecc1b2 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1408,6 +1408,114 @@ ComputeUnit::regStats()
{
MemObject::regStats();
+ vALUInsts
+ .name(name() + ".valu_insts")
+ .desc("Number of vector ALU insts issued.")
+ ;
+ vALUInstsPerWF
+ .name(name() + ".valu_insts_per_wf")
+ .desc("The avg. number of vector ALU insts issued per-wavefront.")
+ ;
+ sALUInsts
+ .name(name() + ".salu_insts")
+ .desc("Number of scalar ALU insts issued.")
+ ;
+ sALUInstsPerWF
+ .name(name() + ".salu_insts_per_wf")
+ .desc("The avg. number of scalar ALU insts issued per-wavefront.")
+ ;
+ instCyclesVALU
+ .name(name() + ".inst_cycles_valu")
+ .desc("Number of cycles needed to execute VALU insts.")
+ ;
+ instCyclesSALU
+ .name(name() + ".inst_cycles_salu")
+ .desc("Number of cycles needed to execute SALU insts.")
+ ;
+ threadCyclesVALU
+ .name(name() + ".thread_cycles_valu")
+ .desc("Number of thread cycles used to execute vector ALU ops. "
+ "Similar to instCyclesVALU but multiplied by the number of "
+ "active threads.")
+ ;
+ vALUUtilization
+ .name(name() + ".valu_utilization")
+ .desc("Percentage of active vector ALU threads in a wave.")
+ ;
+ ldsNoFlatInsts
+ .name(name() + ".lds_no_flat_insts")
+ .desc("Number of LDS insts issued, not including FLAT "
+ "accesses that resolve to LDS.")
+ ;
+ ldsNoFlatInstsPerWF
+ .name(name() + ".lds_no_flat_insts_per_wf")
+ .desc("The avg. number of LDS insts (not including FLAT "
+ "accesses that resolve to LDS) per-wavefront.")
+ ;
+ flatVMemInsts
+ .name(name() + ".flat_vmem_insts")
+ .desc("The number of FLAT insts that resolve to vmem issued.")
+ ;
+ flatVMemInstsPerWF
+ .name(name() + ".flat_vmem_insts_per_wf")
+ .desc("The average number of FLAT insts that resolve to vmem "
+ "issued per-wavefront.")
+ ;
+ flatLDSInsts
+ .name(name() + ".flat_lds_insts")
+ .desc("The number of FLAT insts that resolve to LDS issued.")
+ ;
+ flatLDSInstsPerWF
+ .name(name() + ".flat_lds_insts_per_wf")
+ .desc("The average number of FLAT insts that resolve to LDS "
+ "issued per-wavefront.")
+ ;
+ vectorMemWrites
+ .name(name() + ".vector_mem_writes")
+ .desc("Number of vector mem write insts (excluding FLAT insts).")
+ ;
+ vectorMemWritesPerWF
+ .name(name() + ".vector_mem_writes_per_wf")
+ .desc("The average number of vector mem write insts "
+ "(excluding FLAT insts) per-wavefront.")
+ ;
+ vectorMemReads
+ .name(name() + ".vector_mem_reads")
+ .desc("Number of vector mem read insts (excluding FLAT insts).")
+ ;
+ vectorMemReadsPerWF
+ .name(name() + ".vector_mem_reads_per_wf")
+ .desc("The avg. number of vector mem read insts (excluding "
+ "FLAT insts) per-wavefront.")
+ ;
+ scalarMemWrites
+ .name(name() + ".scalar_mem_writes")
+ .desc("Number of scalar mem write insts.")
+ ;
+ scalarMemWritesPerWF
+ .name(name() + ".scalar_mem_writes_per_wf")
+ .desc("The average number of scalar mem write insts per-wavefront.")
+ ;
+ scalarMemReads
+ .name(name() + ".scalar_mem_reads")
+ .desc("Number of scalar mem read insts.")
+ ;
+ scalarMemReadsPerWF
+ .name(name() + ".scalar_mem_reads_per_wf")
+ .desc("The average number of scalar mem read insts per-wavefront.")
+ ;
+
+ vALUInstsPerWF = vALUInsts / completedWfs;
+ sALUInstsPerWF = sALUInsts / completedWfs;
+ vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
+ ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
+ flatVMemInstsPerWF = flatVMemInsts / completedWfs;
+ flatLDSInstsPerWF = flatLDSInsts / completedWfs;
+ vectorMemWritesPerWF = vectorMemWrites / completedWfs;
+ vectorMemReadsPerWF = vectorMemReads / completedWfs;
+ scalarMemWritesPerWF = scalarMemWrites / completedWfs;
+ scalarMemReadsPerWF = scalarMemReads / completedWfs;
+
tlbCycles
.name(name() + ".tlb_cycles")
.desc("total number of cycles for all uncoalesced requests")
@@ -1567,6 +1675,39 @@ ComputeUnit::regStats()
}
void
+ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) // classify one inst and bump the matching CU counter(s)
+{
+ if (gpuDynInst->isScalar()) { // scalar insts: ALU, load or store
+ if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) { // waitcnt is not counted as scalar ALU
+ sALUInsts++;
+ instCyclesSALU++; // bumped once per counted scalar ALU inst
+ } else if (gpuDynInst->isLoad()) {
+ scalarMemReads++;
+ } else if (gpuDynInst->isStore()) {
+ scalarMemWrites++;
+ } // a scalar inst matching none of the above is not counted
+ } else { // non-scalar insts
+ if (gpuDynInst->isALU()) {
+ vALUInsts++;
+ instCyclesVALU++; // bumped once per vector ALU inst
+ threadCyclesVALU += gpuDynInst->wavefront()->execMask().count(); // presumably the number of active lanes - confirm
+ } else if (gpuDynInst->isFlat()) { // FLAT is tested first so FLAT accesses never reach ldsNoFlatInsts
+ if (gpuDynInst->isLocalMem()) {
+ flatLDSInsts++;
+ } else {
+ flatVMemInsts++;
+ }
+ } else if (gpuDynInst->isLocalMem()) { // non-FLAT local-memory inst
+ ldsNoFlatInsts++;
+ } else if (gpuDynInst->isLoad()) { // only reached for non-ALU, non-FLAT, non-local-memory insts
+ vectorMemReads++;
+ } else if (gpuDynInst->isStore()) {
+ vectorMemWrites++;
+ } // anything else is not counted
+ }
+}
+
+void
ComputeUnit::updatePageDivergenceDist(Addr addr)
{
Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 938658fd1..2187bec38 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -301,6 +301,31 @@ class ComputeUnit : public MemObject
LdsState &lds;
public:
+ Stats::Scalar vALUInsts;
+ Stats::Formula vALUInstsPerWF;
+ Stats::Scalar sALUInsts;
+ Stats::Formula sALUInstsPerWF;
+ Stats::Scalar instCyclesVALU;
+ Stats::Scalar instCyclesSALU;
+ Stats::Scalar threadCyclesVALU;
+ Stats::Formula vALUUtilization;
+ Stats::Scalar ldsNoFlatInsts;
+ Stats::Formula ldsNoFlatInstsPerWF;
+ Stats::Scalar flatVMemInsts;
+ Stats::Formula flatVMemInstsPerWF;
+ Stats::Scalar flatLDSInsts;
+ Stats::Formula flatLDSInstsPerWF;
+ Stats::Scalar vectorMemWrites;
+ Stats::Formula vectorMemWritesPerWF;
+ Stats::Scalar vectorMemReads;
+ Stats::Formula vectorMemReadsPerWF;
+ Stats::Scalar scalarMemWrites;
+ Stats::Formula scalarMemWritesPerWF;
+ Stats::Scalar scalarMemReads;
+ Stats::Formula scalarMemReadsPerWF;
+
+ void updateInstStats(GPUDynInstPtr gpuDynInst);
+
// the following stats compute the avg. TLB accesslatency per
// uncoalesced request (only for data)
Stats::Scalar tlbRequests;
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 96f0d0e96..99ac24900 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -656,7 +656,11 @@ Wavefront::exec()
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
"(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
ii->disassemble(), old_pc);
+
+ // execute the instruction, then update the instruction stats in the CU
+
ii->execute(ii);
+ computeUnit->updateInstStats(ii);
// access the VRF
computeUnit->vrf[simdId]->exec(ii, this);
srcRegOpDist.sample(ii->numSrcRegOperands());