| | | |
|---|---|---|
| author | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
| committer | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
| commit | 1a7d3f9fcb76a68540dd948f91413533a383bfde (patch) | |
| tree | 867510a147cd095f19499d26b7c02d27de4cae9d /src/gpu-compute/compute_unit.hh | |
| parent | 28e353e0403ea379d244a418e8dc8ee0b48187cf (diff) | |
| download | gem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz | |
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src/gpu-compute/compute_unit.hh')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | src/gpu-compute/compute_unit.hh | 767 |

1 file changed, 767 insertions, 0 deletions
```diff
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
new file mode 100644
index 000000000..f47c27a0a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.hh
@@ -0,0 +1,767 @@
```

```cpp
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos, Anthony Gutierrez
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

#include <deque>
#include <list>
#include <map>
#include <queue>
#include <unordered_map>
#include <vector>

#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"

static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static const int MAX_WIDTH_FOR_MEM_INST = 32;

class NDRange;
class Shader;
class VectorRegisterFile;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

// List of execution units
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1,
    SIMD2,
    SIMD3,
    GLBMEM_PIPE,
    LDSMEM_PIPE,
    NUM_UNITS
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};
```
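The `EXEC_UNIT` values double as indices into the per-resource structures declared below (`readyList`, `dispatchList`, `waveStatusList`), which is what the TODO about an indexing enum alludes to. A minimal standalone sketch of that indexing, with a simplified stand-in for `Wavefront` (the real class lives elsewhere in gpu-compute):

```cpp
#include <cassert>
#include <vector>

// Stand-in for the simulator's Wavefront class (assumption: simplified).
struct Wavefront { int simdId; int wfSlotId; };

enum EXEC_UNIT {
    SIMD0 = 0, SIMD1, SIMD2, SIMD3,
    GLBMEM_PIPE, LDSMEM_PIPE, NUM_UNITS
};

int main()
{
    // One ready list per execution resource, exactly as readyList is sized
    // in ComputeUnit: NUM_UNITS inner vectors.
    std::vector<std::vector<Wavefront*>> readyList(NUM_UNITS);

    Wavefront wf{0, 3};
    readyList[SIMD0].push_back(&wf);   // a wave ready for vector ALU 0

    assert(readyList[SIMD0].size() == 1);
    assert(readyList[LDSMEM_PIPE].empty());  // LDS pipe has nothing ready
}
```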
```cpp
class ComputeUnit : public MemObject
{
  public:
    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;

    // Buffers used to communicate between various pipeline stages

    // List of waves which are ready to be scheduled.
    // Each execution resource has a ready list. readyList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: make enum to index readyList
    std::vector<std::vector<Wavefront*>> readyList;

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList. waveStatusList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;

    // List of waves which will be dispatched to
    // each execution resource. A FILLED implies
    // dispatch list is non-empty and
    // execution unit has something to execute
    // this cycle. Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave in a cycle.
    // dispatchList is used to communicate between schedule
    // and exec stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;

    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
    int rrNextALUWp;
    typedef ComputeUnitParams Params;
    std::vector<std::vector<Wavefront*>> wfList;
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // Number of vector ALU units (SIMDs) in CU
    int numSIMDs;
    // number of pipe stages for bypassing data to next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of cycles per issue period
    int issuePeriod;

    // Number of global and local memory execution resources in CU
    int numGlbMemUnits;
    int numLocMemUnits;
    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off.
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride.
    int prefetchStride;

    class LastVaddrWave
    {
      public:
        Addr vaddrs[VSZ];
        Addr& operator[](int idx) {
            return vaddrs[idx];
        }

        LastVaddrWave() {
            for (int i = 0; i < VSZ; ++i)
                vaddrs[i] = 0;
        }
    };

    LastVaddrWave lastVaddrCU;
    std::vector<LastVaddrWave> lastVaddrPhase;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    Enums::PrefetchType prefetchType;
    EXEC_POLICY exec_policy;

    bool xact_cas_mode;
    bool debugSegFault;
    bool functionalTLB;
    bool localMemBarrier;
```
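The `prefetchDepth`/`prefetchStride` knobs and the `lastVaddr*` bookkeeping suggest a fixed-stride TLB prefetcher: compare a lane's current virtual address against the recorded one and walk some number of pages ahead. The actual policy lives in compute_unit.cc and is selected by `Enums::PrefetchType`; the following only sketches the fixed-stride idea, with all helper names hypothetical:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

using Addr = std::uint64_t;
const Addr pageBytes = 4096;   // assumption: 4 KiB pages

// Hypothetical helper: given the previous and current vaddr of one lane,
// list the pages a fixed-stride prefetcher of the given depth would touch.
std::vector<Addr>
stridePrefetchPages(Addr lastVaddr, Addr vaddr, int prefetchDepth)
{
    std::vector<Addr> pages;
    std::int64_t stride = static_cast<std::int64_t>(vaddr) -
                          static_cast<std::int64_t>(lastVaddr);
    if (prefetchDepth == 0 || stride == 0)   // depth 0 turns prefetch off
        return pages;
    for (int d = 1; d <= prefetchDepth; ++d) {
        Addr next = vaddr + d * stride;
        pages.push_back(next & ~(pageBytes - 1));  // page-align
    }
    return pages;
}

int main()
{
    // Lane moved from 0x1000 to 0x2000: one-page stride, depth 2.
    for (Addr p : stridePrefetchPages(0x1000, 0x2000, 2))
        std::cout << std::hex << p << '\n';   // prints 3000 then 4000
}
```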
```cpp
    /*
     * for Counting page accesses
     *
     * cuExitCallback inherits from Callback. When you register a callback
     * function as an exit callback, it will get added to an exit callback
     * queue, such that on simulation exit, all callbacks in the callback
     * queue will have their process() function called.
     */
    bool countPages;

    Shader *shader;
    uint32_t barrier_id;
    // vector of Vector ALU (MACC) pipelines
    std::vector<WaitClass> aluPipe;
    // minimum issue period per SIMD unit (in cycles)
    std::vector<WaitClass> wfWait;

    // Resource control for Vector Register File->Global Memory pipe buses
    std::vector<WaitClass> vrfToGlobalMemPipeBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    std::vector<WaitClass> vrfToLocalMemPipeBus;
    int nextGlbMemBus;
    int nextLocMemBus;
    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;

    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
    uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load

    Tick req_tick_latency;
    Tick resp_tick_latency;

    // number of vector registers being reserved for each SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of vector registers per SIMD unit
    uint32_t numVecRegsPerSimd;
    // Support for scheduling VGPR status update events
    std::vector<std::pair<uint32_t, uint32_t>> regIdxVec;
    std::vector<uint64_t> timestampVec;
    std::vector<uint8_t> statusVec;

    void
    registerEvent(uint32_t simdId,
                  uint32_t regIdx,
                  uint32_t operandSize,
                  uint64_t when,
                  uint8_t newStatus) {
        regIdxVec.push_back(std::make_pair(simdId, regIdx));
        timestampVec.push_back(when);
        statusVec.push_back(newStatus);
        if (operandSize > 4) {
            regIdxVec.push_back(std::make_pair(simdId,
                                               ((regIdx + 1) %
                                                numVecRegsPerSimd)));
            timestampVec.push_back(when);
            statusVec.push_back(newStatus);
        }
    }

    void updateEvents();
```
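`registerEvent` books a (register, timestamp, status) triple per scheduled VGPR status update; for operands wider than 4 bytes it books the adjacent register as well, wrapping modulo `numVecRegsPerSimd`, since a 64-bit result occupies two 32-bit VGPRs. A standalone replica of that bookkeeping (the wrapper struct and the 256-register file size are assumptions for illustration):

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Standalone replica of ComputeUnit::registerEvent's bookkeeping, with the
// relevant members pulled into a small struct.
struct VgprEventQueue {
    uint32_t numVecRegsPerSimd = 256;   // assumption: typical VGPR file size
    std::vector<std::pair<uint32_t, uint32_t>> regIdxVec;
    std::vector<uint64_t> timestampVec;
    std::vector<uint8_t> statusVec;

    void registerEvent(uint32_t simdId, uint32_t regIdx, uint32_t operandSize,
                       uint64_t when, uint8_t newStatus) {
        regIdxVec.emplace_back(simdId, regIdx);
        timestampVec.push_back(when);
        statusVec.push_back(newStatus);
        if (operandSize > 4) {  // 64-bit operand: book the next VGPR too
            regIdxVec.emplace_back(simdId, (regIdx + 1) % numVecRegsPerSimd);
            timestampVec.push_back(when);
            statusVec.push_back(newStatus);
        }
    }
};

int main()
{
    VgprEventQueue q;
    q.registerEvent(0, 10, 4, 100, 1);   // single precision: one entry
    q.registerEvent(0, 255, 8, 200, 1);  // double precision: two entries,
                                         // the second wraps to VGPR 0
    assert(q.regIdxVec.size() == 3);
    assert(q.regIdxVec[2].second == 0);
}
```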
```cpp
    // this hash map will keep track of page divergence
    // per memory instruction per wavefront. The hash map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

    ComputeUnit(const Params *p);
    ~ComputeUnit();
    int spBypassLength() { return spBypassPipeLength; };
    int dpBypassLength() { return dpBypassPipeLength; };
    int storeBusLength() { return numCyclesPerStoreTransfer; };
    int loadBusLength() { return numCyclesPerLoadTransfer; };
    int wfSize() const { return wavefrontSize; };

    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void FillKernelState(Wavefront *w, NDRange *ndr);

    void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
                 int trueWgSizeTotal);

    void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
                             int trueWgSize[], int trueWgSizeTotal,
                             LdsChunk *ldsChunk, uint64_t origSpillMemStart);

    void StartWorkgroup(NDRange *ndr);
    int ReadyWorkgroup(NDRange *ndr);

    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
    int GlbMemUnitId() { return GLBMEM_PIPE; }
    int ShrMemUnitId() { return LDSMEM_PIPE; }
    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
    /* This function cycles through all the wavefronts in all the phases to
     * see if all of the wavefronts which should be associated with one
     * barrier (denoted by _barrier_id) are all at the same barrier in the
     * program (denoted by bcnt). When the number at the barrier matches
     * bslots, then return true.
     */
    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
    bool cedeSIMD(int simdId, int wfSlotId);

    template<typename c0, typename c1>
    void doSmReturn(GPUDynInstPtr gpuDynInst);

    virtual void init();
    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelLaunch=true,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    MasterID masterId() { return _masterId; }

    bool isDone() const;
    bool isSimdDone(uint32_t) const;

  protected:
    MasterID _masterId;

    LdsState &lds;

  public:
    // the following stats compute the avg. TLB access latency per
    // uncoalesced request (only for data)
    Stats::Scalar tlbRequests;
    Stats::Scalar tlbCycles;
    Stats::Formula tlbLatency;
    // hitsPerTLBLevel[x] are the hits in Level x TLB.
    // x = 0 is the page table.
    Stats::Vector hitsPerTLBLevel;

    Stats::Scalar ldsBankAccesses;
    Stats::Distribution ldsBankConflictDist;

    // over all memory instructions executed over all wavefronts
    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
    Stats::Distribution pageDivergenceDist;
    Stats::Scalar dynamicGMemInstrCnt;
    Stats::Scalar dynamicLMemInstrCnt;

    Stats::Scalar wgBlockedDueLdsAllocation;
```
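`pagesTouched` counts accesses per page for a single memory instruction, so the instruction's page divergence is simply the number of distinct keys — presumably the quantity that `updatePageDivergenceDist` samples into `pageDivergenceDist`. A hypothetical walk-through with four lane addresses:

```cpp
#include <cstdint>
#include <iostream>
#include <map>

using Addr = std::uint64_t;

int main()
{
    const Addr pageBytes = 4096;        // assumption: 4 KiB pages
    std::map<Addr, int> pagesTouched;   // same shape as the member above

    // Hypothetical per-lane addresses of one vector memory instruction:
    // lanes 0-2 fall on one page, lane 3 on a second page.
    Addr laneAddrs[] = {0x10000, 0x10040, 0x10080, 0x23000};
    for (Addr a : laneAddrs)
        ++pagesTouched[a & ~(pageBytes - 1)];

    // The instruction's page divergence is the number of distinct pages;
    // this is what would be sampled into pageDivergenceDist.
    std::cout << "pages touched: " << pagesTouched.size() << '\n';  // 2
}
```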
```cpp
    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
    // active when the instruction is committed, this number is still
    // incremented by 1
    Stats::Scalar numInstrExecuted;
    // Number of cycles among successive instruction executions across all
    // wavefronts of the same CU
    Stats::Distribution execRateDist;
    // number of individual vector operations executed
    Stats::Scalar numVecOpsExecuted;
    // Total cycles that something is running on the GPU
    Stats::Scalar totalCycles;
    Stats::Formula vpc; // vector ops per cycle
    Stats::Formula ipc; // vector instructions per cycle
    Stats::Distribution controlFlowDivergenceDist;
    Stats::Distribution activeLanesPerGMemInstrDist;
    Stats::Distribution activeLanesPerLMemInstrDist;
    // number of vector ALU instructions received
    Stats::Formula numALUInstsExecuted;
    // number of times a WG cannot start due to lack of free VGPRs in SIMDs
    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
    Stats::Scalar numCASOps;
    Stats::Scalar numFailedCASOps;
    Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
    std::vector<bool> vectorAluInstAvail;
    // number of available (oldest) LDS instructions that could have
    // been issued to the LDS at a specific issue slot
    int shrMemInstAvail;
    // number of available Global memory instructions that could have
    // been issued to TCP at a specific issue slot
    int glbMemInstAvail;

    void
    regStats();

    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    bool
    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    class CUExitCallback : public Callback
    {
      private:
        ComputeUnit *computeUnit;

      public:
        virtual ~CUExitCallback() { }

        CUExitCallback(ComputeUnit *_cu)
        {
            computeUnit = _cu;
        }

        virtual void
        process();
    };

    CUExitCallback *cuExitCallback;
```
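`vpc` and `ipc` are `Stats::Formula` members whose definitions live in `regStats()` in compute_unit.cc, which is not part of this header; given the comments, they are presumably `numVecOpsExecuted / totalCycles` and `numInstrExecuted / totalCycles`. Worked with made-up end-of-run counter values:

```cpp
#include <iostream>

int main()
{
    // Hypothetical end-of-run counter values.
    double numVecOpsExecuted = 6400000;
    double numInstrExecuted  =  100000;
    double totalCycles       =  200000;

    // Presumed formulas behind the vpc/ipc Stats::Formula members
    // (actual definitions are in regStats() in compute_unit.cc).
    double vpc = numVecOpsExecuted / totalCycles;  // vector ops per cycle
    double ipc = numInstrExecuted / totalCycles;   // vector insts per cycle

    std::cout << "vpc = " << vpc << ", ipc = " << ipc << '\n';  // 32, 0.5
}
```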
```cpp
    /** Data access Port **/
    class DataPort : public MasterPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            int port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        class MemReqEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        class MemRespEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    // Instruction cache access port
    class SQCPort : public MasterPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                        *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    /** Data TLB port **/
    class DTLBPort : public MasterPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index), stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            int portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
        };

      protected:
        ComputeUnit *computeUnit;
        int index;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };
```
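The `isStalled`/`stallPort`/`unstallPort` trio plus a `retries` deque is gem5's usual timing-port flow control: a refused `sendTimingReq` parks the packet and stalls the port, and the peer's `recvReqRetry` drains the queue until it is refused again. A self-contained model of that protocol (the toy peer that refuses only the first request is an assumption for demonstration):

```cpp
#include <deque>
#include <iostream>

struct Packet;              // opaque stand-in
using PacketPtr = Packet*;

// Minimal stand-in modelling the DTLBPort retry protocol (assumption:
// simplified; the real ports inherit gem5's MasterPort).
class StallablePort {
  public:
    bool isStalled() const { return stalled; }
    void stallPort() { stalled = true; }
    void unstallPort() { stalled = false; }

    std::deque<PacketPtr> retries;

    // Try to issue; on refusal remember the packet and stall the port.
    void issue(PacketPtr pkt) {
        if (isStalled() || !sendTimingReq(pkt)) {
            retries.push_back(pkt);
            stallPort();
        }
    }

    // Peer signalled it can accept again: drain until it refuses.
    void recvReqRetry() {
        unstallPort();
        while (!retries.empty()) {
            if (!sendTimingReq(retries.front())) {
                stallPort();
                return;
            }
            retries.pop_front();
        }
    }

  private:
    bool stalled = false;
    bool busy = true;                 // toy peer: refuse the first request
    bool sendTimingReq(PacketPtr) {
        bool ok = !busy;
        busy = false;                 // accept from the second try onward
        return ok;
    }
};

int main()
{
    StallablePort port;
    Packet *pkt = nullptr;
    port.issue(pkt);                              // refused: queued, stalled
    std::cout << port.retries.size() << '\n';     // 1
    port.recvReqRetry();                          // drained
    std::cout << port.retries.size() << '\n';     // 0
}
```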
```cpp
    class ITLBPort : public MasterPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public MasterPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
            : MasterPort(_name, _cu, _id), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet, esp. the
         * GPUDynInstPtr
         */
        class SenderState: public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
                _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
                return _gpuDynInst;
            }
        };

        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:
        bool stalled = false; ///< whether or not it is stalled

        ComputeUnit *computeUnit;

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };

    /** The port to access the Local Data Store
     *  Can be connected to a LDS object
     */
    LDSPort *ldsPort = nullptr;

    LDSPort *
    getLdsPort() const
    {
        return ldsPort;
    }

    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem or Ruby for timing simulations
     */
    std::vector<DataPort*> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort*> tlbPort;
```
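Note how the `SenderState` structs chain: `DataPort::SenderState` and `SQCPort::SenderState` keep a `saved` pointer to whatever sender state the packet already carried, so each layer of the memory hierarchy can push its context on the way down and pop it on the way back up. A stripped-down illustration (the types are stand-ins, not gem5's):

```cpp
#include <cassert>
#include <iostream>

// Minimal stand-ins for gem5's sender-state chaining (simplified).
struct SenderState { SenderState *saved = nullptr; virtual ~SenderState() {} };
struct Packet { SenderState *senderState = nullptr; };

struct CuState  : SenderState { int portIndex = 0; };
struct TlbState : SenderState { int tlbLevel = 0; };

int main()
{
    Packet pkt;

    // The CU attaches its state first, like DataPort::SenderState(..., saved).
    auto *cu = new CuState;
    cu->portIndex = 2;
    cu->saved = pkt.senderState;
    pkt.senderState = cu;

    // A lower level (e.g., a TLB) pushes its own state on top.
    auto *tlb = new TlbState;
    tlb->tlbLevel = 1;
    tlb->saved = pkt.senderState;
    pkt.senderState = tlb;

    // On the way back up, each layer pops its state and restores `saved`.
    auto *top = static_cast<TlbState*>(pkt.senderState);
    pkt.senderState = top->saved;
    delete top;

    assert(static_cast<CuState*>(pkt.senderState)->portIndex == 2);
    std::cout << "CU state restored\n";
}
```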
```cpp
    // port to the SQC (i.e. the I-cache)
    SQCPort *sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort *sqcTLBPort;

    virtual BaseMasterPort&
    getMasterPort(const std::string &if_name, PortID idx)
    {
        if (if_name == "memory_port") {
            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *memPort[idx];
        } else if (if_name == "translation_port") {
            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *tlbPort[idx];
        } else if (if_name == "sqc_port") {
            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
                                  this, idx);
            return *sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
            return *sqcTLBPort;
        } else if (if_name == "ldsPort") {
            if (ldsPort) {
                fatal("an LDS port was already allocated");
            }
            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
            return *ldsPort;
        } else {
            panic("incorrect port name");
        }
    }

    // xact_cas_load()
    class waveIdentifier
    {
      public:
        waveIdentifier() { }
        waveIdentifier(int _simdId, int _wfSlotId)
            : simdId(_simdId), wfSlotId(_wfSlotId) { }

        int simdId;
        int wfSlotId;
    };

    class waveQueue
    {
      public:
        std::list<waveIdentifier> waveIDQueue;
    };
    std::map<unsigned, waveQueue> xactCasLoadMap;

    uint64_t getAndIncSeqNum() { return globalSeqNum++; }

  private:
    uint64_t globalSeqNum;
    int wavefrontSize;
};

#endif // __COMPUTE_UNIT_HH__
```
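`getMasterPort` allocates port objects lazily, keyed by the interface name and index that the Python-side configuration uses when wiring the CU; this is why `memPort` and `tlbPort` must already be sized to the expected number of connections before the lookup writes into `memPort[idx]`. The same dispatch-by-name idiom in a self-contained toy (all types here are stand-ins, not gem5's):

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Stand-in port type (assumption: simplified, no gem5 dependencies).
struct Port { std::string name; };

struct Unit {
    std::vector<Port*> memPort;   // sized up front, filled lazily
    Port *sqcPort = nullptr;

    explicit Unit(int numMemPorts) : memPort(numMemPorts, nullptr) { }

    // Same shape as ComputeUnit::getMasterPort: allocate on first lookup.
    Port &getPort(const std::string &if_name, int idx) {
        if (if_name == "memory_port") {
            memPort[idx] = new Port{"cu-port" + std::to_string(idx)};
            return *memPort[idx];
        } else if (if_name == "sqc_port") {
            sqcPort = new Port{"cu-sqc-port"};
            return *sqcPort;
        }
        throw std::invalid_argument("incorrect port name");
    }
};

int main()
{
    Unit cu(4);
    std::cout << cu.getPort("memory_port", 2).name << '\n';  // cu-port2
}
```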