/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: John Kalamatianos,
 *          Anthony Gutierrez
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

#include <deque>
#include <list>
#include <map>
#include <queue>
#include <unordered_map>
#include <vector>

#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"

static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static const int MAX_WIDTH_FOR_MEM_INST = 32;

class NDRange;
class Shader;
class VectorRegisterFile;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

// List of execution units
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1,
    SIMD2,
    SIMD3,
    GLBMEM_PIPE,
    LDSMEM_PIPE,
    NUM_UNITS
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};

class ComputeUnit : public MemObject
{
  public:
    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;

    // Buffers used to communicate between various pipeline stages

    // List of waves which are ready to be scheduled.
    // Each execution resource has a ready list. readyList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: make enum to index readyList
    std::vector<std::vector<Wavefront*>> readyList;
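
    // Illustrative sketch (an assumption drawn from the comments here and
    // below, not a definitive description of the .cc): readyList and the
    // waveStatusList/dispatchList buffers declared just below are indexed by
    // execution resource, so for a wave `w` that clears its scoreboard
    // checks on SIMD1 the per-cycle hand-off looks roughly like:
    //
    //     // scoreboardCheck stage: w may be scheduled this cycle
    //     readyList[SIMD1].push_back(w);
    //     // schedule stage: SIMD1 has something to execute this cycle
    //     dispatchList[SIMD1] = {w, FILLED};
    //     // exec stage: issue w to the SIMD1 vector ALU pipeline
    //
    // READY and FILLED are the status values referred to in the comments
    // accompanying waveStatusList and dispatchList below.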

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList. waveStatusList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>>
        waveStatusList;

    // List of waves which will be dispatched to
    // each execution resource. A FILLED implies
    // dispatch list is non-empty and
    // execution unit has something to execute
    // this cycle. Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave in a cycle.
    // dispatchList is used to communicate between schedule
    // and exec stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;

    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
    int rrNextALUWp;
    typedef ComputeUnitParams Params;
    std::vector<std::vector<Wavefront*>> wfList;
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // Number of vector ALU units (SIMDs) in CU
    int numSIMDs;
    // number of pipe stages for bypassing data to next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of cycles per issue period
    int issuePeriod;
    // Number of global and local memory execution resources in CU
    int numGlbMemUnits;
    int numLocMemUnits;
    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off.
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride.
    int prefetchStride;

    std::vector<Addr> lastVaddrCU;
    std::vector<std::vector<Addr>> lastVaddrSimd;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    Enums::PrefetchType prefetchType;
    EXEC_POLICY exec_policy;

    bool xact_cas_mode;
    bool debugSegFault;
    bool functionalTLB;
    bool localMemBarrier;

    /*
     * for Counting page accesses
     *
     * cuExitCallback inherits from Callback. When you register a callback
     * function as an exit callback, it will get added to an exit callback
     * queue, such that on simulation exit, all callbacks in the callback
     * queue will have their process() function called.
     */
    bool countPages;

    Shader *shader;
    uint32_t barrier_id;
    // vector of Vector ALU (MACC) pipelines
    std::vector<WaitClass> aluPipe;
    // minimum issue period per SIMD unit (in cycles)
    std::vector<WaitClass> wfWait;

    // Resource control for Vector Register File->Global Memory pipe buses
    std::vector<WaitClass> vrfToGlobalMemPipeBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    std::vector<WaitClass> vrfToLocalMemPipeBus;

    int nextGlbMemBus;
    int nextLocMemBus;
    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;

    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
    uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load

    Tick req_tick_latency;
    Tick resp_tick_latency;

    // number of vector registers being reserved for each SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of vector registers per SIMD unit
    uint32_t numVecRegsPerSimd;
    // Support for scheduling VGPR status update events
    std::vector<std::pair<uint32_t, uint32_t>> regIdxVec;
    std::vector<uint64_t> timestampVec;
    std::vector<uint8_t> statusVec;

    void
    registerEvent(uint32_t simdId,
                  uint32_t regIdx,
                  uint32_t operandSize,
                  uint64_t when,
                  uint8_t newStatus) {
        regIdxVec.push_back(std::make_pair(simdId, regIdx));
        timestampVec.push_back(when);
        statusVec.push_back(newStatus);

        if (operandSize > 4) {
            regIdxVec.push_back(std::make_pair(simdId,
                                               ((regIdx + 1) %
                                                numVecRegsPerSimd)));
            timestampVec.push_back(when);
            statusVec.push_back(newStatus);
        }
    }

    void updateEvents();
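
    // Illustrative use of registerEvent()/updateEvents() above (a sketch;
    // the actual call sites live in the .cc files). A memory operation that
    // returns a 64-bit value into VGPR `dst` of SIMD `simdId` spans two
    // consecutive registers, so a single call queues a status change for
    // both of them:
    //
    //     registerEvent(simdId, dst, 8 /* bytes */, when, newStatus);
    //     // pushes (simdId, dst) and (simdId, (dst + 1) % numVecRegsPerSimd)
    //
    // updateEvents() is then expected to walk regIdxVec/timestampVec/
    // statusVec and apply the queued status changes whose timestamps have
    // been reached.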

    // this hash map will keep track of page divergence
    // per memory instruction per wavefront. The hash map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

    ComputeUnit(const Params *p);
    ~ComputeUnit();

    int spBypassLength() { return spBypassPipeLength; };
    int dpBypassLength() { return dpBypassPipeLength; };
    int storeBusLength() { return numCyclesPerStoreTransfer; };
    int loadBusLength() { return numCyclesPerLoadTransfer; };
    int wfSize() const { return wavefrontSize; };

    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void fillKernelState(Wavefront *w, NDRange *ndr);

    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                        NDRange *ndr);

    void StartWorkgroup(NDRange *ndr);
    int ReadyWorkgroup(NDRange *ndr);

    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
    int GlbMemUnitId() { return GLBMEM_PIPE; }
    int ShrMemUnitId() { return LDSMEM_PIPE; }
    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }

    /* This function cycles through all the wavefronts in all the phases to
     * see if all of the wavefronts which should be associated with one
     * barrier (denoted with _barrier_id) are all at the same barrier in the
     * program (denoted by bcnt). When the number at the barrier matches
     * bslots, then return true.
     */
    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
    bool cedeSIMD(int simdId, int wfSlotId);

    template<typename c0, typename c1>
    void doSmReturn(GPUDynInstPtr gpuDynInst);

    virtual void init();
    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelLaunch=true,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    MasterID masterId() { return _masterId; }

    bool isDone() const;
    bool isSimdDone(uint32_t) const;

  protected:
    MasterID _masterId;

    LdsState &lds;

  public:
    Stats::Scalar vALUInsts;
    Stats::Formula vALUInstsPerWF;
    Stats::Scalar sALUInsts;
    Stats::Formula sALUInstsPerWF;
    Stats::Scalar instCyclesVALU;
    Stats::Scalar instCyclesSALU;
    Stats::Scalar threadCyclesVALU;
    Stats::Formula vALUUtilization;
    Stats::Scalar ldsNoFlatInsts;
    Stats::Formula ldsNoFlatInstsPerWF;
    Stats::Scalar flatVMemInsts;
    Stats::Formula flatVMemInstsPerWF;
    Stats::Scalar flatLDSInsts;
    Stats::Formula flatLDSInstsPerWF;
    Stats::Scalar vectorMemWrites;
    Stats::Formula vectorMemWritesPerWF;
    Stats::Scalar vectorMemReads;
    Stats::Formula vectorMemReadsPerWF;
    Stats::Scalar scalarMemWrites;
    Stats::Formula scalarMemWritesPerWF;
    Stats::Scalar scalarMemReads;
    Stats::Formula scalarMemReadsPerWF;

    void updateInstStats(GPUDynInstPtr gpuDynInst);

    // the following stats compute the avg. TLB access latency per
    // uncoalesced request (only for data)
    Stats::Scalar tlbRequests;
    Stats::Scalar tlbCycles;
    Stats::Formula tlbLatency;
    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
    Stats::Vector hitsPerTLBLevel;

    Stats::Scalar ldsBankAccesses;
    Stats::Distribution ldsBankConflictDist;

    // over all memory instructions executed over all wavefronts
    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
    Stats::Distribution pageDivergenceDist;
    Stats::Scalar dynamicGMemInstrCnt;
    Stats::Scalar dynamicLMemInstrCnt;

    Stats::Scalar wgBlockedDueLdsAllocation;
    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
    // active when the instruction is committed, this number is still
    // incremented by 1
    Stats::Scalar numInstrExecuted;
    // Number of cycles among successive instruction executions across all
    // wavefronts of the same CU
    Stats::Distribution execRateDist;
    // number of individual vector operations executed
    Stats::Scalar numVecOpsExecuted;
    // Total cycles that something is running on the GPU
    Stats::Scalar totalCycles;
    Stats::Formula vpc; // vector ops per cycle
    Stats::Formula ipc; // vector instructions per cycle
    Stats::Distribution controlFlowDivergenceDist;
    Stats::Distribution activeLanesPerGMemInstrDist;
    Stats::Distribution activeLanesPerLMemInstrDist;
    // number of vector ALU instructions received
    Stats::Formula numALUInstsExecuted;
    // number of times a WG can not start due to lack of free VGPRs in SIMDs
    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
    Stats::Scalar numCASOps;
    Stats::Scalar numFailedCASOps;
    Stats::Scalar completedWfs;
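
    // A sketch of how the derived formulas above are typically wired up in
    // regStats() (an assumption based on the stat names and comments, not a
    // listing of the .cc):
    //
    //     vpc = numVecOpsExecuted / totalCycles;  // vector ops per cycle
    //     ipc = numInstrExecuted / totalCycles;   // instructions per cycle
    //     tlbLatency = tlbCycles / tlbRequests;   // avg. cycles per
    //                                             // uncoalesced TLB request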

    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
    std::vector<bool> vectorAluInstAvail;
    // number of available (oldest) LDS instructions that could have
    // been issued to the LDS at a specific issue slot
    int shrMemInstAvail;
    // number of available Global memory instructions that could have
    // been issued to TCP at a specific issue slot
    int glbMemInstAvail;

    void regStats();

    LdsState & getLds() const { return lds; }

    int32_t getRefCounter(const uint32_t dispatchId,
                          const uint32_t wgId) const;

    int cacheLineSize() const { return _cacheLineSize; }

    bool sendToLds(GPUDynInstPtr gpuDynInst)
        __attribute__((warn_unused_result));

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    class CUExitCallback : public Callback
    {
      private:
        ComputeUnit *computeUnit;

      public:
        virtual ~CUExitCallback() { }

        CUExitCallback(ComputeUnit *_cu)
        {
            computeUnit = _cu;
        }

        virtual void process();
    };

    CUExitCallback *cuExitCallback;
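
    // How the exit callback is intended to be hooked up (a sketch; the
    // actual registration happens in the .cc, typically in the ComputeUnit
    // constructor). The callback is what reports the per-page access counts
    // gathered in pageAccesses when countPages is enabled:
    //
    //     cuExitCallback = new CUExitCallback(this);
    //     registerExitCallback(cuExitCallback);
    //
    // registerExitCallback() queues the callback so that its process()
    // method runs at simulation exit, as described in the countPages
    // comment above.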

    /** Data access Port **/
    class DataPort : public MasterPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            int port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        void processMemReqEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);

        void processMemRespEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    // Instruction cache access port
    class SQCPort : public MasterPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;

            SenderState(Wavefront *_wavefront,
                        Packet::SenderState *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    /** Data TLB port **/
    class DTLBPort : public MasterPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            int portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
        };

      protected:
        ComputeUnit *computeUnit;
        int index;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    class ITLBPort : public MasterPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public MasterPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
            : MasterPort(_name, _cu, _id), computeUnit(_cu) { }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet, esp. the
         * GPUDynInstPtr
         */
        class SenderState: public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst): _gpuDynInst(gpuDynInst) { }

            GPUDynInstPtr getMemInst() const { return _gpuDynInst; }
        };

        virtual bool sendTimingReq(PacketPtr pkt);

      protected:
        bool stalled = false; ///< whether or not it is stalled

        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /** The port to access the Local Data Store
     *  Can be connected to a LDS object
     */
    LDSPort *ldsPort = nullptr;

    LDSPort *getLdsPort() const { return ldsPort; }

    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem or Ruby for timing simulations
     */
    std::vector<DataPort*> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort*> tlbPort;
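
    // Note (an assumption based on the SenderState comments above and on
    // getMasterPort() below): memPort and tlbPort hold one port per
    // wavefront lane, so lane i of a vector memory instruction issues its
    // data request through memPort[i] and its address translation through
    // tlbPort[i]. The port objects are allocated on demand when the Python
    // configuration binds them, e.g.:
    //
    //     memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
    //                                 this, idx);
    //
    // for a binding of "memory_port[idx]".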

    // port to the SQC (i.e. the I-cache)
    SQCPort *sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort *sqcTLBPort;

    virtual BaseMasterPort&
    getMasterPort(const std::string &if_name, PortID idx)
    {
        if (if_name == "memory_port") {
            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *memPort[idx];
        } else if (if_name == "translation_port") {
            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *tlbPort[idx];
        } else if (if_name == "sqc_port") {
            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
                                  this, idx);
            return *sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
            return *sqcTLBPort;
        } else if (if_name == "ldsPort") {
            if (ldsPort) {
                fatal("an LDS port was already allocated");
            }
            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
            return *ldsPort;
        } else {
            panic("incorrect port name");
        }
    }

    // xact_cas_load()
    class waveIdentifier
    {
      public:
        waveIdentifier() { }
        waveIdentifier(int _simdId, int _wfSlotId)
            : simdId(_simdId), wfSlotId(_wfSlotId) { }

        int simdId;
        int wfSlotId;
    };

    class waveQueue
    {
      public:
        std::list<waveIdentifier> waveIDQueue;
    };
    std::map<unsigned, waveQueue> xactCasLoadMap;

    uint64_t getAndIncSeqNum() { return globalSeqNum++; }

  private:
    const int _cacheLineSize;
    uint64_t globalSeqNum;
    int wavefrontSize;
    GPUStaticInst *kernelLaunchInst;
};

#endif // __COMPUTE_UNIT_HH__