/*
 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__

#include <cassert>
#include <deque>
#include <memory>
#include <stack>
#include <vector>

#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"

static const int MAX_NUM_INSTS_PER_WF = 12;

/**
 * A reconvergence stack entry conveys the necessary state to implement
 * control flow divergence.
 */
struct ReconvergenceStackEntry {
    /**
     * PC of current instruction.
     */
    uint32_t pc;
    /**
     * PC of the immediate post-dominator instruction, i.e., the value of
     * @a pc for the first instruction that will be executed by the wavefront
     * when a reconvergence point is reached.
     */
    uint32_t rpc;
    /**
     * Execution mask.
     */
    VectorMask execMask;
};
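
/*
 * Illustrative sketch only (not code from the simulator): one way the entry
 * above is consumed by the Wavefront class declared later in this file.
 * pc() and execMask() reflect the top-of-stack entry; a divergent branch
 * pushes a new entry whose rpc is the branch's immediate post-dominator,
 * and the entry is popped once the wavefront reaches that PC. The names
 * wf, taken_pc, post_dom_pc and taken_mask below are assumptions made for
 * illustration.
 *
 *     // on a divergent branch: execute the taken path under its own mask
 *     wf->pushToReconvergenceStack(taken_pc, post_dom_pc, taken_mask);
 *     ...
 *     // at the reconvergence point the entry is discarded and execution
 *     // resumes with the parent entry's PC and execution mask
 *     if (wf->pc() == wf->rpc())
 *         wf->popFromReconvergenceStack();
 */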

/*
 * Arguments for the hsail opcode call are user defined and variable length.
 * The hardware/finalizer can support arguments in hardware or use memory to
 * pass arguments. For now, let's assume that an unlimited number of arguments
 * are supported in hardware (the compiler inlines functions whenever it can
 * anyways, so unless someone is interested in the implications of linking/
 * library functions, I think this is a reasonable assumption given the typical
 * size of an OpenCL kernel).
 *
 * Note that call args are different from kernel arguments:
 *   * All work-items in a kernel refer to the same set of kernel arguments
 *   * Each work-item has its own set of call args. So a call argument at
 *     address 0x4 is different for work-item 0 and work-item 1.
 *
 * The table below shows an example of how we organize the call arguments in
 * the CallArgMem class.
 *
 * int foo(int arg1, double arg2)
 *  ___________________________________________________
 * | 0: return.0   | 4: return.1   | ... | 252: return.63  |
 * |---------------------------------------------------|
 * | 256: arg1.0   | 260: arg1.1   | ... | 508: arg1.63    |
 * |---------------------------------------------------|
 * | 512: arg2.0   | 520: arg2.1   | ... | 1016: arg2.63   |
 *  ___________________________________________________
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments
    uint8_t *mem;
    int wfSize;
    // size of function args
    int funcArgsSizePerItem;

    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
      : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
    }

    ~CallArgMem()
    {
        free(mem);
    }

    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
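
/*
 * Worked example of the layout above (a sketch, assuming wfSize == 64 and
 * the int foo(int arg1, double arg2) signature from the comment): the
 * per-lane virtual addresses are 0 (return), 4 (arg1) and 8 (arg2), and
 * getLaneOffset() places lane L of an argument at
 * addr * wfSize + sizeof(CType) * L, which reproduces the table:
 *
 *     CallArgMem args(16, 64);           // 4 + 4 + 8 bytes per work-item
 *     args.getLaneOffset<double>(1, 8);  // 8 * 64 + 8 * 1 == 520 (arg2.1)
 *     args.setLaneAddr<int>(0, 4, 42);   // writes arg1 for lane 0 at 256
 */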

class Wavefront : public SimObject
{
  public:
    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};

    // Base pointer for array of instruction pointers
    uint64_t basePtr;

    uint32_t oldBarrierCnt;
    uint32_t barrierCnt;
    uint32_t barrierId;
    uint32_t barrierSlots;
    status_e status;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    int kernId;
    // SIMD unit where the WV has been scheduled
    int simdId;
    // pointer to parent CU
    ComputeUnit *computeUnit;

    std::deque<GPUDynInstPtr> instructionBuffer;

    bool pendingFetch;
    bool dropFetch;

    // Condition Register State (for HSAIL simulations only)
    class ConditionRegisterState *condRegState;
    // number of single precision VGPRs required by WF
    uint32_t maxSpVgprs;
    // number of double precision VGPRs required by WF
    uint32_t maxDpVgprs;
    // map virtual to physical vector register
    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstALU();
    bool isOldestInstBarrier();
    // used for passing spill address to DDInstGPU
    std::vector<Addr> lastAddr;
    std::vector<uint32_t> workItemId[3];
    std::vector<uint32_t> workItemFlatId;
    /* kernel launch parameters */
    uint32_t workGroupId[3];
    uint32_t workGroupSz[3];
    uint32_t gridSz[3];
    uint32_t wgId;
    uint32_t wgSz;
    /* the actual WG size can differ from the maximum size */
    uint32_t actualWgSz[3];
    uint32_t actualWgSzTotal;
    void computeActualWgSz(NDRange *ndr);
    // wavefront id within a workgroup
    uint32_t wfId;
    uint32_t maxDynWaveId;
    uint32_t dispatchId;
    // outstanding global+local memory requests
    uint32_t outstandingReqs;
    // memory requests between scoreboard
    // and execute stage not yet executed
    uint32_t memReqsInPipe;
    // outstanding global memory write requests
    uint32_t outstandingReqsWrGm;
    // outstanding local memory write requests
    uint32_t outstandingReqsWrLm;
    // outstanding global memory read requests
    uint32_t outstandingReqsRdGm;
    // outstanding local memory read requests
    uint32_t outstandingReqsRdLm;
    uint32_t rdLmReqsInPipe;
    uint32_t rdGmReqsInPipe;
    uint32_t wrLmReqsInPipe;
    uint32_t wrGmReqsInPipe;

    int memTraceBusy;
    uint64_t lastTrace;
    // number of vector registers reserved by WF
    int reservedVectorRegs;
    // Index into the Vector Register File's namespace where the WF's
    // registers will live while the WF is executed
    uint32_t startVgprIndex;

    // Old value of destination gpr (for trace)
    std::vector<uint32_t> oldVgpr;
    // Id of destination gpr (for trace)
    uint32_t oldVgprId;
    // Tick count of last old_vgpr copy
    uint64_t oldVgprTcnt;

    // Old value of destination gpr (for trace)
    std::vector<uint64_t> oldDgpr;
    // Id of destination gpr (for trace)
    uint32_t oldDgprId;
    // Tick count of last old_vgpr copy
    uint64_t oldDgprTcnt;

    // Execution mask at wavefront start
    VectorMask initMask;
    // number of barriers this WF has joined
    std::vector<int> barCnt;
    int maxBarCnt;
    // Flag to stall a wave on barrier
    bool stalledAtBarrier;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // A pointer to the spill area
    Addr spillBase;
    // The size of the spill area
    uint32_t spillSizePerItem;
    // The vector width of the spill area
    uint32_t spillWidth;

    // A pointer to the private memory area
    Addr privBase;
    // The size of the private memory area
    uint32_t privSizePerItem;

    // A pointer to the read-only memory area
    Addr roBase;
    // size of the read-only memory area
    uint32_t roSize;

    // pointer to buffer for storing kernel arguments
    uint8_t *kernelArgs;
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // number of times instruction issue for this wavefront is blocked
    // due to VRF port availability
    Stats::Scalar numTimesBlockedDueVrfPortAvail;
    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies
    Stats::Scalar numTimesBlockedDueRAWDependencies;
    // distribution of executed instructions based on their register
    // operands; this is used to highlight the load on the VRF
    Stats::Distribution srcRegOpDist;
    Stats::Distribution dstRegOpDist;

    // Functions to operate on call argument memory
    // argument memory for hsail call instruction
    CallArgMem *callArgMem;
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }

    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }

    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }

    typedef WavefrontParams Params;
    Wavefront(const Params *p);
    ~Wavefront();
    virtual void init();

    void setParent(ComputeUnit *cu) { computeUnit = cu; }

    void start(uint64_t _wfDynId, uint64_t _base_ptr);
    void exec();
    void updateResources();
    int ready(itype_e type);
    bool instructionBufferHasBranch();
    void regStats();
    VectorMask getPred() { return execMask() & initMask; }

    bool waitingAtBarrier(int lane);

    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                  const VectorMask& exec_mask);

    void popFromReconvergenceStack();

    uint32_t pc() const;

    uint32_t rpc() const;

    VectorMask execMask() const;

    bool execMask(int lane) const;

    void pc(uint32_t new_pc);

    void discardFetch();

    /**
     * Returns the size of the static hardware context of a particular
     * wavefront. This should be updated every time the context is changed.
     */
    uint32_t getStaticContextSize() const;

    /**
     * Returns the hardware context as a stream of bytes.
     * This method is designed for HSAIL execution.
     */
    void getContext(const void *out);

    /**
     * Sets the hardware context from a stream of bytes.
     * This method is designed for HSAIL execution.
     */
    void setContext(const void *in);

    TheGpuISA::GPUISA& gpuISA() { return _gpuISA; }
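
    /*
     * Sketch of the intended save/restore flow for the context interface
     * declared above (an assumed usage pattern, not code taken from the
     * simulator): size a buffer with getStaticContextSize(), serialize the
     * wavefront's hardware context into it with getContext(), and restore
     * it with setContext() before the wavefront resumes execution.
     *
     *     std::vector<uint8_t> buf(wf->getStaticContextSize());
     *     wf->getContext(buf.data());   // save
     *     ...
     *     wf->setContext(buf.data());   // restore
     */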

  private:
    TheGpuISA::GPUISA _gpuISA;

    /**
     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
     * to be visited by the wavefront, and the associated execution masks. The
     * reconvergence stack grows every time the wavefront reaches a divergence
     * point (branch instruction), and shrinks every time the wavefront
     * reaches a reconvergence point (immediate post-dominator instruction).
     */
    std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
};

#endif // __WAVEFRONT_HH__