author | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
---|---|---|
committer | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
commit | 1a7d3f9fcb76a68540dd948f91413533a383bfde (patch) | |
tree | 867510a147cd095f19499d26b7c02d27de4cae9d /src/arch/hsail/insts/mem_impl.hh | |
parent | 28e353e0403ea379d244a418e8dc8ee0b48187cf (diff) | |
download | gem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz | |
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src/arch/hsail/insts/mem_impl.hh')
-rw-r--r-- | src/arch/hsail/insts/mem_impl.hh | 660 |
1 file changed, 660 insertions, 0 deletions
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
new file mode 100644
index 000000000..94f0cd6aa
--- /dev/null
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -0,0 +1,660 @@
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "arch/hsail/generic_types.hh"
#include "gpu-compute/hsail_code.hh"

// defined in code.cc, but not worth sucking in all of code.h for this
// at this point
extern const char *segmentNames[];

namespace HsailISA
{
    template<typename DestDataType, typename AddrRegOperandType>
    void
    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
    {
        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
                                     DestDataType::label,
                                     this->dest.disassemble(),
                                     this->addr.disassemble());
    }

    template<typename DestDataType, typename AddrRegOperandType>
    void
    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename DestDataType::CType CType M5_VAR_USED;
        const VectorMask &mask = w->get_pred();
        uint64_t addr_vec[VSZ];
        this->addr.calcVector(w, addr_vec);

        for (int lane = 0; lane < VSZ; ++lane) {
            if (mask[lane]) {
                this->dest.set(w, lane, addr_vec[lane]);
            }
        }
    }

    template<typename MemDataType, typename DestDataType,
             typename AddrRegOperandType>
    void
    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
    {
        switch (num_dest_operands) {
          case 1:
            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest.disassemble(),
                                         this->addr.disassemble());
            break;
          case 2:
            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->addr.disassemble());
            break;
          case 4:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         MemDataType::label,
                                         this->dest_vect[0].disassemble(),
                                         this->dest_vect[1].disassemble(),
                                         this->dest_vect[2].disassemble(),
                                         this->dest_vect[3].disassemble(),
                                         this->addr.disassemble());
            break;
          default:
            fatal("Bad ld register dest operand, num vector operands: %d \n",
                  num_dest_operands);
            break;
        }
    }

    static Addr
    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
    {
        // what is the size of the object we are accessing??
        // NOTE: the compiler doesn't generate enough information
        // to do this yet..have to just line up all the private
        // work-item spaces back to back for now
        /*
        StorageElement* se =
            i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
        assert(se);

        return w->wfSlotId * w->privSizePerItem * VSZ +
               se->offset * VSZ +
               lane * se->size;
        */

        // addressing strategy: interleave the private spaces of
        // work-items in a wave-front on 8 byte granularity.
        // this won't be perfect coalescing like the spill space
        // strategy, but it's better than nothing. The spill space
        // strategy won't work with private because the same address
        // may be accessed by different sized loads/stores.

        // Note: I'm assuming that the largest load/store to private
        // is 8 bytes. If it is larger, the stride will have to increase

        Addr addr_div8 = addr / 8;
        Addr addr_mod8 = addr % 8;

        Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;

        assert(ret < w->privBase + (w->privSizePerItem * VSZ));

        return ret;
    }
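
    // Worked example of the interleaving above (the numbers are purely
    // illustrative): a per-work-item private offset addr maps to
    //
    //     (addr / 8) * 8 * VSZ + lane * 8 + (addr % 8) + privBase
    //
    // so for addr = 20 and lane = 3 the flat address is
    // 2 * 8 * VSZ + 24 + 4 + privBase. Lanes touching the same private
    // offset thus land in adjacent 8-byte slots of the same VSZ * 8 byte
    // block, which gives some coalescing without needing the per-access
    // size information that the spill-space scheme relies on.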

    template<typename MemDataType, typename DestDataType,
             typename AddrRegOperandType>
    void
    LdInst<MemDataType, DestDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename MemDataType::CType MemCType;
        const VectorMask &mask = w->get_pred();

        // Kernarg references are handled uniquely for now (no Memory Request
        // is used), so special-case them up front. Someday we should
        // make this more realistic, at which point we should get rid of this
        // block and fold this case into the switch below.
        if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
            MemCType val;

            // I assume no vector ld for kernargs
            assert(num_dest_operands == 1);

            // assuming for the moment that we'll never do register
            // offsets into kernarg space... just to make life simpler
            uint64_t address = this->addr.calcUniform();

            val = *(MemCType*)&w->kernelArgs[address];

            DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);

            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    this->dest.set(w, lane, val);
                }
            }

            return;
        } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    MemCType val = w->readCallArgMem<MemCType>(lane, address);

                    DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
                            (unsigned long long)val);

                    this->dest.set(w, lane, val);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        this->addr.calcVector(w, m->addr);

        m->m_op = Enums::MO_LD;
        m->m_type = MemDataType::memType;
        m->v_type = DestDataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = this->equivClass;
        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);

        m->scope = getGenericMemoryScope(this->memoryScope);

        if (num_dest_operands == 1) {
            m->dst_reg = this->dest.regIndex();
            m->n_reg = 1;
        } else {
            m->n_reg = num_dest_operands;
            for (int i = 0; i < num_dest_operands; ++i) {
                m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
            }
        }

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kern_id;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->s_type = SEG_GLOBAL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // this is a complete hack to get around a compiler bug
            // (the compiler currently generates global access for private
            // addresses (starting from 0). We need to add the private offset)
            for (int lane = 0; lane < VSZ; ++lane) {
                if (m->addr[lane] < w->privSizePerItem) {
                    if (mask[lane]) {
                        // what is the size of the object we are accessing?
                        // find base for this wavefront

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(MemCType) - 1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_dest_operands == 1);
            m->s_type = SEG_SPILL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    // note: this calculation will NOT WORK if the compiler
                    // ever generates loads/stores to the same address with
                    // different widths (e.g., a ld_u32 addr and a ld_u16 addr)
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(MemCType) + w->spillBase;

                        w->last_addr[lane] = m->addr[lane];
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->s_type = SEG_SHARED;
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstanding_reqs_rd_lm++;
            w->rd_lm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_READONLY:
            m->s_type = SEG_READONLY;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
                    m->addr[lane] += w->roBase;
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->s_type = SEG_PRIVATE;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);

                        m->addr[lane] = m->addr[lane] +
                            lane * sizeof(MemCType) + w->privBase;
                    }
                }
            }
            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          default:
            fatal("Load to unsupported segment %d %llx\n", this->segment,
                  m->addr[0]);
        }

        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }
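
    // The store path below follows the same structure as the load path
    // above: ARG-segment accesses are serviced directly from the call-arg
    // storage, while the other segments build a memory request and rebase
    // the per-lane addresses. As a rough illustration (values are
    // hypothetical), a spill access at per-item offset 4 from lane 2
    // becomes 4 * spillWidth + 2 * sizeof(CType) + spillBase, and a
    // private-segment access at the same offset and lane becomes
    // 4 + 2 * sizeof(CType) + privBase.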

    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename OperationType::CType CType;

        const VectorMask &mask = w->get_pred();

        // arg references are handled uniquely for now (no Memory Request
        // is used), so special-case them up front. Someday we should
        // make this more realistic, at which point we should get rid of this
        // block and fold this case into the switch below.
        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();

            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    CType data = this->src.template get<CType>(w, lane);
                    DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
                    w->writeCallArgMem<CType>(lane, address, data);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        m->exec_mask = w->execMask();

        this->addr.calcVector(w, m->addr);

        if (num_src_operands == 1) {
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    ((CType*)m->d_data)[lane] =
                        this->src.template get<CType>(w, lane);
                }
            }
        } else {
            for (int k = 0; k < num_src_operands; ++k) {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        ((CType*)m->d_data)[k * VSZ + lane] =
                            this->src_vect[k].template get<CType>(w, lane);
                    }
                }
            }
        }

        m->m_op = Enums::MO_ST;
        m->m_type = OperationType::memType;
        m->v_type = OperationType::vgprType;

        m->statusBitVector = 0;
        m->equiv = this->equivClass;

        if (num_src_operands == 1) {
            m->n_reg = 1;
        } else {
            m->n_reg = num_src_operands;
        }

        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);

        m->scope = getGenericMemoryScope(this->memoryScope);

        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kern_id;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->s_type = SEG_GLOBAL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // this is a complete hack to get around a compiler bug
            // (the compiler currently generates global access for private
            // addresses (starting from 0). We need to add the private offset)
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    if (m->addr[lane] < w->privSizePerItem) {

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(CType) - 1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_src_operands == 1);
            m->s_type = SEG_SPILL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(CType) + w->spillBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->s_type = SEG_SHARED;
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstanding_reqs_wr_lm++;
            w->wr_lm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->s_type = SEG_PRIVATE;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);
                        m->addr[lane] = m->addr[lane] +
                            lane * sizeof(CType) + w->privBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          default:
            fatal("Store to unsupported segment %d\n", this->segment);
        }

        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }
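
    // Note on the staging layout used above: for a vector store, source
    // registers are packed into d_data register-major, i.e. element
    // [k * VSZ + lane] holds source register k for a given lane, so the
    // VSZ values of each register are contiguous in the staging buffer.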

    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::generateDisassembly()
    {
        switch (num_src_operands) {
          case 1:
            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src.disassemble(),
                                         this->addr.disassemble());
            break;
          case 2:
            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src_vect[0].disassemble(),
                                         this->src_vect[1].disassemble(),
                                         this->addr.disassemble());
            break;
          case 4:
            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
                                         this->opcode,
                                         segmentNames[this->segment],
                                         OperationType::label,
                                         this->src_vect[0].disassemble(),
                                         this->src_vect[1].disassemble(),
                                         this->src_vect[2].disassemble(),
                                         this->src_vect[3].disassemble(),
                                         this->addr.disassemble());
            break;
          default:
            fatal("Bad st register src operand, num vector operands: "
                  "%d \n", num_src_operands);
            break;
        }
    }
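
    // For reference, the format strings above produce disassembly of
    // roughly the shape "<opcode>_<segment>_<type> <src>,<addr>" for a
    // single source register and "<opcode>_<segment>_<type>
    // (<src0>,<src1>), <addr>" for the vector forms; the exact operand
    // spelling comes from each operand's own disassemble() output.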
this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + template<typename OperationType, typename SrcDataType, + typename AddrRegOperandType> + void + StInst<OperationType, SrcDataType, + AddrRegOperandType>::generateDisassembly() + { + switch (num_src_operands) { + case 1: + this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src.disassemble(), + this->addr.disassemble()); + break; + case 2: + this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->addr.disassemble()); + break; + case 4: + this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s", + this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->src_vect[2].disassemble(), + this->src_vect[3].disassemble(), + this->addr.disassemble()); + break; + default: fatal("Bad ld register src operand, num vector operands: " + "%d \n", num_src_operands); + break; + } + } + + template<typename DataType, typename AddrRegOperandType, int NumSrcOperands, + bool HasDst> + void + AtomicInst<DataType, AddrRegOperandType, NumSrcOperands, + HasDst>::execute(GPUDynInstPtr gpuDynInst) + { + typedef typename DataType::CType CType; + + Wavefront *w = gpuDynInst->wavefront(); + + GPUDynInstPtr m = gpuDynInst; + + this->addr.calcVector(w, m->addr); + + for (int lane = 0; lane < VSZ; ++lane) { + ((CType *)m->a_data)[lane] = + this->src[0].template get<CType>(w, lane); + } + + // load second source operand for CAS + if (NumSrcOperands > 1) { + for (int lane = 0; lane < VSZ; ++lane) { + ((CType*)m->x_data)[lane] = + this->src[1].template get<CType>(w, lane); + } + } + + assert(NumSrcOperands <= 2); + + m->m_op = this->opType; + m->m_type = DataType::memType; + m->v_type = DataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + if (HasDst) { + m->dst_reg = this->dest.regIndex(); + } + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->latency.set(w->computeUnit->shader->ticks(64)); + m->pipeId = GLBMEM_PIPE; + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_wr_lm++; + w->wr_lm_reqs_in_pipe--; + w->outstanding_reqs_rd_lm++; + w->rd_lm_reqs_in_pipe--; + break; + + default: + fatal("Atomic op to unsupported segment %d\n", + this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp); + + template<typename DataType, typename AddrRegOperandType, int NumSrcOperands, + bool HasDst> + void + AtomicInst<DataType, AddrRegOperandType, NumSrcOperands, + 

    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
             bool HasDst>
    void
    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
               HasDst>::generateDisassembly()
    {
        if (HasDst) {
            this->disassembly =
                csprintf("%s_%s_%s_%s %s,%s", this->opcode,
                         atomicOpToString(this->atomicOperation),
                         segmentNames[this->segment],
                         DataType::label, this->dest.disassemble(),
                         this->addr.disassemble());
        } else {
            this->disassembly =
                csprintf("%s_%s_%s_%s %s", this->opcode,
                         atomicOpToString(this->atomicOperation),
                         segmentNames[this->segment],
                         DataType::label, this->addr.disassemble());
        }

        for (int i = 0; i < NumSrcOperands; ++i) {
            this->disassembly += ",";
            this->disassembly += this->src[i].disassemble();
        }
    }
} // namespace HsailISA