diff options
28 files changed, 1205 insertions, 1064 deletions
diff --git a/src/arch/hsail/SConscript b/src/arch/hsail/SConscript index 3455823a6..251c103fd 100644 --- a/src/arch/hsail/SConscript +++ b/src/arch/hsail/SConscript @@ -43,7 +43,6 @@ if env['TARGET_GPU_ISA'] == 'hsail': env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'], 'gen.py', '$SOURCE $TARGETS') - Source('generic_types.cc') Source('gpu_decoder.cc') Source('insts/branch.cc') Source('insts/gen_exec.cc') diff --git a/src/arch/hsail/generic_types.cc b/src/arch/hsail/generic_types.cc deleted file mode 100644 index 0cd55d1d5..000000000 --- a/src/arch/hsail/generic_types.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "arch/hsail/generic_types.hh" -#include "base/misc.hh" - -using namespace Brig; - -namespace HsailISA -{ - Enums::GenericMemoryOrder - getGenericMemoryOrder(BrigMemoryOrder brig_memory_order) - { - switch(brig_memory_order) { - case BRIG_MEMORY_ORDER_NONE: - return Enums::MEMORY_ORDER_NONE; - case BRIG_MEMORY_ORDER_RELAXED: - return Enums::MEMORY_ORDER_RELAXED; - case BRIG_MEMORY_ORDER_SC_ACQUIRE: - return Enums::MEMORY_ORDER_SC_ACQUIRE; - case BRIG_MEMORY_ORDER_SC_RELEASE: - return Enums::MEMORY_ORDER_SC_RELEASE; - case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: - return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE; - default: - fatal("HsailISA::MemInst::getGenericMemoryOrder -> ", - "bad BrigMemoryOrder\n"); - } - } - - Enums::GenericMemoryScope - getGenericMemoryScope(BrigMemoryScope brig_memory_scope) - { - switch(brig_memory_scope) { - case BRIG_MEMORY_SCOPE_NONE: - return Enums::MEMORY_SCOPE_NONE; - case BRIG_MEMORY_SCOPE_WORKITEM: - return Enums::MEMORY_SCOPE_WORKITEM; - case BRIG_MEMORY_SCOPE_WORKGROUP: - return Enums::MEMORY_SCOPE_WORKGROUP; - case BRIG_MEMORY_SCOPE_AGENT: - return Enums::MEMORY_SCOPE_DEVICE; - case BRIG_MEMORY_SCOPE_SYSTEM: - return Enums::MEMORY_SCOPE_SYSTEM; - default: - fatal("HsailISA::MemInst::getGenericMemoryScope -> ", - "bad BrigMemoryScope\n"); - } - } -} // namespace HsailISA diff --git a/src/arch/hsail/generic_types.hh b/src/arch/hsail/generic_types.hh deleted file mode 100644 index 50e430bef..000000000 --- a/src/arch/hsail/generic_types.hh +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__ -#define __ARCH_HSAIL_GENERIC_TYPES_HH__ - -#include "arch/hsail/Brig.h" -#include "enums/GenericMemoryOrder.hh" -#include "enums/GenericMemoryScope.hh" - -namespace HsailISA -{ - Enums::GenericMemoryOrder - getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order); - Enums::GenericMemoryScope - getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope); -} // namespace HsailISA - -#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__ diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh index 45cd876ad..89bcc1277 100644 --- a/src/arch/hsail/insts/branch.hh +++ b/src/arch/hsail/insts/branch.hh @@ -59,16 +59,15 @@ namespace HsailISA BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) : HsailGPUStaticInst(obj, "brn") { - o_type = Enums::OT_BRANCH; + setFlag(Branch); + setFlag(UnconditionalJump); width = ((Brig::BrigInstBr*)ib)->width; unsigned op_offs = obj->getOperandPtr(ib->operands, 0); target.init(op_offs, obj); - o_type = Enums::OT_BRANCH; } uint32_t getTargetPc() override { return target.getTarget(0, 0); } - bool unconditionalJumpInstruction() override { return true; } bool isVectorRegister(int operandIndex) override { assert(operandIndex >= 0 && operandIndex < getNumOperands()); return target.isVectorRegister(); @@ -175,13 +174,12 @@ namespace HsailISA CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) : HsailGPUStaticInst(obj, "cbr") { - o_type = Enums::OT_BRANCH; + setFlag(Branch); width = ((Brig::BrigInstBr *)ib)->width; unsigned op_offs = obj->getOperandPtr(ib->operands, 0); cond.init(op_offs, obj); op_offs = obj->getOperandPtr(ib->operands, 1); target.init(op_offs, obj); - o_type = Enums::OT_BRANCH; } uint32_t getTargetPc() override { return target.getTarget(0, 0); } @@ -343,17 +341,15 @@ namespace HsailISA BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) : HsailGPUStaticInst(obj, "br") { - o_type = Enums::OT_BRANCH; + setFlag(Branch); + setFlag(UnconditionalJump); width.init(((Brig::BrigInstBr *)ib)->width, obj); unsigned op_offs = obj->getOperandPtr(ib->operands, 0); target.init(op_offs, obj); - o_type = Enums::OT_BRANCH; } uint32_t getTargetPc() override { return target.getTarget(0, 0); } - bool unconditionalJumpInstruction() override { return true; } - void execute(GPUDynInstPtr gpuDynInst) override; bool isVectorRegister(int operandIndex) override { assert(operandIndex >= 0 && operandIndex < getNumOperands()); diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh index 48e022ff7..94f23ac1f 100644 --- a/src/arch/hsail/insts/decl.hh +++ b/src/arch/hsail/insts/decl.hh @@ -38,11 +38,9 @@ #include <cmath> -#include "arch/hsail/generic_types.hh" #include "arch/hsail/insts/gpu_static_inst.hh" #include "arch/hsail/operand.hh" #include "debug/HSAIL.hh" -#include "enums/OpType.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -127,6 +125,8 @@ namespace HsailISA const char *opcode) : HsailGPUStaticInst(obj, opcode) { + setFlag(ALU); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); @@ -240,6 +240,8 @@ namespace HsailISA const char *opcode) : HsailGPUStaticInst(obj, opcode) { + setFlag(ALU); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); @@ -414,6 +416,8 @@ namespace HsailISA const BrigObject *obj, const char *opcode) : HsailGPUStaticInst(obj, opcode) { + setFlag(ALU); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); @@ -818,6 +822,8 @@ namespace HsailISA const BrigObject *obj, const char *_opcode) : HsailGPUStaticInst(obj, _opcode) { + setFlag(ALU); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); @@ -874,7 +880,7 @@ namespace HsailISA Ret(const Brig::BrigInstBase *ib, const BrigObject *obj) : Base(ib, obj, "ret") { - o_type = Enums::OT_RET; + setFlag(GPUStaticInst::Return); } void execute(GPUDynInstPtr gpuDynInst); @@ -889,7 +895,7 @@ namespace HsailISA Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj) : Base(ib, obj, "barrier") { - o_type = Enums::OT_BARRIER; + setFlag(GPUStaticInst::MemBarrier); assert(ib->base.kind == Brig::BRIG_KIND_INST_BR); width = (uint8_t)((Brig::BrigInstBr*)ib)->width; } @@ -924,14 +930,105 @@ namespace HsailISA memFenceMemOrder = (Brig::BrigMemoryOrder) ((Brig::BrigInstMemFence*)ib)->memoryOrder; - // set o_type based on scopes + setFlag(MemoryRef); + setFlag(GPUStaticInst::MemFence); + + switch (memFenceMemOrder) { + case Brig::BRIG_MEMORY_ORDER_NONE: + setFlag(NoOrder); + break; + case Brig::BRIG_MEMORY_ORDER_RELAXED: + setFlag(RelaxedOrder); + break; + case Brig::BRIG_MEMORY_ORDER_SC_ACQUIRE: + setFlag(Acquire); + break; + case Brig::BRIG_MEMORY_ORDER_SC_RELEASE: + setFlag(Release); + break; + case Brig::BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + setFlag(AcquireRelease); + break; + default: + fatal("MemInst has bad BrigMemoryOrder\n"); + } + + // set inst flags based on scopes if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE && memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { - o_type = Enums::OT_BOTH_MEMFENCE; + setFlag(GPUStaticInst::GlobalSegment); + + /** + * A memory fence that has scope for + * both segments will use the global + * segment, and be executed in the + * global memory pipeline, therefore, + * we set the segment to match the + * global scope only + */ + switch (memFenceScopeSegGlobal) { + case Brig::BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case Brig::BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case Brig::BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("MemFence has bad global scope type\n"); + } } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) { - o_type = Enums::OT_GLOBAL_MEMFENCE; + setFlag(GPUStaticInst::GlobalSegment); + + switch (memFenceScopeSegGlobal) { + case Brig::BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case Brig::BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case Brig::BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("MemFence has bad global scope type\n"); + } } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { - o_type = Enums::OT_SHARED_MEMFENCE; + setFlag(GPUStaticInst::GroupSegment); + + switch (memFenceScopeSegGroup) { + case Brig::BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case Brig::BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case Brig::BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("MemFence has bad group scope type\n"); + } } else { fatal("MemFence constructor: bad scope specifiers\n"); } @@ -955,18 +1052,13 @@ namespace HsailISA // etc.). We send a packet, tagged with the memory order and // scope, and let the GPU coalescer handle it. - if (o_type == Enums::OT_GLOBAL_MEMFENCE || - o_type == Enums::OT_BOTH_MEMFENCE) { + if (isGlobalSeg()) { gpuDynInst->simdId = w->simdId; gpuDynInst->wfSlotId = w->wfSlotId; gpuDynInst->wfDynId = w->wfDynId; gpuDynInst->kern_id = w->kernId; gpuDynInst->cu_id = w->computeUnit->cu_id; - gpuDynInst->memoryOrder = - getGenericMemoryOrder(memFenceMemOrder); - gpuDynInst->scope = - getGenericMemoryScope(memFenceScopeSegGlobal); gpuDynInst->useContinuation = false; GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe); gmp->getGMReqFIFO().push(gpuDynInst); @@ -975,10 +1067,10 @@ namespace HsailISA w->rdGmReqsInPipe--; w->memReqsInPipe--; w->outstandingReqs++; - } else if (o_type == Enums::OT_SHARED_MEMFENCE) { + } else if (isGroupSeg()) { // no-op } else { - fatal("MemFence execute: bad o_type\n"); + fatal("MemFence execute: bad op type\n"); } } }; @@ -1054,6 +1146,7 @@ namespace HsailISA Call(const Brig::BrigInstBase *ib, const BrigObject *obj) : HsailGPUStaticInst(obj, "call") { + setFlag(ALU); unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); op_offs = obj->getOperandPtr(ib->operands, 1); diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc index f1662430a..783689dd5 100644 --- a/src/arch/hsail/insts/main.cc +++ b/src/arch/hsail/insts/main.cc @@ -179,12 +179,13 @@ namespace HsailISA w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId); if (!refCount) { + setFlag(SystemScope); + setFlag(Release); + setFlag(GlobalSegment); // Notify Memory System of Kernel Completion // Kernel End = isKernel + isRelease w->status = Wavefront::S_RETURNING; GPUDynInstPtr local_mempacket = gpuDynInst; - local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE; - local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM; local_mempacket->useContinuation = false; local_mempacket->simdId = w->simdId; local_mempacket->wfSlotId = w->wfSlotId; diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc index 97d4c902b..6a6928838 100644 --- a/src/arch/hsail/insts/mem.cc +++ b/src/arch/hsail/insts/mem.cc @@ -36,7 +36,6 @@ #include "arch/hsail/insts/mem.hh" #include "arch/hsail/Brig.h" -#include "enums/OpType.hh" using namespace Brig; @@ -44,68 +43,6 @@ namespace HsailISA { const char* atomicOpToString(BrigAtomicOperation brigOp); - Enums::MemOpType - brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp) - { - if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) { - switch (brigOp) { - case BRIG_ATOMIC_AND: - return Enums::MO_AAND; - case BRIG_ATOMIC_OR: - return Enums::MO_AOR; - case BRIG_ATOMIC_XOR: - return Enums::MO_AXOR; - case BRIG_ATOMIC_CAS: - return Enums::MO_ACAS; - case BRIG_ATOMIC_EXCH: - return Enums::MO_AEXCH; - case BRIG_ATOMIC_ADD: - return Enums::MO_AADD; - case BRIG_ATOMIC_WRAPINC: - return Enums::MO_AINC; - case BRIG_ATOMIC_WRAPDEC: - return Enums::MO_ADEC; - case BRIG_ATOMIC_MIN: - return Enums::MO_AMIN; - case BRIG_ATOMIC_MAX: - return Enums::MO_AMAX; - case BRIG_ATOMIC_SUB: - return Enums::MO_ASUB; - default: - fatal("Bad BrigAtomicOperation code %d\n", brigOp); - } - } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) { - switch (brigOp) { - case BRIG_ATOMIC_AND: - return Enums::MO_ANRAND; - case BRIG_ATOMIC_OR: - return Enums::MO_ANROR; - case BRIG_ATOMIC_XOR: - return Enums::MO_ANRXOR; - case BRIG_ATOMIC_CAS: - return Enums::MO_ANRCAS; - case BRIG_ATOMIC_EXCH: - return Enums::MO_ANREXCH; - case BRIG_ATOMIC_ADD: - return Enums::MO_ANRADD; - case BRIG_ATOMIC_WRAPINC: - return Enums::MO_ANRINC; - case BRIG_ATOMIC_WRAPDEC: - return Enums::MO_ANRDEC; - case BRIG_ATOMIC_MIN: - return Enums::MO_ANRMIN; - case BRIG_ATOMIC_MAX: - return Enums::MO_ANRMAX; - case BRIG_ATOMIC_SUB: - return Enums::MO_ANRSUB; - default: - fatal("Bad BrigAtomicOperation code %d\n", brigOp); - } - } else { - fatal("Bad BrigAtomicOpcode %d\n", brigOpCode); - } - } - const char* atomicOpToString(BrigAtomicOperation brigOp) { diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh index acc8434be..e223c7cf5 100644 --- a/src/arch/hsail/insts/mem.hh +++ b/src/arch/hsail/insts/mem.hh @@ -96,6 +96,8 @@ namespace HsailISA { using namespace Brig; + setFlag(ALU); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); op_offs = obj->getOperandPtr(ib->operands, 1); @@ -211,143 +213,119 @@ namespace HsailISA Brig::BrigMemoryOrder memoryOrder; Brig::BrigMemoryScope memoryScope; unsigned int equivClass; - bool isArgLoad() - { - return segment == Brig::BRIG_SEGMENT_KERNARG || - segment == Brig::BRIG_SEGMENT_ARG; - } - void - initLd(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) + + LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) { using namespace Brig; - const BrigInstMem *ldst = (const BrigInstMem*)ib; + setFlag(MemoryRef); + setFlag(Load); - segment = (BrigSegment)ldst->segment; - memoryOrder = BRIG_MEMORY_ORDER_NONE; - memoryScope = BRIG_MEMORY_SCOPE_NONE; - equivClass = ldst->equivClass; + if (ib->opcode == BRIG_OPCODE_LD) { + const BrigInstMem *ldst = (const BrigInstMem*)ib; - switch (segment) { - case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_READ; - break; + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_READ; - break; + width = ldst->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); - case BRIG_SEGMENT_PRIVATE: - o_type = Enums::OT_PRIVATE_READ; - break; + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } else { + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; - case BRIG_SEGMENT_READONLY: - o_type = Enums::OT_READONLY_READ; - break; + segment = (BrigSegment)at->segment; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + memoryScope = (BrigMemoryScope)at->memoryScope; + equivClass = 0; - case BRIG_SEGMENT_SPILL: - o_type = Enums::OT_SPILL_READ; - break; + width = BRIG_WIDTH_1; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_READ; - break; + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); - case BRIG_SEGMENT_KERNARG: - o_type = Enums::OT_KERN_READ; - break; + op_offs = obj->getOperandPtr(ib->operands,1); + addr.init(op_offs, obj); + } - case BRIG_SEGMENT_ARG: - o_type = Enums::OT_ARG; + switch (memoryOrder) { + case BRIG_MEMORY_ORDER_NONE: + setFlag(NoOrder); + break; + case BRIG_MEMORY_ORDER_RELAXED: + setFlag(RelaxedOrder); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE: + setFlag(Acquire); + break; + case BRIG_MEMORY_ORDER_SC_RELEASE: + setFlag(Release); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + setFlag(AcquireRelease); break; - default: - panic("Ld: segment %d not supported\n", segment); + fatal("LdInst has bad memory order type\n"); } - width = ldst->width; - unsigned op_offs = obj->getOperandPtr(ib->operands, 0); - const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); - if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) - dest.init(op_offs, obj); - - op_offs = obj->getOperandPtr(ib->operands, 1); - addr.init(op_offs, obj); - } - - void - initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) - { - using namespace Brig; - - const BrigInstAtomic *at = (const BrigInstAtomic*)ib; - - segment = (BrigSegment)at->segment; - memoryOrder = (BrigMemoryOrder)at->memoryOrder; - memoryScope = (BrigMemoryScope)at->memoryScope; - equivClass = 0; + switch (memoryScope) { + case BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("LdInst has bad memory scope type\n"); + } switch (segment) { case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_READ; + setFlag(GlobalSegment); break; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_READ; + setFlag(GroupSegment); break; - case BRIG_SEGMENT_PRIVATE: - o_type = Enums::OT_PRIVATE_READ; + setFlag(PrivateSegment); break; - case BRIG_SEGMENT_READONLY: - o_type = Enums::OT_READONLY_READ; + setFlag(ReadOnlySegment); break; - case BRIG_SEGMENT_SPILL: - o_type = Enums::OT_SPILL_READ; + setFlag(SpillSegment); break; - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_READ; + setFlag(Flat); break; - case BRIG_SEGMENT_KERNARG: - o_type = Enums::OT_KERN_READ; + setFlag(KernArgSegment); break; - case BRIG_SEGMENT_ARG: - o_type = Enums::OT_ARG; + setFlag(ArgSegment); break; - default: panic("Ld: segment %d not supported\n", segment); } - - width = BRIG_WIDTH_1; - unsigned op_offs = obj->getOperandPtr(ib->operands, 0); - const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); - - if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) - dest.init(op_offs, obj); - - op_offs = obj->getOperandPtr(ib->operands,1); - addr.init(op_offs, obj); - } - - LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) - : HsailGPUStaticInst(obj, _opcode) - { - using namespace Brig; - - if (ib->opcode == BRIG_OPCODE_LD) { - initLd(ib, obj, _opcode); - } else { - initAtomicLd(ib, obj, _opcode); - } } int numSrcRegOperands() override @@ -473,7 +451,7 @@ namespace HsailISA if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); - if (isLocalMem()) { + if (this->isLocalMem()) { // load from shared memory *d = gpuDynInst->wavefront()->ldsChunk-> read<c0>(vaddr); @@ -488,8 +466,7 @@ namespace HsailISA if (gpuDynInst->computeUnit()->shader-> separate_acquire_release && - gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->isAcquire()) { // if this load has acquire semantics, // set the response continuation function // to perform an Acquire request @@ -520,10 +497,9 @@ namespace HsailISA { // after the load has complete and if the load has acquire // semantics, issue an acquire request. - if (!isLocalMem()) { + if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release - && gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE) { + && gpuDynInst->isAcquire()) { gpuDynInst->statusBitVector = VectorMask(1); gpuDynInst->useContinuation = false; // create request @@ -537,12 +513,6 @@ namespace HsailISA } public: - bool - isLocalMem() const override - { - return this->segment == Brig::BRIG_SEGMENT_GROUP; - } - bool isVectorRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); @@ -731,128 +701,113 @@ namespace HsailISA Brig::BrigMemoryOrder memoryOrder; unsigned int equivClass; - void - initSt(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) + StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) { using namespace Brig; - const BrigInstMem *ldst = (const BrigInstMem*)ib; + setFlag(MemoryRef); + setFlag(Store); - segment = (BrigSegment)ldst->segment; - memoryOrder = BRIG_MEMORY_ORDER_NONE; - memoryScope = BRIG_MEMORY_SCOPE_NONE; - equivClass = ldst->equivClass; + if (ib->opcode == BRIG_OPCODE_ST) { + const BrigInstMem *ldst = (const BrigInstMem*)ib; - switch (segment) { - case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_WRITE; - break; + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_WRITE; - break; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const BrigOperand *baseOp = obj->getOperand(op_offs); - case BRIG_SEGMENT_PRIVATE: - o_type = Enums::OT_PRIVATE_WRITE; - break; + if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || + (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { + src.init(op_offs, obj); + } - case BRIG_SEGMENT_READONLY: - o_type = Enums::OT_READONLY_WRITE; - break; + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } else { + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; - case BRIG_SEGMENT_SPILL: - o_type = Enums::OT_SPILL_WRITE; - break; + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + equivClass = 0; - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_WRITE; - break; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); - case BRIG_SEGMENT_ARG: - o_type = Enums::OT_ARG; - break; + op_offs = obj->getOperandPtr(ib->operands, 1); + src.init(op_offs, obj); + } + switch (memoryOrder) { + case BRIG_MEMORY_ORDER_NONE: + setFlag(NoOrder); + break; + case BRIG_MEMORY_ORDER_RELAXED: + setFlag(RelaxedOrder); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE: + setFlag(Acquire); + break; + case BRIG_MEMORY_ORDER_SC_RELEASE: + setFlag(Release); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + setFlag(AcquireRelease); + break; default: - panic("St: segment %d not supported\n", segment); + fatal("StInst has bad memory order type\n"); } - unsigned op_offs = obj->getOperandPtr(ib->operands, 0); - const BrigOperand *baseOp = obj->getOperand(op_offs); - - if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || - (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { - src.init(op_offs, obj); + switch (memoryScope) { + case BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("StInst has bad memory scope type\n"); } - op_offs = obj->getOperandPtr(ib->operands, 1); - addr.init(op_offs, obj); - } - - void - initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) - { - using namespace Brig; - - const BrigInstAtomic *at = (const BrigInstAtomic*)ib; - - segment = (BrigSegment)at->segment; - memoryScope = (BrigMemoryScope)at->memoryScope; - memoryOrder = (BrigMemoryOrder)at->memoryOrder; - equivClass = 0; - switch (segment) { case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_WRITE; + setFlag(GlobalSegment); break; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_WRITE; + setFlag(GroupSegment); break; - case BRIG_SEGMENT_PRIVATE: - o_type = Enums::OT_PRIVATE_WRITE; + setFlag(PrivateSegment); break; - case BRIG_SEGMENT_READONLY: - o_type = Enums::OT_READONLY_WRITE; + setFlag(ReadOnlySegment); break; - case BRIG_SEGMENT_SPILL: - o_type = Enums::OT_SPILL_WRITE; + setFlag(SpillSegment); break; - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_WRITE; + setFlag(Flat); break; - case BRIG_SEGMENT_ARG: - o_type = Enums::OT_ARG; + setFlag(ArgSegment); break; - default: panic("St: segment %d not supported\n", segment); } - - unsigned op_offs = obj->getOperandPtr(ib->operands, 0); - addr.init(op_offs, obj); - - op_offs = obj->getOperandPtr(ib->operands, 1); - src.init(op_offs, obj); - } - - StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) - : HsailGPUStaticInst(obj, _opcode) - { - using namespace Brig; - - if (ib->opcode == BRIG_OPCODE_ST) { - initSt(ib, obj, _opcode); - } else { - initAtomicSt(ib, obj, _opcode); - } } int numDstRegOperands() override { return 0; } @@ -964,10 +919,9 @@ namespace HsailISA { // before performing a store, check if this store has // release semantics, and if so issue a release first - if (!isLocalMem()) { + if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release - && gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_RELEASE) { + && gpuDynInst->isRelease()) { gpuDynInst->statusBitVector = VectorMask(1); gpuDynInst->execContinuation = &GPUStaticInst::execSt; @@ -987,12 +941,6 @@ namespace HsailISA execSt(gpuDynInst); } - bool - isLocalMem() const override - { - return this->segment == Brig::BRIG_SEGMENT_GROUP; - } - private: // execSt may be called through a continuation // if the store had release semantics. see comment for @@ -1020,7 +968,7 @@ namespace HsailISA if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); - if (isLocalMem()) { + if (this->isLocalMem()) { //store to shared memory gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr, *d); @@ -1166,9 +1114,6 @@ namespace HsailISA } } - Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode, - Brig::BrigAtomicOperation brigOp); - template<typename OperandType, typename AddrOperandType, int NumSrcOperands, bool HasDst> class AtomicInstBase : public HsailGPUStaticInst @@ -1183,7 +1128,6 @@ namespace HsailISA Brig::BrigAtomicOperation atomicOperation; Brig::BrigMemoryScope memoryScope; Brig::BrigOpcode opcode; - Enums::MemOpType opType; AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode) @@ -1198,21 +1142,106 @@ namespace HsailISA memoryOrder = (BrigMemoryOrder)at->memoryOrder; atomicOperation = (BrigAtomicOperation)at->atomicOperation; opcode = (BrigOpcode)ib->opcode; - opType = brigAtomicToMemOpType(opcode, atomicOperation); + + assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET || + opcode == Brig::BRIG_OPCODE_ATOMIC); + + setFlag(MemoryRef); + + if (opcode == Brig::BRIG_OPCODE_ATOMIC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + + switch (memoryOrder) { + case BRIG_MEMORY_ORDER_NONE: + setFlag(NoOrder); + break; + case BRIG_MEMORY_ORDER_RELAXED: + setFlag(RelaxedOrder); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE: + setFlag(Acquire); + break; + case BRIG_MEMORY_ORDER_SC_RELEASE: + setFlag(Release); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + setFlag(AcquireRelease); + break; + default: + fatal("AtomicInst has bad memory order type\n"); + } + + switch (memoryScope) { + case BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("AtomicInst has bad memory scope type\n"); + } + + switch (atomicOperation) { + case Brig::BRIG_ATOMIC_AND: + setFlag(AtomicAnd); + break; + case Brig::BRIG_ATOMIC_OR: + setFlag(AtomicOr); + break; + case Brig::BRIG_ATOMIC_XOR: + setFlag(AtomicXor); + break; + case Brig::BRIG_ATOMIC_CAS: + setFlag(AtomicCAS); + break; + case Brig::BRIG_ATOMIC_EXCH: + setFlag(AtomicExch); + break; + case Brig::BRIG_ATOMIC_ADD: + setFlag(AtomicAdd); + break; + case Brig::BRIG_ATOMIC_WRAPINC: + setFlag(AtomicInc); + break; + case Brig::BRIG_ATOMIC_WRAPDEC: + setFlag(AtomicDec); + break; + case Brig::BRIG_ATOMIC_MIN: + setFlag(AtomicMin); + break; + case Brig::BRIG_ATOMIC_MAX: + setFlag(AtomicMax); + break; + case Brig::BRIG_ATOMIC_SUB: + setFlag(AtomicSub); + break; + default: + fatal("Bad BrigAtomicOperation code %d\n", atomicOperation); + } switch (segment) { case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_ATOMIC; + setFlag(GlobalSegment); break; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_ATOMIC; + setFlag(GroupSegment); break; - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_ATOMIC; + setFlag(Flat); break; - default: panic("Atomic: segment %d not supported\n", segment); } @@ -1354,11 +1383,10 @@ namespace HsailISA { // before doing the RMW, check if this atomic has // release semantics, and if so issue a release first - if (!isLocalMem()) { + if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release - && (gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) { + && (gpuDynInst->isRelease() + || gpuDynInst->isAcquireRelease())) { gpuDynInst->statusBitVector = VectorMask(1); @@ -1383,12 +1411,6 @@ namespace HsailISA void execute(GPUDynInstPtr gpuDynInst) override; - bool - isLocalMem() const override - { - return this->segment == Brig::BRIG_SEGMENT_GROUP; - } - private: // execAtomic may be called through a continuation // if the RMW had release semantics. see comment for @@ -1408,72 +1430,48 @@ namespace HsailISA if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i]; - if (isLocalMem()) { + if (this->isLocalMem()) { Wavefront *wavefront = gpuDynInst->wavefront(); *d = wavefront->ldsChunk->read<c0>(vaddr); - switch (this->opType) { - case Enums::MO_AADD: - case Enums::MO_ANRADD: + if (this->isAtomicAdd()) { wavefront->ldsChunk->write<c0>(vaddr, wavefront->ldsChunk->read<c0>(vaddr) + (*e)); - break; - case Enums::MO_ASUB: - case Enums::MO_ANRSUB: + } else if (this->isAtomicSub()) { wavefront->ldsChunk->write<c0>(vaddr, wavefront->ldsChunk->read<c0>(vaddr) - (*e)); - break; - case Enums::MO_AMAX: - case Enums::MO_ANRMAX: + } else if (this->isAtomicMax()) { wavefront->ldsChunk->write<c0>(vaddr, std::max(wavefront->ldsChunk->read<c0>(vaddr), (*e))); - break; - case Enums::MO_AMIN: - case Enums::MO_ANRMIN: + } else if (this->isAtomicMin()) { wavefront->ldsChunk->write<c0>(vaddr, std::min(wavefront->ldsChunk->read<c0>(vaddr), (*e))); - break; - case Enums::MO_AAND: - case Enums::MO_ANRAND: + } else if (this->isAtomicAnd()) { wavefront->ldsChunk->write<c0>(vaddr, wavefront->ldsChunk->read<c0>(vaddr) & (*e)); - break; - case Enums::MO_AOR: - case Enums::MO_ANROR: + } else if (this->isAtomicOr()) { wavefront->ldsChunk->write<c0>(vaddr, wavefront->ldsChunk->read<c0>(vaddr) | (*e)); - break; - case Enums::MO_AXOR: - case Enums::MO_ANRXOR: + } else if (this->isAtomicXor()) { wavefront->ldsChunk->write<c0>(vaddr, wavefront->ldsChunk->read<c0>(vaddr) ^ (*e)); - break; - case Enums::MO_AINC: - case Enums::MO_ANRINC: + } else if (this->isAtomicInc()) { wavefront->ldsChunk->write<c0>(vaddr, wavefront->ldsChunk->read<c0>(vaddr) + 1); - break; - case Enums::MO_ADEC: - case Enums::MO_ANRDEC: + } else if (this->isAtomicDec()) { wavefront->ldsChunk->write<c0>(vaddr, wavefront->ldsChunk->read<c0>(vaddr) - 1); - break; - case Enums::MO_AEXCH: - case Enums::MO_ANREXCH: + } else if (this->isAtomicExch()) { wavefront->ldsChunk->write<c0>(vaddr, (*e)); - break; - case Enums::MO_ACAS: - case Enums::MO_ANRCAS: + } else if (this->isAtomicCAS()) { wavefront->ldsChunk->write<c0>(vaddr, (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ? (*f) : wavefront->ldsChunk->read<c0>(vaddr)); - break; - default: + } else { fatal("Unrecognized or invalid HSAIL atomic op " "type.\n"); - break; } } else { Request *req = @@ -1481,7 +1479,7 @@ namespace HsailISA gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId, gpuDynInst->makeAtomicOpFunctor<c0>(e, - f, this->opType)); + f)); gpuDynInst->setRequestFlags(req); PacketPtr pkt = new Packet(req, MemCmd::SwapReq); @@ -1489,8 +1487,7 @@ namespace HsailISA if (gpuDynInst->computeUnit()->shader-> separate_acquire_release && - (gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE)) { + (gpuDynInst->isAcquire())) { // if this atomic has acquire semantics, // schedule the continuation to perform an // acquire after the RMW completes @@ -1523,10 +1520,9 @@ namespace HsailISA { // after performing the RMW, check to see if this instruction // has acquire semantics, and if so, issue an acquire - if (!isLocalMem()) { + if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release - && gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE) { + && gpuDynInst->isAcquire()) { gpuDynInst->statusBitVector = VectorMask(1); // the request will be finished when diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh index e3529f914..c175f2782 100644 --- a/src/arch/hsail/insts/mem_impl.hh +++ b/src/arch/hsail/insts/mem_impl.hh @@ -33,7 +33,6 @@ * Author: Steve Reinhardt */ -#include "arch/hsail/generic_types.hh" #include "gpu-compute/hsail_code.hh" // defined in code.cc, but not worth sucking in all of code.h for this @@ -215,16 +214,12 @@ namespace HsailISA this->addr.calcVector(w, m->addr); - m->m_op = Enums::MO_LD; m->m_type = MemDataType::memType; m->v_type = DestDataType::vgprType; m->exec_mask = w->execMask(); m->statusBitVector = 0; m->equiv = this->equivClass; - m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); - - m->scope = getGenericMemoryScope(this->memoryScope); if (num_dest_operands == 1) { m->dst_reg = this->dest.regIndex(); @@ -245,7 +240,6 @@ namespace HsailISA switch (this->segment) { case Brig::BRIG_SEGMENT_GLOBAL: - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); @@ -276,7 +270,6 @@ namespace HsailISA case Brig::BRIG_SEGMENT_SPILL: assert(num_dest_operands == 1); - m->s_type = SEG_SPILL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { @@ -301,7 +294,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_GROUP: - m->s_type = SEG_SHARED; m->pipeId = LDSMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(24)); w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); @@ -310,7 +302,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_READONLY: - m->s_type = SEG_READONLY; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); @@ -327,7 +318,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_PRIVATE: - m->s_type = SEG_PRIVATE; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { @@ -408,7 +398,6 @@ namespace HsailISA } } - m->m_op = Enums::MO_ST; m->m_type = OperationType::memType; m->v_type = OperationType::vgprType; @@ -421,10 +410,6 @@ namespace HsailISA m->n_reg = num_src_operands; } - m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); - - m->scope = getGenericMemoryScope(this->memoryScope); - m->simdId = w->simdId; m->wfSlotId = w->wfSlotId; m->wfDynId = w->wfDynId; @@ -434,7 +419,6 @@ namespace HsailISA switch (this->segment) { case Brig::BRIG_SEGMENT_GLOBAL: - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); @@ -463,7 +447,6 @@ namespace HsailISA case Brig::BRIG_SEGMENT_SPILL: assert(num_src_operands == 1); - m->s_type = SEG_SPILL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { @@ -483,7 +466,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_GROUP: - m->s_type = SEG_SHARED; m->pipeId = LDSMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(24)); w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); @@ -492,7 +474,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_PRIVATE: - m->s_type = SEG_PRIVATE; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { @@ -586,7 +567,6 @@ namespace HsailISA assert(NumSrcOperands <= 2); - m->m_op = this->opType; m->m_type = DataType::memType; m->v_type = DataType::vgprType; @@ -594,9 +574,6 @@ namespace HsailISA m->statusBitVector = 0; m->equiv = 0; // atomics don't have an equivalence class operand m->n_reg = 1; - m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); - - m->scope = getGenericMemoryScope(this->memoryScope); if (HasDst) { m->dst_reg = this->dest.regIndex(); @@ -611,7 +588,6 @@ namespace HsailISA switch (this->segment) { case Brig::BRIG_SEGMENT_GLOBAL: - m->s_type = SEG_GLOBAL; m->latency.set(w->computeUnit->shader->ticks(64)); m->pipeId = GLBMEM_PIPE; @@ -623,7 +599,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_GROUP: - m->s_type = SEG_SHARED; m->pipeId = LDSMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(24)); w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc index 2bfc5aaad..bfffb7d8f 100644 --- a/src/arch/hsail/insts/pseudo_inst.cc +++ b/src/arch/hsail/insts/pseudo_inst.cc @@ -627,8 +627,12 @@ namespace HsailISA ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3); } - m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, - Brig::BRIG_ATOMIC_ADD); + setFlag(AtomicNoReturn); + setFlag(AtomicAdd); + setFlag(NoScope); + setFlag(NoOrder); + setFlag(GlobalSegment); + m->m_type = U32::memType; m->v_type = U32::vgprType; @@ -636,15 +640,12 @@ namespace HsailISA m->statusBitVector = 0; m->equiv = 0; // atomics don't have an equivalence class operand m->n_reg = 1; - m->memoryOrder = Enums::MEMORY_ORDER_NONE; - m->scope = Enums::MEMORY_SCOPE_NONE; m->simdId = w->simdId; m->wfSlotId = w->wfSlotId; m->wfDynId = w->wfDynId; m->latency.init(&w->computeUnit->shader->tick_cnt); - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(64)); w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); @@ -666,8 +667,12 @@ namespace HsailISA ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1); } - m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, - Brig::BRIG_ATOMIC_ADD); + setFlag(AtomicNoReturn); + setFlag(AtomicAdd); + setFlag(NoScope); + setFlag(NoOrder); + setFlag(GlobalSegment); + m->m_type = U32::memType; m->v_type = U32::vgprType; @@ -675,15 +680,12 @@ namespace HsailISA m->statusBitVector = 0; m->equiv = 0; // atomics don't have an equivalence class operand m->n_reg = 1; - m->memoryOrder = Enums::MEMORY_ORDER_NONE; - m->scope = Enums::MEMORY_SCOPE_NONE; m->simdId = w->simdId; m->wfSlotId = w->wfSlotId; m->wfDynId = w->wfDynId; m->latency.init(&w->computeUnit->shader->tick_cnt); - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(64)); w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); @@ -702,7 +704,11 @@ namespace HsailISA // calculate the address calcAddr(w, m); - m->m_op = Enums::MO_LD; + setFlag(Load); + setFlag(NoScope); + setFlag(NoOrder); + setFlag(GlobalSegment); + m->m_type = U32::memType; //MemDataType::memType; m->v_type = U32::vgprType; //DestDataType::vgprType; @@ -710,8 +716,6 @@ namespace HsailISA m->statusBitVector = 0; m->equiv = 0; m->n_reg = 1; - m->memoryOrder = Enums::MEMORY_ORDER_NONE; - m->scope = Enums::MEMORY_SCOPE_NONE; // FIXME //m->dst_reg = this->dest.regIndex(); @@ -721,7 +725,6 @@ namespace HsailISA m->wfDynId = w->wfDynId; m->latency.init(&w->computeUnit->shader->tick_cnt); - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index f580a09f7..b672f616c 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -171,56 +171,6 @@ class GpuDispatcher(DmaDevice): cl_driver = Param.ClDriver('pointer to driver') -class OpType(Enum): vals = [ - 'OT_NULL', - 'OT_ALU', - 'OT_SPECIAL', - 'OT_GLOBAL_READ', - 'OT_GLOBAL_WRITE', - 'OT_GLOBAL_ATOMIC', - 'OT_GLOBAL_HIST', - 'OT_GLOBAL_LDAS', - 'OT_SHARED_READ', - 'OT_SHARED_WRITE', - 'OT_SHARED_ATOMIC', - 'OT_SHARED_HIST', - 'OT_SHARED_LDAS', - 'OT_PRIVATE_READ', - 'OT_PRIVATE_WRITE', - 'OT_PRIVATE_ATOMIC', - 'OT_PRIVATE_HIST', - 'OT_PRIVATE_LDAS', - 'OT_SPILL_READ', - 'OT_SPILL_WRITE', - 'OT_SPILL_ATOMIC', - 'OT_SPILL_HIST', - 'OT_SPILL_LDAS', - 'OT_READONLY_READ', - 'OT_READONLY_WRITE', - 'OT_READONLY_ATOMIC', - 'OT_READONLY_HIST', - 'OT_READONLY_LDAS', - 'OT_FLAT_READ', - 'OT_FLAT_WRITE', - 'OT_FLAT_ATOMIC', - 'OT_FLAT_HIST', - 'OT_FLAT_LDAS', - 'OT_KERN_READ', - 'OT_BRANCH', - - # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version - # of the compiler. - 'OT_SHARED_MEMFENCE', - 'OT_GLOBAL_MEMFENCE', - 'OT_BOTH_MEMFENCE', - - 'OT_BARRIER', - 'OT_PRINT', - 'OT_RET', - 'OT_NOP', - 'OT_ARG' - ] - class MemType(Enum): vals = [ 'M_U8', 'M_U16', @@ -235,47 +185,6 @@ class MemType(Enum): vals = [ 'M_F64', ] -class MemOpType(Enum): vals = [ - 'MO_LD', - 'MO_ST', - 'MO_LDAS', - 'MO_LDA', - 'MO_AAND', - 'MO_AOR', - 'MO_AXOR', - 'MO_ACAS', - 'MO_AEXCH', - 'MO_AADD', - 'MO_ASUB', - 'MO_AINC', - 'MO_ADEC', - 'MO_AMAX', - 'MO_AMIN', - 'MO_ANRAND', - 'MO_ANROR', - 'MO_ANRXOR', - 'MO_ANRCAS', - 'MO_ANREXCH', - 'MO_ANRADD', - 'MO_ANRSUB', - 'MO_ANRINC', - 'MO_ANRDEC', - 'MO_ANRMAX', - 'MO_ANRMIN', - 'MO_HAND', - 'MO_HOR', - 'MO_HXOR', - 'MO_HCAS', - 'MO_HEXCH', - 'MO_HADD', - 'MO_HSUB', - 'MO_HINC', - 'MO_HDEC', - 'MO_HMAX', - 'MO_HMIN', - 'MO_UNDEF' - ] - class StorageClassType(Enum): vals = [ 'SC_SPILL', 'SC_GLOBAL', @@ -293,20 +202,3 @@ class RegisterType(Enum): vals = [ 'RT_HARDWARE', 'RT_NONE', ] - -class GenericMemoryOrder(Enum): vals = [ - 'MEMORY_ORDER_NONE', - 'MEMORY_ORDER_RELAXED', - 'MEMORY_ORDER_SC_ACQUIRE', - 'MEMORY_ORDER_SC_RELEASE', - 'MEMORY_ORDER_SC_ACQUIRE_RELEASE', - ] - -class GenericMemoryScope(Enum): vals = [ - 'MEMORY_SCOPE_NONE', - 'MEMORY_SCOPE_WORKITEM', - 'MEMORY_SCOPE_WAVEFRONT', - 'MEMORY_SCOPE_WORKGROUP', - 'MEMORY_SCOPE_DEVICE', - 'MEMORY_SCOPE_SYSTEM', - ] diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py new file mode 100644 index 000000000..453fdced2 --- /dev/null +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -0,0 +1,111 @@ +# Copyright (c) 2016 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Anthony Gutierrez + +from m5.params import * + +class GPUStaticInstFlags(Enum): + wrapper_name = 'GPUStaticInstFlags' + wrapper_is_struct = True + enum_name = 'Flags' + + vals = [ + # Op types + 'ALU', # ALU op + 'Branch', # Branch instruction + 'Nop', # No-op (no effect at all) + 'Return', # Return instruction + 'UnconditionalJump', # + 'SpecialOp', # Special op + 'Waitcnt', # Is a waitcnt instruction + + # Memory ops + 'MemBarrier', # Barrier instruction + 'MemFence', # Memory fence instruction + 'MemoryRef', # References memory (load, store, or atomic) + 'Flat', # Flat memory op + 'Load', # Reads from memory + 'Store', # Writes to memory + + # Atomic ops + 'AtomicReturn', # Atomic instruction that returns data + 'AtomicNoReturn', # Atomic instruction that doesn't return data + + # Instruction attributes + 'Scalar', # A scalar (not vector) operation + 'ReadsSCC', # The instruction reads SCC + 'WritesSCC', # The instruction writes SCC + 'ReadsVCC', # The instruction reads VCC + 'WritesVCC', # The instruction writes VCC + + # Atomic OP types + 'AtomicAnd', + 'AtomicOr', + 'AtomicXor', + 'AtomicCAS', + 'AtomicExch', + 'AtomicAdd', + 'AtomicSub', + 'AtomicInc', + 'AtomicDec', + 'AtomicMax', + 'AtomicMin', + + # Memory order flags + 'RelaxedOrder', + 'Acquire', # Has acquire semantics + 'Release', # Has release semantics + 'AcquireRelease', # Has acquire and release semantics + 'NoOrder', # Has no ordering restrictions + + # Segment access flags + 'ArgSegment', # Accesses the arg segment + 'GlobalSegment', # Accesses global memory + 'GroupSegment', # Accesses local memory (LDS), aka shared memory + 'KernArgSegment', # Accesses the kernel argument segment + 'PrivateSegment', # Accesses the private segment + 'ReadOnlySegment', # Accesses read only memory + 'SpillSegment', # Accesses the spill segment + 'NoSegment', # Does not have an associated segment + + # Scope flags + 'WorkitemScope', + 'WavefrontScope', + 'WorkgroupScope', + 'DeviceScope', + 'SystemScope', + 'NoScope', # Does not have an associated scope + + # Coherence flags + 'GloballyCoherent', # Coherent with other workitems on same device + 'SystemCoherent' # Coherent with a different device, or the host + ] diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index 88c1cf036..8cf1ed8cf 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -41,6 +41,7 @@ if not env['BUILD_GPU']: Return() SimObject('GPU.py') +SimObject('GPUStaticInstFlags.py') SimObject('LdsState.py') SimObject('X86GPUTLB.py') diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh deleted file mode 100644 index 6cd9bfe26..000000000 --- a/src/gpu-compute/code_enums.hh +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2015 Advanced Micro Devices, Inc. - * All rights reserved. - * - * For use for simulation and test purposes only - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * Author: Anthony Gutierrez - */ - -#ifndef __CODE_ENUMS_HH__ -#define __CODE_ENUMS_HH__ - -#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \ - && (a)<=Enums::OT_GLOBAL_LDAS) -#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \ - && (a)<=Enums::OT_SHARED_LDAS) -#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \ - && (a)<=Enums::OT_PRIVATE_LDAS) -#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \ - && (a)<=Enums::OT_SPILL_LDAS) -#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \ - && (a)<=Enums::OT_READONLY_LDAS) -#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS) - -#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \ - ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \ - ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS) - -#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \ - ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \ - ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ) - -#define IS_OT_READ_GM(a) \ - ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \ - ||(a)==Enums::OT_READONLY_READ) - -#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ) - -#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ) - -#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ) - -#define IS_OT_WRITE(a) \ - ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \ - ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \ - ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE) - -#define IS_OT_WRITE_GM(a) \ - ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \ - ||(a)==Enums::OT_READONLY_WRITE) - -#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE) - -#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE) - -#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ - ||(a)==Enums::OT_SHARED_ATOMIC \ - ||(a)==Enums::OT_PRIVATE_ATOMIC \ - ||(a)==Enums::OT_SPILL_ATOMIC \ - ||(a)==Enums::OT_READONLY_ATOMIC \ - ||(a)==Enums::OT_BOTH_MEMFENCE \ - ||(a)==Enums::OT_FLAT_ATOMIC) - -#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ - ||(a)==Enums::OT_SPILL_ATOMIC \ - ||(a)==Enums::OT_READONLY_ATOMIC \ - ||(a)==Enums::OT_GLOBAL_MEMFENCE \ - ||(a)==Enums::OT_BOTH_MEMFENCE) - -#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \ - ||(a)==Enums::OT_SHARED_MEMFENCE) - -#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC) - -#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \ - ||(a)==Enums::OT_SHARED_HIST \ - ||(a)==Enums::OT_PRIVATE_HIST \ - ||(a)==Enums::OT_SPILL_HIST \ - ||(a)==Enums::OT_READONLY_HIST \ - ||(a)==Enums::OT_FLAT_HIST) - -#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \ - ||(a)==Enums::OT_SPILL_HIST \ - ||(a)==Enums::OT_READONLY_HIST) - -#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST) - -#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST) - -#endif // __CODE_ENUMS_HH__ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 97e018713..abf8ff2c5 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -75,7 +75,8 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(name() + ".ComputeUnit")), - lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) + lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize), + kernelLaunchInst(new KernelLaunchStaticInst()) { /** * This check is necessary because std::bitset only provides conversion @@ -316,13 +317,11 @@ ComputeUnit::StartWorkgroup(NDRange *ndr) // Send L1 cache acquire // isKernel + isAcquire = Kernel Begin if (shader->impl_kern_boundary_sync) { - GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this, - nullptr, - nullptr, 0); + GPUDynInstPtr gpuDynInst = + std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst, + getAndIncSeqNum()); gpuDynInst->useContinuation = false; - gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; - gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; injectGlobalMemFence(gpuDynInst, true); } @@ -647,7 +646,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) gpuDynInst->wfSlotId, w->barrierCnt); if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -658,7 +657,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) return true; } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -942,6 +941,8 @@ void ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, Request* req) { + assert(gpuDynInst->isGlobalSeg()); + if (!req) { req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId); } @@ -950,8 +951,6 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, req->setFlags(Request::KERNEL); } - gpuDynInst->s_type = SEG_GLOBAL; - // for non-kernel MemFence operations, memorder flags are set depending // on which type of request is currently being sent, so this // should be set by the caller (e.g. if an inst has acq-rel @@ -1033,8 +1032,7 @@ ComputeUnit::DataPort::MemRespEvent::process() if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) gpuDynInst->statusVector.clear(); - if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op) - || MO_ANR(gpuDynInst->m_op)) { + if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); compute_unit->globalMemoryPipe.getGMLdRespFIFO() @@ -1055,7 +1053,7 @@ ComputeUnit::DataPort::MemRespEvent::process() // the continuation may generate more work for // this memory request if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -1065,7 +1063,7 @@ ComputeUnit::DataPort::MemRespEvent::process() gpuDynInst->statusBitVector = VectorMask(0); if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index a3547402a..938658fd1 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -744,6 +744,7 @@ class ComputeUnit : public MemObject private: uint64_t globalSeqNum; int wavefrontSize; + GPUStaticInst *kernelLaunchInst; }; #endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 102905ec8..ab3e8c47e 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -67,7 +67,7 @@ GlobalMemPipeline::exec() bool accessVrf = true; // check the VRF to see if the operands of a load (or load component // of an atomic) are accessible - if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + if ((m) && (m->isLoad() || m->isAtomicRet())) { Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; accessVrf = @@ -127,10 +127,7 @@ GlobalMemPipeline::exec() // memory packets to DTLB if (!gmIssuedRequests.empty()) { GPUDynInstPtr mp = gmIssuedRequests.front(); - if (mp->m_op == Enums::MO_LD || - (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) || - (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) { - + if (mp->isLoad() || mp->isAtomic()) { if (inflightLoads >= gmQueueSize) { return; } else { @@ -139,7 +136,7 @@ GlobalMemPipeline::exec() } else { if (inflightStores >= gmQueueSize) { return; - } else if (mp->m_op == Enums::MO_ST) { + } else if (mp->isStore()) { ++inflightStores; } } @@ -147,9 +144,8 @@ GlobalMemPipeline::exec() mp->initiateAcc(mp); gmIssuedRequests.pop(); - DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n", - computeUnit->cu_id, mp->simdId, mp->wfSlotId, - Enums::MemOpTypeStrings[mp->m_op]); + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId); } } @@ -160,12 +156,12 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; // Return data to registers - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { gmReturnedLoads.pop(); assert(inflightLoads > 0); --inflightLoads; - if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + if (m->isLoad() || m->isAtomicRet()) { std::vector<uint32_t> regVec; // iterate over number of destination register operands since // this is a load or atomic operation @@ -214,13 +210,12 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) // Decrement outstanding register count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) || - MO_H(m->m_op)) { + if (m->isStore() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time, -1); } - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time, -1); } diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 1806e79e4..ec6340360 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -41,11 +41,10 @@ #include "gpu-compute/wavefront.hh" GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, - GPUStaticInst *_staticInst, uint64_t instSeqNum) + GPUStaticInst *static_inst, uint64_t instSeqNum) : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0), - m_op(Enums::MO_UNDEF), - memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false), - statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) + n_reg(0), useContinuation(false), + statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum) { tlbHitLevel.assign(computeUnit()->wfSize(), -1); d_data = new uint8_t[computeUnit()->wfSize() * 16]; @@ -68,77 +67,69 @@ GPUDynInst::~GPUDynInst() } void -GPUDynInst::execute() +GPUDynInst::execute(GPUDynInstPtr gpuDynInst) { - GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst, - _seqNum); - staticInst->execute(gpuDynInst); + _staticInst->execute(gpuDynInst); } int GPUDynInst::numSrcRegOperands() { - return staticInst->numSrcRegOperands(); + return _staticInst->numSrcRegOperands(); } int GPUDynInst::numDstRegOperands() { - return staticInst->numDstRegOperands(); + return _staticInst->numDstRegOperands(); } int GPUDynInst::getNumOperands() { - return staticInst->getNumOperands(); + return _staticInst->getNumOperands(); } bool GPUDynInst::isVectorRegister(int operandIdx) { - return staticInst->isVectorRegister(operandIdx); + return _staticInst->isVectorRegister(operandIdx); } bool GPUDynInst::isScalarRegister(int operandIdx) { - return staticInst->isScalarRegister(operandIdx); + return _staticInst->isScalarRegister(operandIdx); } int GPUDynInst::getRegisterIndex(int operandIdx) { - return staticInst->getRegisterIndex(operandIdx); + return _staticInst->getRegisterIndex(operandIdx); } int GPUDynInst::getOperandSize(int operandIdx) { - return staticInst->getOperandSize(operandIdx); + return _staticInst->getOperandSize(operandIdx); } bool GPUDynInst::isDstOperand(int operandIdx) { - return staticInst->isDstOperand(operandIdx); + return _staticInst->isDstOperand(operandIdx); } bool GPUDynInst::isSrcOperand(int operandIdx) { - return staticInst->isSrcOperand(operandIdx); -} - -bool -GPUDynInst::isArgLoad() -{ - return staticInst->isArgLoad(); + return _staticInst->isSrcOperand(operandIdx); } const std::string& GPUDynInst::disassemble() const { - return staticInst->disassemble(); + return _staticInst->disassemble(); } uint64_t @@ -147,16 +138,10 @@ GPUDynInst::seqNum() const return _seqNum; } -Enums::OpType -GPUDynInst::opType() -{ - return staticInst->o_type; -} - Enums::StorageClassType GPUDynInst::executedAs() { - return staticInst->executed_as; + return _staticInst->executed_as; } // Process a memory instruction and (if necessary) submit timing request @@ -166,20 +151,347 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", cu->cu_id, simdId, wfSlotId, exec_mask); - staticInst->initiateAcc(gpuDynInst); + _staticInst->initiateAcc(gpuDynInst); time = 0; } +/** + * accessor methods for the attributes of + * the underlying GPU static instruction + */ +bool +GPUDynInst::isALU() const +{ + return _staticInst->isALU(); +} + +bool +GPUDynInst::isBranch() const +{ + return _staticInst->isBranch(); +} + +bool +GPUDynInst::isNop() const +{ + return _staticInst->isNop(); +} + +bool +GPUDynInst::isReturn() const +{ + return _staticInst->isReturn(); +} + +bool +GPUDynInst::isUnconditionalJump() const +{ + return _staticInst->isUnconditionalJump(); +} + +bool +GPUDynInst::isSpecialOp() const +{ + return _staticInst->isSpecialOp(); +} + +bool +GPUDynInst::isWaitcnt() const +{ + return _staticInst->isWaitcnt(); +} + +bool +GPUDynInst::isBarrier() const +{ + return _staticInst->isBarrier(); +} + +bool +GPUDynInst::isMemFence() const +{ + return _staticInst->isMemFence(); +} + +bool +GPUDynInst::isMemRef() const +{ + return _staticInst->isMemRef(); +} + +bool +GPUDynInst::isFlat() const +{ + return _staticInst->isFlat(); +} + +bool +GPUDynInst::isLoad() const +{ + return _staticInst->isLoad(); +} + +bool +GPUDynInst::isStore() const +{ + return _staticInst->isStore(); +} + +bool +GPUDynInst::isAtomic() const +{ + return _staticInst->isAtomic(); +} + +bool +GPUDynInst::isAtomicNoRet() const +{ + return _staticInst->isAtomicNoRet(); +} + +bool +GPUDynInst::isAtomicRet() const +{ + return _staticInst->isAtomicRet(); +} + +bool +GPUDynInst::isScalar() const +{ + return _staticInst->isScalar(); +} + +bool +GPUDynInst::readsSCC() const +{ + return _staticInst->readsSCC(); +} + +bool +GPUDynInst::writesSCC() const +{ + return _staticInst->writesSCC(); +} + +bool +GPUDynInst::readsVCC() const +{ + return _staticInst->readsVCC(); +} + +bool +GPUDynInst::writesVCC() const +{ + return _staticInst->writesVCC(); +} + +bool +GPUDynInst::isAtomicAnd() const +{ + return _staticInst->isAtomicAnd(); +} + +bool +GPUDynInst::isAtomicOr() const +{ + return _staticInst->isAtomicOr(); +} + +bool +GPUDynInst::isAtomicXor() const +{ + return _staticInst->isAtomicXor(); +} + +bool +GPUDynInst::isAtomicCAS() const +{ + return _staticInst->isAtomicCAS(); +} + +bool GPUDynInst::isAtomicExch() const +{ + return _staticInst->isAtomicExch(); +} + +bool +GPUDynInst::isAtomicAdd() const +{ + return _staticInst->isAtomicAdd(); +} + +bool +GPUDynInst::isAtomicSub() const +{ + return _staticInst->isAtomicSub(); +} + +bool +GPUDynInst::isAtomicInc() const +{ + return _staticInst->isAtomicInc(); +} + +bool +GPUDynInst::isAtomicDec() const +{ + return _staticInst->isAtomicDec(); +} + +bool +GPUDynInst::isAtomicMax() const +{ + return _staticInst->isAtomicMax(); +} + +bool +GPUDynInst::isAtomicMin() const +{ + return _staticInst->isAtomicMin(); +} + +bool +GPUDynInst::isArgLoad() const +{ + return _staticInst->isArgLoad(); +} + +bool +GPUDynInst::isGlobalMem() const +{ + return _staticInst->isGlobalMem(); +} + +bool +GPUDynInst::isLocalMem() const +{ + return _staticInst->isLocalMem(); +} + +bool +GPUDynInst::isArgSeg() const +{ + return _staticInst->isArgSeg(); +} + +bool +GPUDynInst::isGlobalSeg() const +{ + return _staticInst->isGlobalSeg(); +} + +bool +GPUDynInst::isGroupSeg() const +{ + return _staticInst->isGroupSeg(); +} + +bool +GPUDynInst::isKernArgSeg() const +{ + return _staticInst->isKernArgSeg(); +} + +bool +GPUDynInst::isPrivateSeg() const +{ + return _staticInst->isPrivateSeg(); +} + +bool +GPUDynInst::isReadOnlySeg() const +{ + return _staticInst->isReadOnlySeg(); +} + +bool +GPUDynInst::isSpillSeg() const +{ + return _staticInst->isSpillSeg(); +} + +bool +GPUDynInst::isWorkitemScope() const +{ + return _staticInst->isWorkitemScope(); +} + +bool +GPUDynInst::isWavefrontScope() const +{ + return _staticInst->isWavefrontScope(); +} + +bool +GPUDynInst::isWorkgroupScope() const +{ + return _staticInst->isWorkgroupScope(); +} + +bool +GPUDynInst::isDeviceScope() const +{ + return _staticInst->isDeviceScope(); +} + +bool +GPUDynInst::isSystemScope() const +{ + return _staticInst->isSystemScope(); +} + +bool +GPUDynInst::isNoScope() const +{ + return _staticInst->isNoScope(); +} + +bool +GPUDynInst::isRelaxedOrder() const +{ + return _staticInst->isRelaxedOrder(); +} + +bool +GPUDynInst::isAcquire() const +{ + return _staticInst->isAcquire(); +} + +bool +GPUDynInst::isRelease() const +{ + return _staticInst->isRelease(); +} + +bool +GPUDynInst::isAcquireRelease() const +{ + return _staticInst->isAcquireRelease(); +} + +bool +GPUDynInst::isNoOrder() const +{ + return _staticInst->isNoOrder(); +} + +bool +GPUDynInst::isGloballyCoherent() const +{ + return _staticInst->isGloballyCoherent(); +} + bool -GPUDynInst::scalarOp() const +GPUDynInst::isSystemCoherent() const { - return staticInst->scalarOp(); + return _staticInst->isSystemCoherent(); } void GPUDynInst::updateStats() { - if (staticInst->isLocalMem()) { + if (_staticInst->isLocalMem()) { // access to LDS (shared) memory cu->dynamicLMemInstrCnt++; } else { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index 46774d867..c07d85d78 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -39,11 +39,7 @@ #include <cstdint> #include <string> -#include "enums/GenericMemoryOrder.hh" -#include "enums/GenericMemoryScope.hh" -#include "enums/MemOpType.hh" #include "enums/MemType.hh" -#include "enums/OpType.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_exec_context.hh" @@ -180,33 +176,19 @@ class AtomicOpMin : public TypedAtomicOpFunctor<T> } }; -#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) -#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) -#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) - typedef enum { VT_32, VT_64, } vgpr_type; -typedef enum -{ - SEG_PRIVATE, - SEG_SPILL, - SEG_GLOBAL, - SEG_SHARED, - SEG_READONLY, - SEG_FLAT -} seg_type; - class GPUDynInst : public GPUExecContext { public: - GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, uint64_t instSeqNum); ~GPUDynInst(); - void execute(); + void execute(GPUDynInstPtr gpuDynInst); int numSrcRegOperands(); int numDstRegOperands(); int getNumOperands(); @@ -216,13 +198,11 @@ class GPUDynInst : public GPUExecContext int getOperandSize(int operandIdx); bool isDstOperand(int operandIdx); bool isSrcOperand(int operandIdx); - bool isArgLoad(); const std::string &disassemble() const; uint64_t seqNum() const; - Enums::OpType opType(); Enums::StorageClassType executedAs(); // The address of the memory operation @@ -240,14 +220,7 @@ class GPUDynInst : public GPUExecContext // The memory type (M_U32, M_S32, ...) Enums::MemType m_type; - // The memory operation (MO_LD, MO_ST, ...) - Enums::MemOpType m_op; - Enums::GenericMemoryOrder memoryOrder; - - // Scope of the request - Enums::GenericMemoryScope scope; - // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) - seg_type s_type; + // The equivalency class int equiv; // The return VGPR type (VT_32 or VT_64) @@ -288,10 +261,72 @@ class GPUDynInst : public GPUExecContext void updateStats(); - GPUStaticInst* staticInstruction() { return staticInst; } - - // Is the instruction a scalar or vector op? - bool scalarOp() const; + GPUStaticInst* staticInstruction() { return _staticInst; } + + bool isALU() const; + bool isBranch() const; + bool isNop() const; + bool isReturn() const; + bool isUnconditionalJump() const; + bool isSpecialOp() const; + bool isWaitcnt() const; + + bool isBarrier() const; + bool isMemFence() const; + bool isMemRef() const; + bool isFlat() const; + bool isLoad() const; + bool isStore() const; + + bool isAtomic() const; + bool isAtomicNoRet() const; + bool isAtomicRet() const; + + bool isScalar() const; + bool readsSCC() const; + bool writesSCC() const; + bool readsVCC() const; + bool writesVCC() const; + + bool isAtomicAnd() const; + bool isAtomicOr() const; + bool isAtomicXor() const; + bool isAtomicCAS() const; + bool isAtomicExch() const; + bool isAtomicAdd() const; + bool isAtomicSub() const; + bool isAtomicInc() const; + bool isAtomicDec() const; + bool isAtomicMax() const; + bool isAtomicMin() const; + + bool isArgLoad() const; + bool isGlobalMem() const; + bool isLocalMem() const; + + bool isArgSeg() const; + bool isGlobalSeg() const; + bool isGroupSeg() const; + bool isKernArgSeg() const; + bool isPrivateSeg() const; + bool isReadOnlySeg() const; + bool isSpillSeg() const; + + bool isWorkitemScope() const; + bool isWavefrontScope() const; + bool isWorkgroupScope() const; + bool isDeviceScope() const; + bool isSystemScope() const; + bool isNoScope() const; + + bool isRelaxedOrder() const; + bool isAcquire() const; + bool isRelease() const; + bool isAcquireRelease() const; + bool isNoOrder() const; + + bool isGloballyCoherent() const; + bool isSystemCoherent() const; /* * Loads/stores/atomics may have acquire/release semantics associated @@ -312,46 +347,32 @@ class GPUDynInst : public GPUExecContext bool useContinuation; template<typename c0> AtomicOpFunctor* - makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + makeAtomicOpFunctor(c0 *reg0, c0 *reg1) { - using namespace Enums; - - switch(op) { - case MO_AAND: - case MO_ANRAND: + if (isAtomicAnd()) { return new AtomicOpAnd<c0>(*reg0); - case MO_AOR: - case MO_ANROR: + } else if (isAtomicOr()) { return new AtomicOpOr<c0>(*reg0); - case MO_AXOR: - case MO_ANRXOR: + } else if (isAtomicXor()) { return new AtomicOpXor<c0>(*reg0); - case MO_ACAS: - case MO_ANRCAS: + } else if (isAtomicCAS()) { return new AtomicOpCAS<c0>(*reg0, *reg1, cu); - case MO_AEXCH: - case MO_ANREXCH: + } else if (isAtomicExch()) { return new AtomicOpExch<c0>(*reg0); - case MO_AADD: - case MO_ANRADD: + } else if (isAtomicAdd()) { return new AtomicOpAdd<c0>(*reg0); - case MO_ASUB: - case MO_ANRSUB: + } else if (isAtomicSub()) { return new AtomicOpSub<c0>(*reg0); - case MO_AINC: - case MO_ANRINC: + } else if (isAtomicInc()) { return new AtomicOpInc<c0>(); - case MO_ADEC: - case MO_ANRDEC: + } else if (isAtomicDec()) { return new AtomicOpDec<c0>(); - case MO_AMAX: - case MO_ANRMAX: + } else if (isAtomicMax()) { return new AtomicOpMax<c0>(*reg0); - case MO_AMIN: - case MO_ANRMIN: + } else if (isAtomicMin()) { return new AtomicOpMin<c0>(*reg0); - default: - panic("Unrecognized atomic operation"); + } else { + fatal("Unrecognized atomic operation"); } } @@ -359,88 +380,58 @@ class GPUDynInst : public GPUExecContext setRequestFlags(Request *req, bool setMemOrder=true) { // currently these are the easy scopes to deduce - switch (s_type) { - case SEG_PRIVATE: + if (isPrivateSeg()) { req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); - break; - case SEG_SPILL: + } else if (isSpillSeg()) { req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); - break; - case SEG_GLOBAL: + } else if (isGlobalSeg()) { req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); - break; - case SEG_READONLY: + } else if (isReadOnlySeg()) { req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); - break; - case SEG_SHARED: + } else if (isGroupSeg()) { req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); - break; - case SEG_FLAT: + } else if (isFlat()) { // TODO: translate to correct scope assert(false); - default: - panic("Bad segment type"); - break; + } else { + fatal("%s has bad segment type\n", disassemble()); } - switch (scope) { - case Enums::MEMORY_SCOPE_NONE: - case Enums::MEMORY_SCOPE_WORKITEM: - break; - case Enums::MEMORY_SCOPE_WAVEFRONT: + if (isWavefrontScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::WAVEFRONT_SCOPE); - break; - case Enums::MEMORY_SCOPE_WORKGROUP: + } else if (isWorkgroupScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::WORKGROUP_SCOPE); - break; - case Enums::MEMORY_SCOPE_DEVICE: + } else if (isDeviceScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::DEVICE_SCOPE); - break; - case Enums::MEMORY_SCOPE_SYSTEM: + } else if (isSystemScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::SYSTEM_SCOPE); - break; - default: - panic("Bad scope type"); - break; + } else if (!isNoScope() && !isWorkitemScope()) { + fatal("%s has bad scope type\n", disassemble()); } if (setMemOrder) { // set acquire and release flags - switch (memoryOrder){ - case Enums::MEMORY_ORDER_SC_ACQUIRE: + if (isAcquire()) { req->setFlags(Request::ACQUIRE); - break; - case Enums::MEMORY_ORDER_SC_RELEASE: + } else if (isRelease()) { req->setFlags(Request::RELEASE); - break; - case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + } else if (isAcquireRelease()) { req->setFlags(Request::ACQUIRE | Request::RELEASE); - break; - default: - break; + } else if (!isNoOrder()) { + fatal("%s has bad memory order\n", disassemble()); } } // set atomic type // currently, the instruction genenerator only produces atomic return // but a magic instruction can produce atomic no return - if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || - m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || - m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || - m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || - m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || - m_op == Enums::MO_ACAS) { + if (isAtomicRet()) { req->setFlags(Request::ATOMIC_RETURN_OP); - } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || - m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || - m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX || - m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || - m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || - m_op == Enums::MO_ANRCAS) { + } else if (isAtomicNoRet()) { req->setFlags(Request::ATOMIC_NO_RETURN_OP); } } @@ -457,7 +448,7 @@ class GPUDynInst : public GPUExecContext std::vector<int> tlbHitLevel; private: - GPUStaticInst *staticInst; + GPUStaticInst *_staticInst; uint64_t _seqNum; }; diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc index 83b429e62..0f74bd532 100644 --- a/src/gpu-compute/gpu_static_inst.cc +++ b/src/gpu-compute/gpu_static_inst.cc @@ -36,10 +36,12 @@ #include "gpu-compute/gpu_static_inst.hh" GPUStaticInst::GPUStaticInst(const std::string &opcode) - : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode), - _instNum(0), _scalarOp(false) + : executed_as(Enums::SC_NONE), opcode(opcode), + _instNum(0) { + setFlag(NoOrder); } + const std::string& GPUStaticInst::disassemble() { diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 911e4f308..a73ec12e3 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -48,7 +48,7 @@ #include <cstdint> #include <string> -#include "enums/OpType.hh" +#include "enums/GPUStaticInstFlags.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/misc.hh" @@ -57,7 +57,7 @@ class BaseOperand; class BaseRegOperand; class Wavefront; -class GPUStaticInst +class GPUStaticInst : public GPUStaticInstFlags { public: GPUStaticInst(const std::string &opcode); @@ -86,22 +86,110 @@ class GPUStaticInst virtual bool isValid() const = 0; - /* - * Most instructions (including all HSAIL instructions) - * are vector ops, so _scalarOp will be false by default. - * Derived instruction objects that are scalar ops must - * set _scalarOp to true in their constructors. - */ - bool scalarOp() const { return _scalarOp; } + bool isALU() const { return _flags[ALU]; } + bool isBranch() const { return _flags[Branch]; } + bool isNop() const { return _flags[Nop]; } + bool isReturn() const { return _flags[Return]; } + + bool + isUnconditionalJump() const + { + return _flags[UnconditionalJump]; + } + + bool isSpecialOp() const { return _flags[SpecialOp]; } + bool isWaitcnt() const { return _flags[Waitcnt]; } + + bool isBarrier() const { return _flags[MemBarrier]; } + bool isMemFence() const { return _flags[MemFence]; } + bool isMemRef() const { return _flags[MemoryRef]; } + bool isFlat() const { return _flags[Flat]; } + bool isLoad() const { return _flags[Load]; } + bool isStore() const { return _flags[Store]; } + + bool + isAtomic() const + { + return _flags[AtomicReturn] || _flags[AtomicNoReturn]; + } + + bool isAtomicNoRet() const { return _flags[AtomicNoReturn]; } + bool isAtomicRet() const { return _flags[AtomicReturn]; } + + bool isScalar() const { return _flags[Scalar]; } + bool readsSCC() const { return _flags[ReadsSCC]; } + bool writesSCC() const { return _flags[WritesSCC]; } + bool readsVCC() const { return _flags[ReadsVCC]; } + bool writesVCC() const { return _flags[WritesVCC]; } - virtual bool isLocalMem() const + bool isAtomicAnd() const { return _flags[AtomicAnd]; } + bool isAtomicOr() const { return _flags[AtomicOr]; } + bool isAtomicXor() const { return _flags[AtomicXor]; } + bool isAtomicCAS() const { return _flags[AtomicCAS]; } + bool isAtomicExch() const { return _flags[AtomicExch]; } + bool isAtomicAdd() const { return _flags[AtomicAdd]; } + bool isAtomicSub() const { return _flags[AtomicSub]; } + bool isAtomicInc() const { return _flags[AtomicInc]; } + bool isAtomicDec() const { return _flags[AtomicDec]; } + bool isAtomicMax() const { return _flags[AtomicMax]; } + bool isAtomicMin() const { return _flags[AtomicMin]; } + + bool + isArgLoad() const + { + return (_flags[KernArgSegment] || _flags[ArgSegment]) && _flags[Load]; + } + + bool + isGlobalMem() const { - fatal("calling isLocalMem() on non-memory instruction.\n"); + return _flags[MemoryRef] && (_flags[GlobalSegment] || + _flags[PrivateSegment] || _flags[ReadOnlySegment] || + _flags[SpillSegment]); + } - return false; + bool + isLocalMem() const + { + return _flags[MemoryRef] && _flags[GroupSegment]; } - bool isArgLoad() { return false; } + bool isArgSeg() const { return _flags[ArgSegment]; } + bool isGlobalSeg() const { return _flags[GlobalSegment]; } + bool isGroupSeg() const { return _flags[GroupSegment]; } + bool isKernArgSeg() const { return _flags[KernArgSegment]; } + bool isPrivateSeg() const { return _flags[PrivateSegment]; } + bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; } + bool isSpillSeg() const { return _flags[SpillSegment]; } + + bool isWorkitemScope() const { return _flags[WorkitemScope]; } + bool isWavefrontScope() const { return _flags[WavefrontScope]; } + bool isWorkgroupScope() const { return _flags[WorkgroupScope]; } + bool isDeviceScope() const { return _flags[DeviceScope]; } + bool isSystemScope() const { return _flags[SystemScope]; } + bool isNoScope() const { return _flags[NoScope]; } + + bool isRelaxedOrder() const { return _flags[RelaxedOrder]; } + bool isAcquire() const { return _flags[Acquire]; } + bool isRelease() const { return _flags[Release]; } + bool isAcquireRelease() const { return _flags[AcquireRelease]; } + bool isNoOrder() const { return _flags[NoOrder]; } + + /** + * Coherence domain of a memory instruction. Only valid for + * machine ISA. The coherence domain specifies where it is + * possible to perform memory synchronization, e.g., acquire + * or release, from the shader kernel. + * + * isGloballyCoherent(): returns true if kernel is sharing memory + * with other work-items on the same device (GPU) + * + * isSystemCoherent(): returns true if kernel is sharing memory + * with other work-items on a different device (GPU) or the host (CPU) + */ + bool isGloballyCoherent() const { return _flags[GloballyCoherent]; } + bool isSystemCoherent() const { return _flags[SystemCoherent]; } + virtual uint32_t instSize() = 0; // only used for memory instructions @@ -120,22 +208,13 @@ class GPUStaticInst virtual uint32_t getTargetPc() { return 0; } - /** - * Query whether the instruction is an unconditional jump i.e., the jump - * is always executed because there is no condition to be evaluated. - * - * If the instruction is not of branch type, the result is always false. - * - * @return True if the instruction is an unconditional jump. - */ - virtual bool unconditionalJumpInstruction() { return false; } - static uint64_t dynamic_id_count; - Enums::OpType o_type; // For flat memory accesses Enums::StorageClassType executed_as; + void setFlag(Flags flag) { _flags[flag] = true; } + protected: virtual void execLdAcq(GPUDynInstPtr gpuDynInst) @@ -169,7 +248,45 @@ class GPUStaticInst */ int _ipdInstNum; - bool _scalarOp; + std::bitset<Num_Flags> _flags; +}; + +class KernelLaunchStaticInst : public GPUStaticInst +{ + public: + KernelLaunchStaticInst() : GPUStaticInst("kernel_launch") + { + setFlag(Nop); + setFlag(Scalar); + setFlag(Acquire); + setFlag(SystemScope); + setFlag(GlobalSegment); + } + + void + execute(GPUDynInstPtr gpuDynInst) + { + fatal("kernel launch instruction should not be executed\n"); + } + + void + generateDisassembly() + { + disassembly = opcode; + } + + int getNumOperands() { return 0; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isVectorRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return 0; } + int numDstRegOperands() { return 0; } + int numSrcRegOperands() { return 0; } + bool isValid() const { return true; } + uint32_t instSize() { return 0; } }; #endif // __GPU_STATIC_INST_HH__ diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc index 10ded11b7..ac6a81b16 100644 --- a/src/gpu-compute/kernel_cfg.cc +++ b/src/gpu-compute/kernel_cfg.cc @@ -104,7 +104,7 @@ ControlFlowInfo::createBasicBlocks() leaders.insert(0); for (int i = 1; i < instructions.size(); i++) { GPUStaticInst* instruction = instructions[i]; - if (instruction->o_type == Enums::OT_BRANCH) { + if (instruction->isBranch()) { const int target_pc = instruction->getTargetPc(); leaders.insert(target_pc); leaders.insert(i + 1); @@ -137,18 +137,18 @@ ControlFlowInfo::connectBasicBlocks() break; } GPUStaticInst* last = lastInstruction(bb.get()); - if (last->o_type == Enums::OT_RET) { + if (last->isReturn()) { bb->successorIds.insert(exit_bb->id); continue; } - if (last->o_type == Enums::OT_BRANCH) { + if (last->isBranch()) { const uint32_t target_pc = last->getTargetPc(); BasicBlock* target_bb = basicBlock(target_pc); bb->successorIds.insert(target_bb->id); } // Unconditional jump instructions have a unique successor - if (!last->unconditionalJumpInstruction()) { + if (!last->isUnconditionalJump()) { BasicBlock* next_bb = basicBlock(last->instNum() + 1); bb->successorIds.insert(next_bb->id); } @@ -274,7 +274,7 @@ ControlFlowInfo::printBasicBlocks() const int inst_num = inst->instNum(); std::cout << inst_num << " [" << basicBlock(inst_num)->id << "]: " << inst->disassemble(); - if (inst->o_type == Enums::OT_BRANCH) { + if (inst->isBranch()) { std::cout << ", PC = " << inst->getTargetPc(); } std::cout << std::endl; diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc index d4a27318a..fad98c886 100644 --- a/src/gpu-compute/lds_state.cc +++ b/src/gpu-compute/lds_state.cc @@ -141,8 +141,7 @@ LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, } } - if (gpuDynInst->m_op == Enums::MO_LD || - gpuDynInst->m_op == Enums::MO_ST) { + if (gpuDynInst->isLoad() || gpuDynInst->isStore()) { // mask identical addresses for (int j = 0; j < numBanks; ++j) { for (int j0 = 0; j0 < j; j0++) { @@ -208,8 +207,8 @@ LdsState::processPacket(PacketPtr packet) GPUDynInstPtr dynInst = getDynInstr(packet); // account for the LDS bank conflict overhead - int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : - (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : + int busLength = (dynInst->isLoad()) ? parent->loadBusLength() : + (dynInst->isStore()) ? parent->storeBusLength() : parent->loadBusLength(); // delay for accessing the LDS Tick processingTime = diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh index 58d109493..5fcbe82c0 100644 --- a/src/gpu-compute/lds_state.hh +++ b/src/gpu-compute/lds_state.hh @@ -43,7 +43,6 @@ #include <utility> #include <vector> -#include "enums/MemOpType.hh" #include "enums/MemType.hh" #include "gpu-compute/misc.hh" #include "mem/mem_object.hh" diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index e2238bf45..80dad6fcd 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -62,7 +62,7 @@ LocalMemPipeline::exec() lmReturnedRequests.front() : nullptr; bool accessVrf = true; - if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + if ((m) && (m->isLoad() || m->isAtomicRet())) { Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; accessVrf = @@ -137,7 +137,7 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; // Return data to registers - if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + if (m->isLoad() || m->isAtomicRet()) { std::vector<uint32_t> regVec; for (int k = 0; k < m->n_reg; ++k) { int dst = m->dst_reg+k; @@ -172,13 +172,12 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) // Decrement outstanding request count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) - || MO_H(m->m_op)) { + if (m->isStore() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm, m->time, -1); } - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm, m->time, -1); } diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index c1f741d6a..13afab977 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -47,7 +47,6 @@ #include "cpu/simple_thread.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" -#include "enums/MemOpType.hh" #include "enums/MemType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_tlb.hh" diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc index c43d765af..c50c06cc6 100644 --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -38,7 +38,6 @@ #include <string> #include "base/misc.hh" -#include "gpu-compute/code_enums.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -153,8 +152,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const void VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) { - bool loadInstr = IS_OT_READ(ii->opType()); - bool atomicInstr = IS_OT_ATOMIC(ii->opType()); + bool loadInstr = ii->isLoad(); + bool atomicInstr = ii->isAtomic() || ii->isMemFence(); bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index c677cbe41..caeed85a7 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -37,7 +37,6 @@ #include "debug/GPUExec.hh" #include "debug/WavefrontStack.hh" -#include "gpu-compute/code_enums.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -165,19 +164,8 @@ Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr) bool Wavefront::isGmInstruction(GPUDynInstPtr ii) { - if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || - IS_OT_ATOMIC_PM(ii->opType())) { + if (ii->isGlobalMem() || ii->isFlat()) return true; - } - - if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || - IS_OT_ATOMIC_GM(ii->opType())) { - return true; - } - - if (IS_OT_FLAT(ii->opType())) { - return true; - } return false; } @@ -185,8 +173,7 @@ Wavefront::isGmInstruction(GPUDynInstPtr ii) bool Wavefront::isLmInstruction(GPUDynInstPtr ii) { - if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || - IS_OT_ATOMIC_LM(ii->opType())) { + if (ii->isLocalMem()) { return true; } @@ -199,10 +186,9 @@ Wavefront::isOldestInstALU() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP || - ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ)) { + if (status != S_STOPPED && (ii->isNop() || + ii->isReturn() || ii->isBranch() || + ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) { return true; } @@ -215,7 +201,7 @@ Wavefront::isOldestInstBarrier() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { + if (status != S_STOPPED && ii->isBarrier()) { return true; } @@ -228,9 +214,7 @@ Wavefront::isOldestInstGMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || - IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { - + if (status != S_STOPPED && ii->isGlobalMem()) { return true; } @@ -243,9 +227,7 @@ Wavefront::isOldestInstLMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { - + if (status != S_STOPPED && ii->isLocalMem()) { return true; } @@ -258,9 +240,7 @@ Wavefront::isOldestInstPrivMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || - IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { - + if (status != S_STOPPED && ii->isPrivateSeg()) { return true; } @@ -273,8 +253,7 @@ Wavefront::isOldestInstFlatMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { - + if (status != S_STOPPED && ii->isFlat()) { return true; } @@ -289,7 +268,7 @@ Wavefront::instructionBufferHasBranch() for (auto it : instructionBuffer) { GPUDynInstPtr ii = it; - if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { + if (ii->isReturn() || ii->isBranch()) { return true; } } @@ -371,23 +350,16 @@ Wavefront::ready(itype_e type) // checking readiness will be fixed eventually. In the meantime, let's // make sure that we do not silently let an instruction type slip // through this logic and always return not ready. - if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || - ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG || - IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || - IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || - IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || - IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { + if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || + ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || + ii->isMemFence() || ii->isFlat())) { panic("next instruction: %s is of unknown type\n", ii->disassemble()); } DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); - if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { + if (type == I_ALU && ii->isBarrier()) { // Here for ALU instruction (barrier) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -400,7 +372,7 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { + } else if (type == I_ALU && ii->isNop()) { // Here for ALU instruction (nop) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -408,7 +380,7 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { + } else if (type == I_ALU && ii->isReturn()) { // Here for ALU instruction (return) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -421,10 +393,10 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG)) { + } else if (type == I_ALU && (ii->isBranch() || + ii->isALU() || + (ii->isKernArgSeg() && ii->isLoad()) || + ii->isArgSeg())) { // Here for ALU instruction (all others) if (!computeUnit->wfWait[simdId].prerdy()) { // Is alu slot free? @@ -439,18 +411,16 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || - IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + } else if (type == I_GLOBAL && ii->isGlobalMem()) { // Here Global memory instruction - if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { + if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { // Are there in pipe or outstanding global memory write requests? if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { return 0; } } - if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || - IS_OT_HIST_GM(ii->opType())) { + if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { // Are there in pipe or outstanding global memory read requests? if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) return 0; @@ -480,17 +450,15 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + } else if (type == I_SHARED && ii->isLocalMem()) { // Here for Shared memory instruction - if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { + if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) { return 0; } } - if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || - IS_OT_HIST_LM(ii->opType())) { + if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) { return 0; } @@ -519,47 +487,7 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || - IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { - // Here for Private memory instruction ------------------------ // - if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { - if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { - return 0; - } - } - - if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || - IS_OT_HIST_PM(ii->opType())) { - if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) { - return 0; - } - } - - if (!glbMemBusRdy) { - // Is there an available VRF->Global memory read bus? - return 0; - } - - if (!glbMemIssueRdy) { - // Is wave slot free? - return 0; - } - - if (!computeUnit->globalMemoryPipe. - isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { - // Can we insert a new request to the Global Mem Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { + } else if (type == I_FLAT && ii->isFlat()) { if (!glbMemBusRdy) { // Is there an available VRF->Global memory read bus? return 0; @@ -618,23 +546,22 @@ Wavefront::updateResources() assert(ii); computeUnit->vrf[simdId]->updateResources(this, ii); // Single precision ALU or Branch or Return or Special instruction - if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || - ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + if (ii->isALU() || ii->isSpecialOp() || + ii->isBranch() || // FIXME: Kernel argument loads are currently treated as ALU operations // since we don't send memory packets at execution. If we fix that then // we should map them to one of the memory pipelines - ii->opType()==Enums::OT_KERN_READ || - ii->opType()==Enums::OT_ARG || - ii->opType()==Enums::OT_RET) { + (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() || + ii->isReturn()) { computeUnit->aluPipe[simdId].preset(computeUnit->shader-> ticks(computeUnit->spBypassLength())); // this is to enforce a fixed number of cycles per issue slot per SIMD computeUnit->wfWait[simdId].preset(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_BARRIER) { + } else if (ii->isBarrier()) { computeUnit->wfWait[simdId].preset(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_FLAT_READ) { + } else if (ii->isLoad() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); memReqsInPipe++; rdGmReqsInPipe++; @@ -649,7 +576,7 @@ Wavefront::updateResources() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + } else if (ii->isStore() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); memReqsInPipe++; wrGmReqsInPipe++; @@ -664,21 +591,21 @@ Wavefront::updateResources() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (IS_OT_READ_GM(ii->opType())) { + } else if (ii->isLoad() && ii->isGlobalMem()) { memReqsInPipe++; rdGmReqsInPipe++; computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. preset(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_GM(ii->opType())) { + } else if (ii->isStore() && ii->isGlobalMem()) { memReqsInPipe++; wrGmReqsInPipe++; computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_GM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { memReqsInPipe++; wrGmReqsInPipe++; rdGmReqsInPipe++; @@ -686,21 +613,21 @@ Wavefront::updateResources() preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_READ_LM(ii->opType())) { + } else if (ii->isLoad() && ii->isLocalMem()) { memReqsInPipe++; rdLmReqsInPipe++; computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. preset(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_LM(ii->opType())) { + } else if (ii->isStore() && ii->isLocalMem()) { memReqsInPipe++; wrLmReqsInPipe++; computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_LM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { memReqsInPipe++; wrLmReqsInPipe++; rdLmReqsInPipe++; @@ -708,28 +635,6 @@ Wavefront::updateResources() preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_READ_PM(ii->opType())) { - memReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_PM(ii->opType())) { - memReqsInPipe++; - wrGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_PM(ii->opType())) { - memReqsInPipe++; - wrGmReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); } } @@ -751,7 +656,7 @@ Wavefront::exec() DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, ii->disassemble(), old_pc); - ii->execute(); + ii->execute(ii); // access the VRF computeUnit->vrf[simdId]->exec(ii, this); srcRegOpDist.sample(ii->numSrcRegOperands()); @@ -785,24 +690,24 @@ Wavefront::exec() // ---- Update Vector ALU pipeline and other resources ------------------ // // Single precision ALU or Branch or Return or Special instruction - if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || - ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + if (ii->isALU() || ii->isSpecialOp() || + ii->isBranch() || // FIXME: Kernel argument loads are currently treated as ALU operations // since we don't send memory packets at execution. If we fix that then // we should map them to one of the memory pipelines - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG || - ii->opType() == Enums::OT_RET) { + (ii->isKernArgSeg() && ii->isLoad()) || + ii->isArgSeg() || + ii->isReturn()) { computeUnit->aluPipe[simdId].set(computeUnit->shader-> ticks(computeUnit->spBypassLength())); // this is to enforce a fixed number of cycles per issue slot per SIMD computeUnit->wfWait[simdId].set(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_BARRIER) { + } else if (ii->isBarrier()) { computeUnit->wfWait[simdId].set(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_FLAT_READ) { + } else if (ii->isLoad() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); if (Enums::SC_SHARED == ii->executedAs()) { @@ -816,7 +721,7 @@ Wavefront::exec() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + } else if (ii->isStore() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); if (Enums::SC_SHARED == ii->executedAs()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. @@ -829,32 +734,32 @@ Wavefront::exec() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (IS_OT_READ_GM(ii->opType())) { + } else if (ii->isLoad() && ii->isGlobalMem()) { computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. set(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_GM(ii->opType())) { + } else if (ii->isStore() && ii->isGlobalMem()) { computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_GM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_READ_LM(ii->opType())) { + } else if (ii->isLoad() && ii->isLocalMem()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. set(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_LM(ii->opType())) { + } else if (ii->isStore() && ii->isLocalMem()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_LM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. |