Diffstat (limited to 'src')
148 files changed, 52249 insertions, 80 deletions
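The first hunk below extends src/SConscript with a makeTheGPUISA builder that mirrors the existing makeTheISA builder and generates config/the_gpu_isa.hh from all_gpu_isa_list and TARGET_GPU_ISA. As a rough sketch only (assuming an HSAIL-only build, i.e. all_gpu_isa_list == ['hsail'] and TARGET_GPU_ISA == 'hsail', which is the only GPU ISA registered by the SConsopts in this change), the generated header would come out roughly as:

    #ifndef __CONFIG_THE_GPU_ISA_HH__
    #define __CONFIG_THE_GPU_ISA_HH__

    // One preprocessor define per configured GPU ISA (1-based index).
    #define HSAIL_ISA 1

    // Run-time view of the same list; the names reuse the ISA namespaces.
    enum class GPUArch {
        HsailISA = HSAIL_ISA
    };

    // Compile-time selection of the target GPU ISA.
    #define THE_GPU_ISA HSAIL_ISA
    #define TheGpuISA HsailISA
    #define THE_GPU_ISA_STR "hsail"

    #endif // __CONFIG_THE_GPU_ISA_HH__

This is the GPU-side analogue of config/the_isa.hh, wired up with the same MakeAction/Transform machinery already used for the CPU ISA header.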
diff --git a/src/SConscript b/src/SConscript index 322212cb7..2bac0bff3 100755 --- a/src/SConscript +++ b/src/SConscript @@ -78,7 +78,7 @@ class SourceMeta(type): def __init__(cls, name, bases, dict): super(SourceMeta, cls).__init__(name, bases, dict) cls.all = [] - + def get(cls, **guards): '''Find all files that match the specified guards. If a source file does not specify a flag, the default is False''' @@ -367,9 +367,9 @@ def makeTheISA(source, target, env): target_isa = env['TARGET_ISA'] def define(isa): return isa.upper() + '_ISA' - + def namespace(isa): - return isa[0].upper() + isa[1:].lower() + 'ISA' + return isa[0].upper() + isa[1:].lower() + 'ISA' code = code_formatter() @@ -407,6 +407,51 @@ def makeTheISA(source, target, env): env.Command('config/the_isa.hh', map(Value, all_isa_list), MakeAction(makeTheISA, Transform("CFG ISA", 0))) +def makeTheGPUISA(source, target, env): + isas = [ src.get_contents() for src in source ] + target_gpu_isa = env['TARGET_GPU_ISA'] + def define(isa): + return isa.upper() + '_ISA' + + def namespace(isa): + return isa[0].upper() + isa[1:].lower() + 'ISA' + + + code = code_formatter() + code('''\ +#ifndef __CONFIG_THE_GPU_ISA_HH__ +#define __CONFIG_THE_GPU_ISA_HH__ + +''') + + # create defines for the preprocessing and compile-time determination + for i,isa in enumerate(isas): + code('#define $0 $1', define(isa), i + 1) + code() + + # create an enum for any run-time determination of the ISA, we + # reuse the same name as the namespaces + code('enum class GPUArch {') + for i,isa in enumerate(isas): + if i + 1 == len(isas): + code(' $0 = $1', namespace(isa), define(isa)) + else: + code(' $0 = $1,', namespace(isa), define(isa)) + code('};') + + code(''' + +#define THE_GPU_ISA ${{define(target_gpu_isa)}} +#define TheGpuISA ${{namespace(target_gpu_isa)}} +#define THE_GPU_ISA_STR "${{target_gpu_isa}}" + +#endif // __CONFIG_THE_GPU_ISA_HH__''') + + code.write(str(target[0])) + +env.Command('config/the_gpu_isa.hh', map(Value, all_gpu_isa_list), + MakeAction(makeTheGPUISA, Transform("CFG ISA", 0))) + ######################################################################## # # Prevent any SimObjects from being added after this point, they @@ -784,7 +829,7 @@ extern "C" { EmbeddedSwig embed_swig_${module}(init_${module}); ''') code.write(str(target[0])) - + # Build all swig modules for swig in SwigSource.all: env.Command([swig.cc_source.tnode, swig.py_source.tnode], swig.tnode, @@ -959,7 +1004,7 @@ const uint8_t data_${sym}[] = { x = array.array('B', data[i:i+step]) code(''.join('%d,' % d for d in x)) code.dedent() - + code('''}; EmbeddedPython embedded_${sym}( diff --git a/src/arch/SConscript b/src/arch/SConscript index e0d6845f5..b022cb01f 100644 --- a/src/arch/SConscript +++ b/src/arch/SConscript @@ -68,6 +68,14 @@ isa_switch_hdrs = Split(''' # Set up this directory to support switching headers make_switching_dir('arch', isa_switch_hdrs, env) +if env['BUILD_GPU']: + gpu_isa_switch_hdrs = Split(''' + gpu_decoder.hh + gpu_types.hh + ''') + + make_gpu_switching_dir('arch', gpu_isa_switch_hdrs, env) + ################################################################# # # Include architecture-specific files. diff --git a/src/arch/hsail/Brig.h b/src/arch/hsail/Brig.h new file mode 100644 index 000000000..b260157ab --- /dev/null +++ b/src/arch/hsail/Brig.h @@ -0,0 +1,67 @@ +// University of Illinois/NCSA +// Open Source License +// +// Copyright (c) 2013, Advanced Micro Devices, Inc. +// All rights reserved. 
+// +// Developed by: +// +// HSA Team +// +// Advanced Micro Devices, Inc +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal with +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of the LLVM Team, University of Illinois at +// Urbana-Champaign, nor the names of its contributors may be used to +// endorse or promote products derived from this Software without specific +// prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +// SOFTWARE. +#ifndef INTERNAL_BRIG_H +#define INTERNAL_BRIG_H + +#include <stdint.h> + +namespace Brig { +#include "Brig_new.hpp" + +// These typedefs provide some backward compatibility with earlier versions +// of Brig.h, reducing the number of code changes. The distinct names also +// increase legibility by showing the code's intent. +typedef BrigBase BrigDirective; +typedef BrigBase BrigOperand; + +enum BrigMemoryFenceSegments { // for internal use only + //.mnemo={ s/^BRIG_MEMORY_FENCE_SEGMENT_//;lc } + //.mnemo_token=_EMMemoryFenceSegments + //.mnemo_context=EInstModifierInstFenceContext + BRIG_MEMORY_FENCE_SEGMENT_GLOBAL = 0, + BRIG_MEMORY_FENCE_SEGMENT_GROUP = 1, + BRIG_MEMORY_FENCE_SEGMENT_IMAGE = 2, + BRIG_MEMORY_FENCE_SEGMENT_LAST = 3 //.skip +}; + +} + +#endif // defined(INTERNAL_BRIG_H) diff --git a/src/arch/hsail/Brig_new.hpp b/src/arch/hsail/Brig_new.hpp new file mode 100644 index 000000000..60e6f4dea --- /dev/null +++ b/src/arch/hsail/Brig_new.hpp @@ -0,0 +1,1587 @@ +// University of Illinois/NCSA +// Open Source License +// +// Copyright (c) 2013-2015, Advanced Micro Devices, Inc. +// All rights reserved. +// +// Developed by: +// +// HSA Team +// +// Advanced Micro Devices, Inc +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal with +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of the LLVM Team, University of Illinois at +// Urbana-Champaign, nor the names of its contributors may be used to +// endorse or promote products derived from this Software without specific +// prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +// SOFTWARE. + +//.ignore{ + +#ifndef INCLUDED_BRIG_H +#define INCLUDED_BRIG_H + +#include <stdint.h> + +enum BrigAuxDefs { + MAX_OPERANDS_NUM = 6 +}; + +//} + +typedef uint32_t BrigVersion32_t; + +enum BrigVersion { + + //.nowrap + //.nodump + //.nollvm + + BRIG_VERSION_HSAIL_MAJOR = 1, + BRIG_VERSION_HSAIL_MINOR = 0, + BRIG_VERSION_BRIG_MAJOR = 1, + BRIG_VERSION_BRIG_MINOR = 0 +}; + +typedef uint8_t BrigAlignment8_t; //.defValue=BRIG_ALIGNMENT_NONE + +typedef uint8_t BrigAllocation8_t; //.defValue=BRIG_ALLOCATION_NONE + +typedef uint8_t BrigAluModifier8_t; + +typedef uint8_t BrigAtomicOperation8_t; + +typedef uint32_t BrigCodeOffset32_t; //.defValue=0 //.wtype=ItemRef<Code> + +typedef uint8_t BrigCompareOperation8_t; + +typedef uint16_t BrigControlDirective16_t; + +typedef uint32_t BrigDataOffset32_t; + +typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; //.wtype=ListRef<Code> //.defValue=0 + +typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t; //.wtype=ListRef<Operand> //.defValue=0 + +typedef BrigDataOffset32_t BrigDataOffsetString32_t; //.wtype=StrRef //.defValue=0 + +typedef uint8_t BrigExecutableModifier8_t; + +typedef uint8_t BrigImageChannelOrder8_t; //.defValue=BRIG_CHANNEL_ORDER_UNKNOWN + +typedef uint8_t BrigImageChannelType8_t; //.defValue=BRIG_CHANNEL_TYPE_UNKNOWN + +typedef uint8_t BrigImageGeometry8_t; //.defValue=BRIG_GEOMETRY_UNKNOWN + +typedef uint8_t BrigImageQuery8_t; + +typedef uint16_t BrigKind16_t; + +typedef uint8_t BrigLinkage8_t; //.defValue=BRIG_LINKAGE_NONE + +typedef uint8_t BrigMachineModel8_t; //.defValue=BRIG_MACHINE_LARGE + +typedef uint8_t BrigMemoryModifier8_t; + +typedef uint8_t BrigMemoryOrder8_t; //.defValue=BRIG_MEMORY_ORDER_RELAXED + +typedef uint8_t BrigMemoryScope8_t; //.defValue=BRIG_MEMORY_SCOPE_SYSTEM + +typedef uint16_t BrigOpcode16_t; + +typedef uint32_t BrigOperandOffset32_t; //.defValue=0 //.wtype=ItemRef<Operand> + +typedef uint8_t BrigPack8_t; //.defValue=BRIG_PACK_NONE + +typedef uint8_t BrigProfile8_t; //.defValue=BRIG_PROFILE_FULL + +typedef uint16_t BrigRegisterKind16_t; + +typedef uint8_t BrigRound8_t; //.defValue=BRIG_ROUND_NONE + +typedef uint8_t BrigSamplerAddressing8_t; //.defValue=BRIG_ADDRESSING_CLAMP_TO_EDGE + +typedef uint8_t BrigSamplerCoordNormalization8_t; + +typedef uint8_t BrigSamplerFilter8_t; + +typedef uint8_t BrigSamplerQuery8_t; + +typedef uint32_t BrigSectionIndex32_t; + +typedef uint8_t BrigSegCvtModifier8_t; + +typedef uint8_t BrigSegment8_t; //.defValue=BRIG_SEGMENT_NONE + +typedef uint32_t BrigStringOffset32_t; //.defValue=0 //.wtype=StrRef + +typedef uint16_t 
BrigType16_t; + +typedef uint8_t BrigVariableModifier8_t; + +typedef uint8_t BrigWidth8_t; + +typedef uint32_t BrigExceptions32_t; + +enum BrigKind { + + //.nollvm + // + //.wname={ s/^BRIG_KIND//; MACRO2Name($_) } + //.mnemo=$wname{ $wname } + // + //.sizeof=$wname{ "sizeof(".$structs->{"Brig".$wname}->{rawbrig}.")" } + //.sizeof_switch //.sizeof_proto="int size_of_brig_record(unsigned arg)" //.sizeof_default="return -1" + // + //.isBodyOnly={ "false" } + //.isBodyOnly_switch //.isBodyOnly_proto="bool isBodyOnly(Directive d)" //.isBodyOnly_arg="d.kind()" + //.isBodyOnly_default="assert(false); return false" + // + //.isToplevelOnly={ "false" } + //.isToplevelOnly_switch //.isToplevelOnly_proto="bool isToplevelOnly(Directive d)" //.isToplevelOnly_arg="d.kind()" + //.isToplevelOnly_default="assert(false); return false" + + BRIG_KIND_NONE = 0x0000, //.skip + + BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, //.skip + BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_COMMENT = 0x1002, + BRIG_KIND_DIRECTIVE_CONTROL = 0x1003, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005, + BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_LABEL = 0x1009, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_LOC = 0x100a, + BRIG_KIND_DIRECTIVE_MODULE = 0x100b, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c, + BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e, + BRIG_KIND_DIRECTIVE_END = 0x100f, //.skip + + BRIG_KIND_INST_BEGIN = 0x2000, //.skip + BRIG_KIND_INST_ADDR = 0x2000, + BRIG_KIND_INST_ATOMIC = 0x2001, + BRIG_KIND_INST_BASIC = 0x2002, + BRIG_KIND_INST_BR = 0x2003, + BRIG_KIND_INST_CMP = 0x2004, + BRIG_KIND_INST_CVT = 0x2005, + BRIG_KIND_INST_IMAGE = 0x2006, + BRIG_KIND_INST_LANE = 0x2007, + BRIG_KIND_INST_MEM = 0x2008, + BRIG_KIND_INST_MEM_FENCE = 0x2009, + BRIG_KIND_INST_MOD = 0x200a, + BRIG_KIND_INST_QUERY_IMAGE = 0x200b, + BRIG_KIND_INST_QUERY_SAMPLER = 0x200c, + BRIG_KIND_INST_QUEUE = 0x200d, + BRIG_KIND_INST_SEG = 0x200e, + BRIG_KIND_INST_SEG_CVT = 0x200f, + BRIG_KIND_INST_SIGNAL = 0x2010, + BRIG_KIND_INST_SOURCE_TYPE = 0x2011, + BRIG_KIND_INST_END = 0x2012, //.skip + + BRIG_KIND_OPERAND_BEGIN = 0x3000, //.skip + BRIG_KIND_OPERAND_ADDRESS = 0x3000, + BRIG_KIND_OPERAND_ALIGN = 0x3001, + BRIG_KIND_OPERAND_CODE_LIST = 0x3002, + BRIG_KIND_OPERAND_CODE_REF = 0x3003, + BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004, + BRIG_KIND_OPERAND_RESERVED = 0x3005, //.skip + BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006, + BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007, + BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008, + BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009, + BRIG_KIND_OPERAND_REGISTER = 0x300a, + BRIG_KIND_OPERAND_STRING = 0x300b, + BRIG_KIND_OPERAND_WAVESIZE = 0x300c, + BRIG_KIND_OPERAND_END = 0x300d //.skip +}; + +enum BrigAlignment { + + //.mnemo={ s/^BRIG_ALIGNMENT_//; lc } + //.mnemo_proto="const char* align2str(unsigned arg)" + // + //.bytes={ /(\d+)/ ? 
$1 : undef } + //.bytes_switch //.bytes_proto="unsigned align2num(unsigned arg)" //.bytes_default="assert(false); return -1" + // + //.rbytes=$bytes{ $bytes } + //.rbytes_switch //.rbytes_reverse //.rbytes_proto="BrigAlignment num2align(uint64_t arg)" + //.rbytes_default="return BRIG_ALIGNMENT_LAST" + // + //.print=$bytes{ $bytes>1 ? "_align($bytes)" : "" } + + BRIG_ALIGNMENT_NONE = 0, //.no_mnemo + BRIG_ALIGNMENT_1 = 1, //.mnemo="" + BRIG_ALIGNMENT_2 = 2, + BRIG_ALIGNMENT_4 = 3, + BRIG_ALIGNMENT_8 = 4, + BRIG_ALIGNMENT_16 = 5, + BRIG_ALIGNMENT_32 = 6, + BRIG_ALIGNMENT_64 = 7, + BRIG_ALIGNMENT_128 = 8, + BRIG_ALIGNMENT_256 = 9, + + BRIG_ALIGNMENT_LAST, //.skip + BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_LAST - 1 //.skip +}; + +enum BrigAllocation { + + //.mnemo={ s/^BRIG_ALLOCATION_//;lc } + //.mnemo_token=EAllocKind + + BRIG_ALLOCATION_NONE = 0, //.mnemo="" + BRIG_ALLOCATION_PROGRAM = 1, + BRIG_ALLOCATION_AGENT = 2, + BRIG_ALLOCATION_AUTOMATIC = 3 +}; + +enum BrigAluModifierMask { + BRIG_ALU_FTZ = 1 +}; + +enum BrigAtomicOperation { + + //.tdcaption="Atomic Operations" + // + //.mnemo={ s/^BRIG_ATOMIC_//;lc } + //.mnemo_token=_EMAtomicOp + //.mnemo_context=EInstModifierInstAtomicContext + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_ATOMIC_ADD = 0, + BRIG_ATOMIC_AND = 1, + BRIG_ATOMIC_CAS = 2, + BRIG_ATOMIC_EXCH = 3, + BRIG_ATOMIC_LD = 4, + BRIG_ATOMIC_MAX = 5, + BRIG_ATOMIC_MIN = 6, + BRIG_ATOMIC_OR = 7, + BRIG_ATOMIC_ST = 8, + BRIG_ATOMIC_SUB = 9, + BRIG_ATOMIC_WRAPDEC = 10, + BRIG_ATOMIC_WRAPINC = 11, + BRIG_ATOMIC_XOR = 12, + BRIG_ATOMIC_WAIT_EQ = 13, + BRIG_ATOMIC_WAIT_NE = 14, + BRIG_ATOMIC_WAIT_LT = 15, + BRIG_ATOMIC_WAIT_GTE = 16, + BRIG_ATOMIC_WAITTIMEOUT_EQ = 17, + BRIG_ATOMIC_WAITTIMEOUT_NE = 18, + BRIG_ATOMIC_WAITTIMEOUT_LT = 19, + BRIG_ATOMIC_WAITTIMEOUT_GTE = 20 +}; + +enum BrigCompareOperation { + + //.tdcaption="Comparison Operators" + // + //.mnemo={ s/^BRIG_COMPARE_//;lc } + //.mnemo_token=_EMCompare + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_COMPARE_EQ = 0, + BRIG_COMPARE_NE = 1, + BRIG_COMPARE_LT = 2, + BRIG_COMPARE_LE = 3, + BRIG_COMPARE_GT = 4, + BRIG_COMPARE_GE = 5, + BRIG_COMPARE_EQU = 6, + BRIG_COMPARE_NEU = 7, + BRIG_COMPARE_LTU = 8, + BRIG_COMPARE_LEU = 9, + BRIG_COMPARE_GTU = 10, + BRIG_COMPARE_GEU = 11, + BRIG_COMPARE_NUM = 12, + BRIG_COMPARE_NAN = 13, + BRIG_COMPARE_SEQ = 14, + BRIG_COMPARE_SNE = 15, + BRIG_COMPARE_SLT = 16, + BRIG_COMPARE_SLE = 17, + BRIG_COMPARE_SGT = 18, + BRIG_COMPARE_SGE = 19, + BRIG_COMPARE_SGEU = 20, + BRIG_COMPARE_SEQU = 21, + BRIG_COMPARE_SNEU = 22, + BRIG_COMPARE_SLTU = 23, + BRIG_COMPARE_SLEU = 24, + BRIG_COMPARE_SNUM = 25, + BRIG_COMPARE_SNAN = 26, + BRIG_COMPARE_SGTU = 27 +}; + +enum BrigControlDirective { + + //.mnemo={ s/^BRIG_CONTROL_//;lc } + //.mnemo_token=EControl + // + //.print=$mnemo{ $mnemo } + + BRIG_CONTROL_NONE = 0, //.skip + BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1, + BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2, + BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3, + BRIG_CONTROL_MAXFLATGRIDSIZE = 4, + BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5, + BRIG_CONTROL_REQUIREDDIM = 6, + BRIG_CONTROL_REQUIREDGRIDSIZE = 7, + BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8, + BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9 +}; + +enum BrigExecutableModifierMask { + //.nodump + BRIG_EXECUTABLE_DEFINITION = 1 +}; + +enum BrigImageChannelOrder { + + //.mnemo={ s/^BRIG_CHANNEL_ORDER_?//;lc } + //.mnemo_token=EImageOrder + //.mnemo_context=EImageOrderContext + // + //.print=$mnemo{ $mnemo } + + BRIG_CHANNEL_ORDER_A = 0, + BRIG_CHANNEL_ORDER_R = 1, + BRIG_CHANNEL_ORDER_RX = 
2, + BRIG_CHANNEL_ORDER_RG = 3, + BRIG_CHANNEL_ORDER_RGX = 4, + BRIG_CHANNEL_ORDER_RA = 5, + BRIG_CHANNEL_ORDER_RGB = 6, + BRIG_CHANNEL_ORDER_RGBX = 7, + BRIG_CHANNEL_ORDER_RGBA = 8, + BRIG_CHANNEL_ORDER_BGRA = 9, + BRIG_CHANNEL_ORDER_ARGB = 10, + BRIG_CHANNEL_ORDER_ABGR = 11, + BRIG_CHANNEL_ORDER_SRGB = 12, + BRIG_CHANNEL_ORDER_SRGBX = 13, + BRIG_CHANNEL_ORDER_SRGBA = 14, + BRIG_CHANNEL_ORDER_SBGRA = 15, + BRIG_CHANNEL_ORDER_INTENSITY = 16, + BRIG_CHANNEL_ORDER_LUMINANCE = 17, + BRIG_CHANNEL_ORDER_DEPTH = 18, + BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19, + + // used internally + BRIG_CHANNEL_ORDER_UNKNOWN, //.mnemo="" // used when no order is specified + + BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 //.skip + +}; + +enum BrigImageChannelType { + + //.mnemo={ s/^BRIG_CHANNEL_TYPE_//;lc } + //.mnemo_token=EImageFormat + // + //.print=$mnemo{ $mnemo } + + BRIG_CHANNEL_TYPE_SNORM_INT8 = 0, + BRIG_CHANNEL_TYPE_SNORM_INT16 = 1, + BRIG_CHANNEL_TYPE_UNORM_INT8 = 2, + BRIG_CHANNEL_TYPE_UNORM_INT16 = 3, + BRIG_CHANNEL_TYPE_UNORM_INT24 = 4, + BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7, + BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8, + BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9, + BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10, + BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + BRIG_CHANNEL_TYPE_HALF_FLOAT = 14, + BRIG_CHANNEL_TYPE_FLOAT = 15, + + // used internally + BRIG_CHANNEL_TYPE_UNKNOWN, //.mnemo="" + + BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigImageGeometry { + + //.tdcaption="Geometry" + // + //.mnemo={ s/^BRIG_GEOMETRY_//;lc } + //.mnemo_token=EImageGeometry + // + //.dim={/_([0-9]+D)(A)?/ ? $1+(defined $2?1:0) : undef} + //.dim_switch //.dim_proto="unsigned getBrigGeometryDim(unsigned geo)" //.dim_arg="geo" + //.dim_default="assert(0); return 0" + // + //.depth={/DEPTH$/?"true":"false"} + //.depth_switch //.depth_proto="bool isBrigGeometryDepth(unsigned geo)" //.depth_arg="geo" + //.depth_default="return false" + + BRIG_GEOMETRY_1D = 0, + BRIG_GEOMETRY_2D = 1, + BRIG_GEOMETRY_3D = 2, + BRIG_GEOMETRY_1DA = 3, + BRIG_GEOMETRY_2DA = 4, + BRIG_GEOMETRY_1DB = 5, + BRIG_GEOMETRY_2DDEPTH = 6, + BRIG_GEOMETRY_2DADEPTH = 7, + + // used internally + BRIG_GEOMETRY_UNKNOWN, //.mnemo="" + + BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigImageQuery { + + //.mnemo={ s/^BRIG_IMAGE_QUERY_//;lc } + // + //.print=$mnemo{ $mnemo } + + BRIG_IMAGE_QUERY_WIDTH = 0, + BRIG_IMAGE_QUERY_HEIGHT = 1, + BRIG_IMAGE_QUERY_DEPTH = 2, + BRIG_IMAGE_QUERY_ARRAY = 3, + BRIG_IMAGE_QUERY_CHANNELORDER = 4, + BRIG_IMAGE_QUERY_CHANNELTYPE = 5, + BRIG_IMAGE_QUERY_NUMMIPLEVELS = 6 +}; + +enum BrigLinkage { + + //.mnemo={ s/^BRIG_LINKAGE_//;s/NONE//;lc } + + BRIG_LINKAGE_NONE = 0, + BRIG_LINKAGE_PROGRAM = 1, + BRIG_LINKAGE_MODULE = 2, + BRIG_LINKAGE_FUNCTION = 3, + BRIG_LINKAGE_ARG = 4 +}; + +enum BrigMachineModel { + + //.mnemo={ s/^BRIG_MACHINE_//; '$'.lc } + //.mnemo_token=ETargetMachine + // + //.print=$mnemo{ $mnemo } + + BRIG_MACHINE_SMALL = 0, + BRIG_MACHINE_LARGE = 1, + + BRIG_MACHINE_UNDEF = 2 //.skip +}; + +enum BrigMemoryModifierMask { //.tddef=0 + BRIG_MEMORY_CONST = 1 +}; + +enum BrigMemoryOrder { + + //.mnemo={ s/^BRIG_MEMORY_ORDER_//; lc } + //.mnemo_token=_EMMemoryOrder + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_MEMORY_ORDER_NONE = 0, //.mnemo="" + BRIG_MEMORY_ORDER_RELAXED = 1, //.mnemo=rlx + BRIG_MEMORY_ORDER_SC_ACQUIRE = 2, //.mnemo=scacq + 
BRIG_MEMORY_ORDER_SC_RELEASE = 3, //.mnemo=screl + BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4, //.mnemo=scar + + BRIG_MEMORY_ORDER_LAST = 5 //.skip +}; + +enum BrigMemoryScope { + + //.mnemo={ s/^BRIG_MEMORY_SCOPE_//; lc } + //.mnemo_token=_EMMemoryScope + // + //.print=$mnemo{ $mnemo } + + BRIG_MEMORY_SCOPE_NONE = 0, //.mnemo="" + BRIG_MEMORY_SCOPE_WORKITEM = 1, //.mnemo="" + BRIG_MEMORY_SCOPE_WAVEFRONT = 2, //.mnemo=wave + BRIG_MEMORY_SCOPE_WORKGROUP = 3, //.mnemo=wg + BRIG_MEMORY_SCOPE_AGENT = 4, //.mnemo=agent + BRIG_MEMORY_SCOPE_SYSTEM = 5, //.mnemo=system + + BRIG_MEMORY_SCOPE_LAST = 6 //.skip +}; + +enum BrigOpcode { + + //.tdcaption="Instruction Opcodes" + // + //.k={ "BASIC" } + //.pscode=$k{ MACRO2Name("_".$k) } + //.opcodeparser=$pscode{ return $pscode && "parseMnemo$pscode" } + //.opcodeparser_incfile=ParserUtilities + //.opcodeparser_switch //.opcodeparser_proto="OpcodeParser getOpcodeParser(BrigOpcode16_t arg)" //.opcodeparser_default="return parseMnemoBasic" + // + //.psopnd={undef} + //.opndparser=$psopnd{ return $psopnd && "&Parser::parse$psopnd" } + //.opndparser_incfile=ParserUtilities + //.opndparser_switch //.opndparser_proto="Parser::OperandParser Parser::getOperandParser(BrigOpcode16_t arg)" //.opndparser_default="return &Parser::parseOperands" + // + //.mnemo={ s/^BRIG_OPCODE_//; s/GCN([^_])/GCN_$1/; lc } + //.mnemo_scanner=Instructions //.mnemo_token=EInstruction + //.mnemo_context=EDefaultContext + // + //.has_memory_order={undef} + //.semsupport=$has_memory_order{ return $has_memory_order && "true" } + // + //.hasType=$k{ return ($k and $k eq "BASIC_NO_TYPE") ? "false" : undef; } + //.hasType_switch //.hasType_proto="bool instHasType(BrigOpcode16_t arg)" //.hasType_default="return true" + // + //.opcodevis=$pscode{ s/^BRIG_OPCODE_//; sprintf("%-47s(","vis.visitOpcode_".$_) . ($pscode =~m/^(BasicOrMod|Nop)$/? "inst" : "HSAIL_ASM::Inst". ($pscode=~m/BasicNoType/? "Basic":$pscode) ."(inst)").")" } + //.opcodevis_switch //.opcodevis_proto="template <typename RetType, typename Visitor> RetType visitOpcode_gen(HSAIL_ASM::Inst inst, Visitor& vis)" + //.opcodevis_arg="inst.opcode()" //.opcodevis_default="return RetType()" + //.opcodevis_incfile=ItemUtils + // + //.ftz=$k{ return ($k eq "BASIC_OR_MOD" or $k eq "CMP" or $k eq "CVT") ? 
"true" : undef } + //.ftz_incfile=ItemUtils //.ftz_switch //.ftz_proto="inline bool instSupportsFtz(BrigOpcode16_t arg)" //.ftz_default="return false" + // + //.vecOpndIndex={undef} + //.vecOpndIndex_switch //.vecOpndIndex_proto="int vecOpndIndex(BrigOpcode16_t arg)" //.vecOpndIndex_default="return -1" + //.vecOpndIndex_incfile=ParserUtilities + // + //.numdst={undef} + //.numdst_switch //.numdst_proto="int instNumDstOperands(BrigOpcode16_t arg)" //.numdst_default="return 1" + // + //.print=$mnemo{ $mnemo } + + BRIG_OPCODE_NOP = 0, //.k=NOP //.hasType=false + BRIG_OPCODE_ABS = 1, //.k=BASIC_OR_MOD + BRIG_OPCODE_ADD = 2, //.k=BASIC_OR_MOD + BRIG_OPCODE_BORROW = 3, + BRIG_OPCODE_CARRY = 4, + BRIG_OPCODE_CEIL = 5, //.k=BASIC_OR_MOD + BRIG_OPCODE_COPYSIGN = 6, //.k=BASIC_OR_MOD + BRIG_OPCODE_DIV = 7, //.k=BASIC_OR_MOD + BRIG_OPCODE_FLOOR = 8, //.k=BASIC_OR_MOD + BRIG_OPCODE_FMA = 9, //.k=BASIC_OR_MOD + BRIG_OPCODE_FRACT = 10, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAD = 11, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAX = 12, //.k=BASIC_OR_MOD + BRIG_OPCODE_MIN = 13, //.k=BASIC_OR_MOD + BRIG_OPCODE_MUL = 14, //.k=BASIC_OR_MOD + BRIG_OPCODE_MULHI = 15, //.k=BASIC_OR_MOD + BRIG_OPCODE_NEG = 16, //.k=BASIC_OR_MOD + BRIG_OPCODE_REM = 17, + BRIG_OPCODE_RINT = 18, //.k=BASIC_OR_MOD + BRIG_OPCODE_SQRT = 19, //.k=BASIC_OR_MOD + BRIG_OPCODE_SUB = 20, //.k=BASIC_OR_MOD + BRIG_OPCODE_TRUNC = 21, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAD24 = 22, + BRIG_OPCODE_MAD24HI = 23, + BRIG_OPCODE_MUL24 = 24, + BRIG_OPCODE_MUL24HI = 25, + BRIG_OPCODE_SHL = 26, + BRIG_OPCODE_SHR = 27, + BRIG_OPCODE_AND = 28, + BRIG_OPCODE_NOT = 29, + BRIG_OPCODE_OR = 30, + BRIG_OPCODE_POPCOUNT = 31, //.k=SOURCE_TYPE + BRIG_OPCODE_XOR = 32, + BRIG_OPCODE_BITEXTRACT = 33, + BRIG_OPCODE_BITINSERT = 34, + BRIG_OPCODE_BITMASK = 35, + BRIG_OPCODE_BITREV = 36, + BRIG_OPCODE_BITSELECT = 37, + BRIG_OPCODE_FIRSTBIT = 38, //.k=SOURCE_TYPE + BRIG_OPCODE_LASTBIT = 39, //.k=SOURCE_TYPE + BRIG_OPCODE_COMBINE = 40, //.k=SOURCE_TYPE //.vecOpndIndex=1 + BRIG_OPCODE_EXPAND = 41, //.k=SOURCE_TYPE //.vecOpndIndex=0 + BRIG_OPCODE_LDA = 42, //.k=ADDR + BRIG_OPCODE_MOV = 43, + BRIG_OPCODE_SHUFFLE = 44, + BRIG_OPCODE_UNPACKHI = 45, + BRIG_OPCODE_UNPACKLO = 46, + BRIG_OPCODE_PACK = 47, //.k=SOURCE_TYPE + BRIG_OPCODE_UNPACK = 48, //.k=SOURCE_TYPE + BRIG_OPCODE_CMOV = 49, + BRIG_OPCODE_CLASS = 50, //.k=SOURCE_TYPE + BRIG_OPCODE_NCOS = 51, + BRIG_OPCODE_NEXP2 = 52, + BRIG_OPCODE_NFMA = 53, + BRIG_OPCODE_NLOG2 = 54, + BRIG_OPCODE_NRCP = 55, + BRIG_OPCODE_NRSQRT = 56, + BRIG_OPCODE_NSIN = 57, + BRIG_OPCODE_NSQRT = 58, + BRIG_OPCODE_BITALIGN = 59, + BRIG_OPCODE_BYTEALIGN = 60, + BRIG_OPCODE_PACKCVT = 61, //.k=SOURCE_TYPE + BRIG_OPCODE_UNPACKCVT = 62, //.k=SOURCE_TYPE + BRIG_OPCODE_LERP = 63, + BRIG_OPCODE_SAD = 64, //.k=SOURCE_TYPE + BRIG_OPCODE_SADHI = 65, //.k=SOURCE_TYPE + BRIG_OPCODE_SEGMENTP = 66, //.k=SEG_CVT + BRIG_OPCODE_FTOS = 67, //.k=SEG_CVT + BRIG_OPCODE_STOF = 68, //.k=SEG_CVT + BRIG_OPCODE_CMP = 69, //.k=CMP + BRIG_OPCODE_CVT = 70, //.k=CVT + BRIG_OPCODE_LD = 71, //.k=MEM //.has_memory_order //.vecOpndIndex=0 + BRIG_OPCODE_ST = 72, //.k=MEM //.has_memory_order //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_ATOMIC = 73, //.k=ATOMIC + BRIG_OPCODE_ATOMICNORET = 74, //.k=ATOMIC //.numdst=0 + BRIG_OPCODE_SIGNAL = 75, //.k=SIGNAL + BRIG_OPCODE_SIGNALNORET = 76, //.k=SIGNAL //.numdst=0 + BRIG_OPCODE_MEMFENCE = 77, //.k=MEM_FENCE //.numdst=0 + BRIG_OPCODE_RDIMAGE = 78, //.k=IMAGE //.vecOpndIndex=0 + BRIG_OPCODE_LDIMAGE = 79, //.k=IMAGE //.vecOpndIndex=0 + BRIG_OPCODE_STIMAGE = 80, 
//.k=IMAGE //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_IMAGEFENCE = 81, //.k=BASIC_NO_TYPE + BRIG_OPCODE_QUERYIMAGE = 82, //.k=QUERY_IMAGE + BRIG_OPCODE_QUERYSAMPLER = 83, //.k=QUERY_SAMPLER + BRIG_OPCODE_CBR = 84, //.k=BR //.numdst=0 + BRIG_OPCODE_BR = 85, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_SBR = 86, //.k=BR //.numdst=0 //.psopnd=SbrOperands + BRIG_OPCODE_BARRIER = 87, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_WAVEBARRIER = 88, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_ARRIVEFBAR = 89, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_INITFBAR = 90, //.k=BASIC_NO_TYPE //.numdst=0 //.hasType=false + BRIG_OPCODE_JOINFBAR = 91, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_LEAVEFBAR = 92, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_RELEASEFBAR = 93, //.k=BASIC_NO_TYPE //.numdst=0 + BRIG_OPCODE_WAITFBAR = 94, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_LDF = 95, + BRIG_OPCODE_ACTIVELANECOUNT = 96, //.k=LANE + BRIG_OPCODE_ACTIVELANEID = 97, //.k=LANE + BRIG_OPCODE_ACTIVELANEMASK = 98, //.k=LANE //.vecOpndIndex=0 + BRIG_OPCODE_ACTIVELANEPERMUTE = 99, //.k=LANE + BRIG_OPCODE_CALL = 100, //.k=BR //.psopnd=CallOperands //.numdst=0 //.hasType=false + BRIG_OPCODE_SCALL = 101, //.k=BR //.psopnd=CallOperands //.numdst=0 + BRIG_OPCODE_ICALL = 102, //.k=BR //.psopnd=CallOperands //.numdst=0 + BRIG_OPCODE_RET = 103, //.k=BASIC_NO_TYPE + BRIG_OPCODE_ALLOCA = 104, //.k=MEM + BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105, + BRIG_OPCODE_CURRENTWORKITEMFLATID = 106, + BRIG_OPCODE_DIM = 107, + BRIG_OPCODE_GRIDGROUPS = 108, + BRIG_OPCODE_GRIDSIZE = 109, + BRIG_OPCODE_PACKETCOMPLETIONSIG = 110, + BRIG_OPCODE_PACKETID = 111, + BRIG_OPCODE_WORKGROUPID = 112, + BRIG_OPCODE_WORKGROUPSIZE = 113, + BRIG_OPCODE_WORKITEMABSID = 114, + BRIG_OPCODE_WORKITEMFLATABSID = 115, + BRIG_OPCODE_WORKITEMFLATID = 116, + BRIG_OPCODE_WORKITEMID = 117, + BRIG_OPCODE_CLEARDETECTEXCEPT = 118, //.numdst=0 + BRIG_OPCODE_GETDETECTEXCEPT = 119, + BRIG_OPCODE_SETDETECTEXCEPT = 120, //.numdst=0 + BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121, //.k=QUEUE + BRIG_OPCODE_CASQUEUEWRITEINDEX = 122, //.k=QUEUE + BRIG_OPCODE_LDQUEUEREADINDEX = 123, //.k=QUEUE + BRIG_OPCODE_LDQUEUEWRITEINDEX = 124, //.k=QUEUE + BRIG_OPCODE_STQUEUEREADINDEX = 125, //.k=QUEUE //.numdst=0 + BRIG_OPCODE_STQUEUEWRITEINDEX = 126, //.k=QUEUE //.numdst=0 + BRIG_OPCODE_CLOCK = 127, + BRIG_OPCODE_CUID = 128, + BRIG_OPCODE_DEBUGTRAP = 129, //.numdst=0 + BRIG_OPCODE_GROUPBASEPTR = 130, + BRIG_OPCODE_KERNARGBASEPTR = 131, + BRIG_OPCODE_LANEID = 132, + BRIG_OPCODE_MAXCUID = 133, + BRIG_OPCODE_MAXWAVEID = 134, + BRIG_OPCODE_NULLPTR = 135, //.k=SEG + BRIG_OPCODE_WAVEID = 136, + BRIG_OPCODE_FIRST_USER_DEFINED = 32768, //.skip + + BRIG_OPCODE_GCNMADU = (1u << 15) | 0, //.k=BASIC_NO_TYPE + BRIG_OPCODE_GCNMADS = (1u << 15) | 1, //.k=BASIC_NO_TYPE + BRIG_OPCODE_GCNMAX3 = (1u << 15) | 2, + BRIG_OPCODE_GCNMIN3 = (1u << 15) | 3, + BRIG_OPCODE_GCNMED3 = (1u << 15) | 4, + BRIG_OPCODE_GCNFLDEXP = (1u << 15) | 5, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNFREXP_EXP = (1u << 15) | 6, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNFREXP_MANT = (1u << 15) | 7, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNTRIG_PREOP = (1u << 15) | 8, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNBFM = (1u << 15) | 9, + BRIG_OPCODE_GCNLD = (1u << 15) | 10, //.k=MEM //.has_memory_order //.vecOpndIndex=0 + BRIG_OPCODE_GCNST = (1u << 15) | 11, //.k=MEM //.has_memory_order //.vecOpndIndex=0 + BRIG_OPCODE_GCNATOMIC = (1u << 15) | 12, //.k=ATOMIC + BRIG_OPCODE_GCNATOMICNORET = (1u << 15) | 13, //.k=ATOMIC 
//.mnemo=gcn_atomicNoRet + BRIG_OPCODE_GCNSLEEP = (1u << 15) | 14, + BRIG_OPCODE_GCNPRIORITY = (1u << 15) | 15, + BRIG_OPCODE_GCNREGIONALLOC = (1u << 15) | 16, //.k=BASIC_NO_TYPE //.mnemo=gcn_region_alloc + BRIG_OPCODE_GCNMSAD = (1u << 15) | 17, + BRIG_OPCODE_GCNQSAD = (1u << 15) | 18, + BRIG_OPCODE_GCNMQSAD = (1u << 15) | 19, + BRIG_OPCODE_GCNMQSAD4 = (1u << 15) | 20, //.k=BASIC_NO_TYPE + BRIG_OPCODE_GCNSADW = (1u << 15) | 21, + BRIG_OPCODE_GCNSADD = (1u << 15) | 22, + BRIG_OPCODE_GCNCONSUME = (1u << 15) | 23, //.k=ADDR //.mnemo=gcn_atomic_consume + BRIG_OPCODE_GCNAPPEND = (1u << 15) | 24, //.k=ADDR //.mnemo=gcn_atomic_append + BRIG_OPCODE_GCNB4XCHG = (1u << 15) | 25, //.mnemo=gcn_b4xchg + BRIG_OPCODE_GCNB32XCHG = (1u << 15) | 26, //.mnemo=gcn_b32xchg + BRIG_OPCODE_GCNMAX = (1u << 15) | 27, + BRIG_OPCODE_GCNMIN = (1u << 15) | 28, + BRIG_OPCODE_GCNDIVRELAXED = (1u << 15) | 29, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNDIVRELAXEDNARROW = (1u << 15) | 30, + + BRIG_OPCODE_AMDRDIMAGELOD = (1u << 15) | 31, //.k=IMAGE //.mnemo=amd_rdimagelod //.vecOpndIndex=0 + BRIG_OPCODE_AMDRDIMAGEGRAD = (1u << 15) | 32, //.k=IMAGE //.mnemo=amd_rdimagegrad //.vecOpndIndex=0 + BRIG_OPCODE_AMDLDIMAGEMIP = (1u << 15) | 33, //.k=IMAGE //.mnemo=amd_ldimagemip //.vecOpndIndex=0 + BRIG_OPCODE_AMDSTIMAGEMIP = (1u << 15) | 34, //.k=IMAGE //.mnemo=amd_stimagemip //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_AMDQUERYIMAGE = (1u << 15) | 35 //.k=QUERY_IMAGE //.mnemo=amd_queryimage +}; + +enum BrigPack { + + //.tdcaption="Packing" + // + //.mnemo={ s/^BRIG_PACK_//;s/SAT$/_sat/;lc } + //.mnemo_token=_EMPacking + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_PACK_NONE = 0, //.mnemo="" + BRIG_PACK_PP = 1, + BRIG_PACK_PS = 2, + BRIG_PACK_SP = 3, + BRIG_PACK_SS = 4, + BRIG_PACK_S = 5, + BRIG_PACK_P = 6, + BRIG_PACK_PPSAT = 7, + BRIG_PACK_PSSAT = 8, + BRIG_PACK_SPSAT = 9, + BRIG_PACK_SSSAT = 10, + BRIG_PACK_SSAT = 11, + BRIG_PACK_PSAT = 12 +}; + +enum BrigProfile { + + //.mnemo={ s/^BRIG_PROFILE_//;'$'.lc } + //.mnemo_token=ETargetProfile + // + //.print=$mnemo{ $mnemo } + + BRIG_PROFILE_BASE = 0, + BRIG_PROFILE_FULL = 1, + + BRIG_PROFILE_UNDEF = 2 //.skip +}; + +enum BrigRegisterKind { + + //.mnemo={ s/^BRIG_REGISTER_KIND_//;'$'.lc(substr($_,0,1)) } + // + //.bits={ } + //.bits_switch //.bits_proto="unsigned getRegBits(BrigRegisterKind16_t arg)" //.bits_default="return (unsigned)-1" + // + //.nollvm + + BRIG_REGISTER_KIND_CONTROL = 0, //.bits=1 + BRIG_REGISTER_KIND_SINGLE = 1, //.bits=32 + BRIG_REGISTER_KIND_DOUBLE = 2, //.bits=64 + BRIG_REGISTER_KIND_QUAD = 3 //.bits=128 +}; + +enum BrigRound { + + //.mnemo={} + //.mnemo_fn=round2str //.mnemo_token=_EMRound + // + //.sat={/_SAT$/? "true" : "false"} + //.sat_switch //.sat_proto="bool isSatRounding(unsigned rounding)" //.sat_arg="rounding" + //.sat_default="return false" + // + //.sig={/_SIGNALING_/? "true" : "false"} + //.sig_switch //.sig_proto="bool isSignalingRounding(unsigned rounding)" //.sig_arg="rounding" + //.sig_default="return false" + // + //.int={/_INTEGER_/? "true" : "false"} + //.int_switch //.int_proto="bool isIntRounding(unsigned rounding)" //.int_arg="rounding" + //.int_default="return false" + // + //.flt={/_FLOAT_/? 
"true" : "false"} + //.flt_switch //.flt_proto="bool isFloatRounding(unsigned rounding)" //.flt_arg="rounding" + //.flt_default="return false" + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_ROUND_NONE = 0, //.no_mnemo + BRIG_ROUND_FLOAT_DEFAULT = 1, //.no_mnemo + BRIG_ROUND_FLOAT_NEAR_EVEN = 2, //.mnemo=near + BRIG_ROUND_FLOAT_ZERO = 3, //.mnemo=zero + BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, //.mnemo=up + BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, //.mnemo=down + BRIG_ROUND_INTEGER_NEAR_EVEN = 6, //.mnemo=neari + BRIG_ROUND_INTEGER_ZERO = 7, //.mnemo=zeroi + BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, //.mnemo=upi + BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, //.mnemo=downi + BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, //.mnemo=neari_sat + BRIG_ROUND_INTEGER_ZERO_SAT = 11, //.mnemo=zeroi_sat + BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, //.mnemo=upi_sat + BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, //.mnemo=downi_sat + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, //.mnemo=sneari + BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, //.mnemo=szeroi + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, //.mnemo=supi + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, //.mnemo=sdowni + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, //.mnemo=sneari_sat + BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, //.mnemo=szeroi_sat + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, //.mnemo=supi_sat + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 //.mnemo=sdowni_sat +}; + +enum BrigSamplerAddressing { + + //.mnemo={ s/^BRIG_ADDRESSING_//;lc } + //.mnemo_token=ESamplerAddressingMode + + BRIG_ADDRESSING_UNDEFINED = 0, + BRIG_ADDRESSING_CLAMP_TO_EDGE = 1, + BRIG_ADDRESSING_CLAMP_TO_BORDER = 2, + BRIG_ADDRESSING_REPEAT = 3, + BRIG_ADDRESSING_MIRRORED_REPEAT = 4, + + BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigSamplerCoordNormalization { + + //.mnemo={ s/^BRIG_COORD_//;lc } + //.mnemo_token=ESamplerCoord + // + //.print=$mnemo{ $mnemo } + + BRIG_COORD_UNNORMALIZED = 0, + BRIG_COORD_NORMALIZED = 1 +}; + +enum BrigSamplerFilter { + + //.mnemo={ s/^BRIG_FILTER_//;lc } + // + //.print=$mnemo{ $mnemo } + + BRIG_FILTER_NEAREST = 0, + BRIG_FILTER_LINEAR = 1, + + BRIG_FILTER_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigSamplerQuery { + + //.mnemo={ s/^BRIG_SAMPLER_QUERY_//;lc } + //.mnemo_token=_EMSamplerQuery + // + //.print=$mnemo{ $mnemo } + + BRIG_SAMPLER_QUERY_ADDRESSING = 0, + BRIG_SAMPLER_QUERY_COORD = 1, + BRIG_SAMPLER_QUERY_FILTER = 2 +}; + +enum BrigSectionIndex { + + //.nollvm + // + //.mnemo={ s/^BRIG_SECTION_INDEX_/HSA_/;lc } + + BRIG_SECTION_INDEX_DATA = 0, + BRIG_SECTION_INDEX_CODE = 1, + BRIG_SECTION_INDEX_OPERAND = 2, + BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3, + + // used internally + BRIG_SECTION_INDEX_IMPLEMENTATION_DEFINED = BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED //.skip +}; + +enum BrigSegCvtModifierMask { + BRIG_SEG_CVT_NONULL = 1 //.mnemo="nonull" //.print="_nonull" +}; + +enum BrigSegment { + + //.mnemo={ s/^BRIG_SEGMENT_//;lc} + //.mnemo_token=_EMSegment + //.mnemo_context=EInstModifierContext + // + //.print=$mnemo{ $mnemo ? 
"_$mnemo" : "" } + + BRIG_SEGMENT_NONE = 0, //.mnemo="" + BRIG_SEGMENT_FLAT = 1, //.mnemo="" + BRIG_SEGMENT_GLOBAL = 2, + BRIG_SEGMENT_READONLY = 3, + BRIG_SEGMENT_KERNARG = 4, + BRIG_SEGMENT_GROUP = 5, + BRIG_SEGMENT_PRIVATE = 6, + BRIG_SEGMENT_SPILL = 7, + BRIG_SEGMENT_ARG = 8, + + BRIG_SEGMENT_FIRST_USER_DEFINED = 128, //.skip + + BRIG_SEGMENT_AMD_GCN = 9, //.mnemo="region" +}; + +enum BrigPackedTypeBits { + + //.nodump + // + //.nollvm + + BRIG_TYPE_BASE_SIZE = 5, + BRIG_TYPE_PACK_SIZE = 2, + BRIG_TYPE_ARRAY_SIZE = 1, + + BRIG_TYPE_BASE_SHIFT = 0, + BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, + BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, + + BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT, + BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, + + BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT, + + BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT +}; + +enum BrigType { + + //.numBits={ /ARRAY$/ ? undef : /([0-9]+)X([0-9]+)/ ? $1*$2 : /([0-9]+)/ ? $1 : undef } + //.numBits_switch //.numBits_proto="unsigned getBrigTypeNumBits(unsigned arg)" //.numBits_default="assert(0); return 0" + //.numBytes=$numBits{ $numBits > 1 ? $numBits/8 : undef } + //.numBytes_switch //.numBytes_proto="unsigned getBrigTypeNumBytes(unsigned arg)" //.numBytes_default="assert(0); return 0" + // + //.mnemo={ s/^BRIG_TYPE_//;lc } + //.mnemo_token=_EMType + // + //.array={/ARRAY$/?"true":"false"} + //.array_switch //.array_proto="bool isArrayType(unsigned type)" //.array_arg="type" + //.array_default="return false" + // + //.a2e={/(.*)_ARRAY$/? $1 : "BRIG_TYPE_NONE"} + //.a2e_switch //.a2e_proto="unsigned arrayType2elementType(unsigned type)" //.a2e_arg="type" + //.a2e_default="return BRIG_TYPE_NONE" + // + //.e2a={/_ARRAY$/? "BRIG_TYPE_NONE" : /_NONE$/ ? "BRIG_TYPE_NONE" : /_B1$/ ? "BRIG_TYPE_NONE" : $_ . "_ARRAY"} + //.e2a_switch //.e2a_proto="unsigned elementType2arrayType(unsigned type)" //.e2a_arg="type" + //.e2a_default="return BRIG_TYPE_NONE" + // + //.t2s={s/^BRIG_TYPE_//;lc s/_ARRAY$/[]/;lc} + //.t2s_switch //.t2s_proto="const char* type2name(unsigned type)" //.t2s_arg="type" + //.t2s_default="return NULL" + // + //.dispatch_switch //.dispatch_incfile=TemplateUtilities + //.dispatch_proto="template<typename RetType, typename Visitor>\nRetType dispatchByType_gen(unsigned type, Visitor& v)" + //.dispatch={ /ARRAY$/ ? "v.visitNone(type)" : /^BRIG_TYPE_([BUSF]|SIG)[0-9]+/ ? 
"v.template visit< BrigTypeTraits<$_> >()" : "v.visitNone(type)" } + //.dispatch_arg="type" //.dispatch_default="return v.visitNone(type)" + // + //- .tdname=BrigType + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_TYPE_NONE = 0, //.mnemo="" //.print="" + BRIG_TYPE_U8 = 1, //.ctype=uint8_t + BRIG_TYPE_U16 = 2, //.ctype=uint16_t + BRIG_TYPE_U32 = 3, //.ctype=uint32_t + BRIG_TYPE_U64 = 4, //.ctype=uint64_t + BRIG_TYPE_S8 = 5, //.ctype=int8_t + BRIG_TYPE_S16 = 6, //.ctype=int16_t + BRIG_TYPE_S32 = 7, //.ctype=int32_t + BRIG_TYPE_S64 = 8, //.ctype=int64_t + BRIG_TYPE_F16 = 9, //.ctype=f16_t + BRIG_TYPE_F32 = 10, //.ctype=float + BRIG_TYPE_F64 = 11, //.ctype=double + BRIG_TYPE_B1 = 12, //.ctype=bool //.numBytes=1 + BRIG_TYPE_B8 = 13, //.ctype=uint8_t + BRIG_TYPE_B16 = 14, //.ctype=uint16_t + BRIG_TYPE_B32 = 15, //.ctype=uint32_t + BRIG_TYPE_B64 = 16, //.ctype=uint64_t + BRIG_TYPE_B128 = 17, //.ctype=b128_t + BRIG_TYPE_SAMP = 18, //.mnemo=samp //.numBits=64 + BRIG_TYPE_ROIMG = 19, //.mnemo=roimg //.numBits=64 + BRIG_TYPE_WOIMG = 20, //.mnemo=woimg //.numBits=64 + BRIG_TYPE_RWIMG = 21, //.mnemo=rwimg //.numBits=64 + BRIG_TYPE_SIG32 = 22, //.mnemo=sig32 //.numBits=64 + BRIG_TYPE_SIG64 = 23, //.mnemo=sig64 //.numBits=64 + + BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, //.ctype=uint8_t + BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, //.ctype=uint8_t + BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, //.ctype=uint8_t + BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, //.ctype=uint16_t + BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64, //.ctype=uint16_t + BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, //.ctype=uint16_t + BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, //.ctype=uint32_t + BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, //.ctype=uint32_t + BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, //.ctype=uint64_t + BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, //.ctype=int8_t + BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, //.ctype=int8_t + BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, //.ctype=int8_t + BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, //.ctype=int16_t + BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64, //.ctype=int16_t + BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, //.ctype=int16_t + BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, //.ctype=int32_t + BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, //.ctype=int32_t + BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, //.ctype=int64_t + BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, //.ctype=f16_t + BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, //.ctype=f16_t + BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, //.ctype=f16_t + BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, //.ctype=float + BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, //.ctype=float + BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, //.ctype=double + + BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 
| BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + + // Used internally + BRIG_TYPE_INVALID = (unsigned) -1 //.skip +}; + +enum BrigVariableModifierMask { + + //.nodump + + BRIG_VARIABLE_DEFINITION = 1, + BRIG_VARIABLE_CONST = 2 +}; + +enum BrigWidth { + + //.tddef=1 + // + //.print={ s/^BRIG_WIDTH_//; "_width($_)" } + + BRIG_WIDTH_NONE = 0, + BRIG_WIDTH_1 = 1, + 
BRIG_WIDTH_2 = 2, + BRIG_WIDTH_4 = 3, + BRIG_WIDTH_8 = 4, + BRIG_WIDTH_16 = 5, + BRIG_WIDTH_32 = 6, + BRIG_WIDTH_64 = 7, + BRIG_WIDTH_128 = 8, + BRIG_WIDTH_256 = 9, + BRIG_WIDTH_512 = 10, + BRIG_WIDTH_1024 = 11, + BRIG_WIDTH_2048 = 12, + BRIG_WIDTH_4096 = 13, + BRIG_WIDTH_8192 = 14, + BRIG_WIDTH_16384 = 15, + BRIG_WIDTH_32768 = 16, + BRIG_WIDTH_65536 = 17, + BRIG_WIDTH_131072 = 18, + BRIG_WIDTH_262144 = 19, + BRIG_WIDTH_524288 = 20, + BRIG_WIDTH_1048576 = 21, + BRIG_WIDTH_2097152 = 22, + BRIG_WIDTH_4194304 = 23, + BRIG_WIDTH_8388608 = 24, + BRIG_WIDTH_16777216 = 25, + BRIG_WIDTH_33554432 = 26, + BRIG_WIDTH_67108864 = 27, + BRIG_WIDTH_134217728 = 28, + BRIG_WIDTH_268435456 = 29, + BRIG_WIDTH_536870912 = 30, + BRIG_WIDTH_1073741824 = 31, + BRIG_WIDTH_2147483648 = 32, + BRIG_WIDTH_WAVESIZE = 33, + BRIG_WIDTH_ALL = 34, + + BRIG_WIDTH_LAST //.skip +}; + +struct BrigUInt64 { //.isroot //.standalone + uint32_t lo; //.defValue=0 + uint32_t hi; //.defValue=0 + + //+hcode KLASS& operator=(uint64_t rhs); + //+hcode operator uint64_t(); + //+implcode inline KLASS& KLASS::operator=(uint64_t rhs) { lo() = (uint32_t)rhs; hi() = (uint32_t)(rhs >> 32); return *this; } + //+implcode inline KLASS::operator uint64_t() { return ((uint64_t)hi()) << 32 | lo(); } +}; + +struct BrigAluModifier { //.isroot //.standalone + BrigAluModifier8_t allBits; //.defValue=0 + //^^ bool ftz; //.wtype=BitValRef<0> +}; + +struct BrigBase { //.nowrap + uint16_t byteCount; + BrigKind16_t kind; +}; + +//.alias Code:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_CODE }; +//.alias Directive:Code { //.generic }; +//.alias Operand:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_OPERAND }; + +struct BrigData { + //.nowrap + uint32_t byteCount; + uint8_t bytes[1]; +}; + +struct BrigExecutableModifier { //.isroot //.standalone + BrigExecutableModifier8_t allBits; //.defValue=0 + //^^ bool isDefinition; //.wtype=BitValRef<0> +}; + +struct BrigMemoryModifier { //.isroot //.standalone + BrigMemoryModifier8_t allBits; //.defValue=0 + //^^ bool isConst; //.wtype=BitValRef<0> +}; + +struct BrigSegCvtModifier { //.isroot //.standalone + BrigSegCvtModifier8_t allBits; //.defValue=0 + //^^ bool isNoNull; //.wtype=BitValRef<0> +}; + +struct BrigVariableModifier { //.isroot //.standalone + BrigVariableModifier8_t allBits; //.defValue=0 + + //^^ bool isDefinition; //.wtype=BitValRef<0> + //^^ bool isConst; //.wtype=BitValRef<1> +}; + +struct BrigDirectiveArgBlockEnd { + BrigBase base; +}; + +struct BrigDirectiveArgBlockStart { + BrigBase base; +}; + +struct BrigDirectiveComment { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveControl { + BrigBase base; + BrigControlDirective16_t control; + uint16_t reserved; //.defValue=0 + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveExecutable { //.generic + BrigBase base; + BrigDataOffsetString32_t name; + uint16_t outArgCount; //.defValue=0 + uint16_t inArgCount; //.defValue=0 + BrigCodeOffset32_t firstInArg; + BrigCodeOffset32_t firstCodeBlockEntry; + BrigCodeOffset32_t nextModuleEntry; + BrigExecutableModifier modifier; //.acc=subItem<ExecutableModifier> //.wtype=ExecutableModifier + BrigLinkage8_t linkage; + uint16_t reserved; //.defValue=0 +}; + +//.alias DirectiveKernel:DirectiveExecutable { }; +//.alias DirectiveFunction:DirectiveExecutable { }; +//.alias DirectiveSignature:DirectiveExecutable { }; +//.alias DirectiveIndirectFunction:DirectiveExecutable { }; + +struct BrigDirectiveExtension { + BrigBase base; + BrigDataOffsetString32_t 
name; +}; + +struct BrigDirectiveFbarrier { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier + BrigLinkage8_t linkage; + uint16_t reserved; //.defValue=0 +}; + +struct BrigDirectiveLabel { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveLoc { + BrigBase base; + BrigDataOffsetString32_t filename; + uint32_t line; + uint32_t column; //.defValue=1 +}; + +struct BrigDirectiveNone { //.enum=BRIG_KIND_NONE + BrigBase base; +}; + +struct BrigDirectivePragma { + BrigBase base; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveVariable { + BrigBase base; + BrigDataOffsetString32_t name; + BrigOperandOffset32_t init; + BrigType16_t type; + + //+hcode bool isArray(); + //+implcode inline bool KLASS::isArray() { return isArrayType(type()); } + + //+hcode unsigned elementType(); + //+implcode inline unsigned KLASS::elementType() { return isArray()? arrayType2elementType(type()) : type(); } + + BrigSegment8_t segment; + BrigAlignment8_t align; + BrigUInt64 dim; //.acc=subItem<UInt64> //.wtype=UInt64 + BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier + BrigLinkage8_t linkage; + BrigAllocation8_t allocation; + uint8_t reserved; //.defValue=0 +}; + +struct BrigDirectiveModule { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVersion32_t hsailMajor; //.wtype=ValRef<uint32_t> + BrigVersion32_t hsailMinor; //.wtype=ValRef<uint32_t> + BrigProfile8_t profile; + BrigMachineModel8_t machineModel; + BrigRound8_t defaultFloatRound; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstBase { //.wname=Inst //.generic //.parent=BrigCode + BrigBase base; + BrigOpcode16_t opcode; + BrigType16_t type; + BrigDataOffsetOperandList32_t operands; + + //+hcode Operand operand(int index); + //+implcode inline Operand KLASS::operand(int index) { return operands()[index]; } +}; + +struct BrigInstAddr { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstAtomic { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t memoryScope; + BrigAtomicOperation8_t atomicOperation; + uint8_t equivClass; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstBasic { + BrigInstBase base; +}; + +struct BrigInstBr { + BrigInstBase base; + BrigWidth8_t width; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstCmp { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier + BrigCompareOperation8_t compare; + BrigPack8_t pack; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier + BrigRound8_t round; +}; + +struct BrigInstImage { + BrigInstBase base; + BrigType16_t imageType; + BrigType16_t coordType; + BrigImageGeometry8_t geometry; + uint8_t equivClass; + uint16_t reserved; //.defValue=0 +}; + +struct BrigInstLane { + BrigInstBase base; + BrigType16_t sourceType; + BrigWidth8_t width; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstMem { + BrigInstBase base; + BrigSegment8_t segment; + BrigAlignment8_t align; + uint8_t equivClass; + BrigWidth8_t width; + BrigMemoryModifier modifier; //.acc=subItem<MemoryModifier> //.wtype=MemoryModifier + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstMemFence { + BrigInstBase base; + 
BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t globalSegmentMemoryScope; + BrigMemoryScope8_t groupSegmentMemoryScope; + BrigMemoryScope8_t imageSegmentMemoryScope; +}; + +struct BrigInstMod { + BrigInstBase base; + BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier + BrigRound8_t round; + BrigPack8_t pack; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstQueryImage { + BrigInstBase base; + BrigType16_t imageType; + BrigImageGeometry8_t geometry; + BrigImageQuery8_t imageQuery; +}; + +struct BrigInstQuerySampler { + BrigInstBase base; + BrigSamplerQuery8_t samplerQuery; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstQueue { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + uint16_t reserved; //.defValue=0 +}; + +struct BrigInstSeg { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstSegCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigSegment8_t segment; + BrigSegCvtModifier modifier; //.acc=subItem<SegCvtModifier> //.wtype=SegCvtModifier +}; + +struct BrigInstSignal { + BrigInstBase base; + BrigType16_t signalType; + BrigMemoryOrder8_t memoryOrder; + BrigAtomicOperation8_t signalOperation; +}; + +struct BrigInstSourceType { + BrigInstBase base; + BrigType16_t sourceType; + uint16_t reserved; //.defValue=0 +}; + +struct BrigOperandAddress { + BrigBase base; + BrigCodeOffset32_t symbol; //.wtype=ItemRef<DirectiveVariable> + BrigOperandOffset32_t reg; //.wtype=ItemRef<OperandRegister> + BrigUInt64 offset; //.acc=subItem<UInt64> //.wtype=UInt64 +}; + +struct BrigOperandAlign { + BrigBase base; + BrigAlignment8_t align; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigOperandCodeList { + BrigBase base; + BrigDataOffsetCodeList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Code elements(int index); + //+implcode inline Code KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandCodeRef { + BrigBase base; + BrigCodeOffset32_t ref; +}; + +struct BrigOperandConstantBytes { + BrigBase base; + BrigType16_t type; //.defValue=0 + uint16_t reserved; //.defValue=0 + BrigDataOffsetString32_t bytes; +}; + +struct BrigOperandConstantOperandList { + BrigBase base; + BrigType16_t type; + uint16_t reserved; //.defValue=0 + BrigDataOffsetOperandList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Operand elements(int index); + //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandConstantImage { + BrigBase base; + BrigType16_t type; + BrigImageGeometry8_t geometry; + BrigImageChannelOrder8_t channelOrder; + BrigImageChannelType8_t channelType; + uint8_t reserved[3]; //.defValue=0 + BrigUInt64 width; //.acc=subItem<UInt64> //.wtype=UInt64 + BrigUInt64 height; //.acc=subItem<UInt64> //.wtype=UInt64 + BrigUInt64 depth; //.acc=subItem<UInt64> //.wtype=UInt64 + BrigUInt64 array; //.acc=subItem<UInt64> //.wtype=UInt64 +}; + +struct BrigOperandOperandList { + BrigBase base; + BrigDataOffsetOperandList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Operand elements(int index); + //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandRegister { + 
BrigBase base; + BrigRegisterKind16_t regKind; + uint16_t regNum; +}; + +struct BrigOperandConstantSampler { + BrigBase base; + BrigType16_t type; + BrigSamplerCoordNormalization8_t coord; + BrigSamplerFilter8_t filter; + BrigSamplerAddressing8_t addressing; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigOperandString { + BrigBase base; + BrigDataOffsetString32_t string; +}; + +struct BrigOperandWavesize { + BrigBase base; +}; + +//.ignore{ + +enum BrigExceptionsMask { + BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0, + BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1, + BRIG_EXCEPTIONS_OVERFLOW = 1 << 2, + BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3, + BRIG_EXCEPTIONS_INEXACT = 1 << 4, + + BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16 +}; + +struct BrigSectionHeader { + uint64_t byteCount; + uint32_t headerByteCount; + uint32_t nameLength; + uint8_t name[1]; +}; + +#define MODULE_IDENTIFICATION_LENGTH (8) + +struct BrigModuleHeader { + char identification[MODULE_IDENTIFICATION_LENGTH]; + BrigVersion32_t brigMajor; + BrigVersion32_t brigMinor; + uint64_t byteCount; + uint8_t hash[64]; + uint32_t reserved; + uint32_t sectionCount; + uint64_t sectionIndex; +}; + +typedef BrigModuleHeader* BrigModule_t; + +#endif // defined(INCLUDED_BRIG_H) +//} diff --git a/src/arch/hsail/SConscript b/src/arch/hsail/SConscript new file mode 100644 index 000000000..3455823a6 --- /dev/null +++ b/src/arch/hsail/SConscript @@ -0,0 +1,54 @@ +# -*- mode:python -*- + +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Anthony Gutierrez +# + +Import('*') + +if not env['BUILD_GPU']: + Return() + +if env['TARGET_GPU_ISA'] == 'hsail': + env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'], + 'gen.py', '$SOURCE $TARGETS') + + Source('generic_types.cc') + Source('gpu_decoder.cc') + Source('insts/branch.cc') + Source('insts/gen_exec.cc') + Source('insts/gpu_static_inst.cc') + Source('insts/main.cc') + Source('insts/pseudo_inst.cc') + Source('insts/mem.cc') + Source('operand.cc') diff --git a/src/arch/hsail/SConsopts b/src/arch/hsail/SConsopts new file mode 100644 index 000000000..641963c82 --- /dev/null +++ b/src/arch/hsail/SConsopts @@ -0,0 +1,40 @@ +# -*- mode:python -*- + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Anthony Gutierrez +# + +Import('*') + +all_gpu_isa_list.append('hsail') diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py new file mode 100755 index 000000000..f2996019b --- /dev/null +++ b/src/arch/hsail/gen.py @@ -0,0 +1,806 @@ +#! /usr/bin/python + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Steve Reinhardt +# + +import sys, re + +from m5.util import code_formatter + +if len(sys.argv) != 4: + print "Error: need 3 args (file names)" + sys.exit(0) + +header_code = code_formatter() +decoder_code = code_formatter() +exec_code = code_formatter() + +############### +# +# Generate file prologs (includes etc.) +# +############### + +header_code(''' +#include "arch/hsail/insts/decl.hh" +#include "base/bitfield.hh" +#include "gpu-compute/hsail_code.hh" +#include "gpu-compute/wavefront.hh" + +namespace HsailISA +{ +''') +header_code.indent() + +decoder_code(''' +#include "arch/hsail/gpu_decoder.hh" +#include "arch/hsail/insts/branch.hh" +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/gen_decl.hh" +#include "arch/hsail/insts/mem.hh" +#include "arch/hsail/insts/mem_impl.hh" +#include "gpu-compute/brig_object.hh" + +namespace HsailISA +{ + std::vector<GPUStaticInst*> Decoder::decodedInsts; + + GPUStaticInst* + Decoder::decode(MachInst machInst) + { + using namespace Brig; + + const BrigInstBase *ib = machInst.brigInstBase; + const BrigObject *obj = machInst.brigObj; + + switch(ib->opcode) { +''') +decoder_code.indent() +decoder_code.indent() + +exec_code(''' +#include "arch/hsail/insts/gen_decl.hh" +#include "base/intmath.hh" + +namespace HsailISA +{ +''') +exec_code.indent() + +############### +# +# Define code templates for class declarations (for header file) +# +############### + +# Basic header template for an instruction with no template parameters. +header_template_nodt = ''' +class $class_name : public $base_class +{ + public: + typedef $base_class Base; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +# Basic header template for an instruction with a single DataType +# template parameter. +header_template_1dt = ''' +template<typename DataType> +class $class_name : public $base_class<DataType> +{ + public: + typedef $base_class<DataType> Base; + typedef typename DataType::CType CType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +header_template_1dt_noexec = ''' +template<typename DataType> +class $class_name : public $base_class<DataType> +{ + public: + typedef $base_class<DataType> Base; + typedef typename DataType::CType CType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } +}; + +''' + +# Same as header_template_1dt, except the base class has a second +# template parameter NumSrcOperands to allow a variable number of +# source operands. 
Note that since this is implemented with an array, +# it only works for instructions where all sources are of the same +# type (like most arithmetics). +header_template_1dt_varsrcs = ''' +template<typename DataType> +class $class_name : public $base_class<DataType, $num_srcs> +{ + public: + typedef $base_class<DataType, $num_srcs> Base; + typedef typename DataType::CType CType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +# Header template for instruction with two DataType template +# parameters, one for the dest and one for the source. This is used +# by compare and convert. +header_template_2dt = ''' +template<typename DestDataType, class SrcDataType> +class $class_name : public $base_class<DestDataType, SrcDataType> +{ + public: + typedef $base_class<DestDataType, SrcDataType> Base; + typedef typename DestDataType::CType DestCType; + typedef typename SrcDataType::CType SrcCType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +header_templates = { + 'ArithInst': header_template_1dt_varsrcs, + 'CmovInst': header_template_1dt, + 'ClassInst': header_template_1dt, + 'ShiftInst': header_template_1dt, + 'ExtractInsertInst': header_template_1dt, + 'CmpInst': header_template_2dt, + 'CvtInst': header_template_2dt, + 'LdInst': '', + 'StInst': '', + 'SpecialInstNoSrc': header_template_nodt, + 'SpecialInst1Src': header_template_nodt, + 'SpecialInstNoSrcNoDest': '', +} + +############### +# +# Define code templates for exec functions +# +############### + +# exec function body +exec_template_nodt_nosrc = ''' +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef Base::DestCType DestCType; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + DestCType dest_val = $expr; + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_nodt_1src = ''' +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef Base::DestCType DestCType; + typedef Base::SrcCType SrcCType; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + SrcCType src_val0 = this->src0.get<SrcCType>(w, lane); + DestCType dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_1dt_varsrcs = ''' +template<typename DataType> +void +$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType dest_val; + if ($dest_is_src_flag) { + dest_val = this->dest.template get<CType>(w, lane); + } + + CType src_val[$num_srcs]; + + for (int i = 0; i < $num_srcs; ++i) { + src_val[i] = this->src[i].template get<CType>(w, lane); + } + + dest_val = (CType)($expr); + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_1dt_3srcs = ''' +template<typename DataType> +void +$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename Base::Src0CType Src0T; + typedef typename Base::Src1CType Src1T; + typedef typename Base::Src2CType Src2T; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if 
(mask[lane]) { + CType dest_val; + + if ($dest_is_src_flag) { + dest_val = this->dest.template get<CType>(w, lane); + } + + Src0T src_val0 = this->src0.template get<Src0T>(w, lane); + Src1T src_val1 = this->src1.template get<Src1T>(w, lane); + Src2T src_val2 = this->src2.template get<Src2T>(w, lane); + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_1dt_2src_1dest = ''' +template<typename DataType> +void +$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename Base::DestCType DestT; + typedef CType Src0T; + typedef typename Base::Src1CType Src1T; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + DestT dest_val; + if ($dest_is_src_flag) { + dest_val = this->dest.template get<DestT>(w, lane); + } + Src0T src_val0 = this->src0.template get<Src0T>(w, lane); + Src1T src_val1 = this->src1.template get<Src1T>(w, lane); + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_shift = ''' +template<typename DataType> +void +$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType dest_val; + + if ($dest_is_src_flag) { + dest_val = this->dest.template get<CType>(w, lane); + } + + CType src_val0 = this->src0.template get<CType>(w, lane); + uint32_t src_val1 = this->src1.template get<uint32_t>(w, lane); + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_2dt = ''' +template<typename DestDataType, class SrcDataType> +void +$class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + DestCType dest_val; + SrcCType src_val[$num_srcs]; + + for (int i = 0; i < $num_srcs; ++i) { + src_val[i] = this->src[i].template get<SrcCType>(w, lane); + } + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_templates = { + 'ArithInst': exec_template_1dt_varsrcs, + 'CmovInst': exec_template_1dt_3srcs, + 'ExtractInsertInst': exec_template_1dt_3srcs, + 'ClassInst': exec_template_1dt_2src_1dest, + 'CmpInst': exec_template_2dt, + 'CvtInst': exec_template_2dt, + 'LdInst': '', + 'StInst': '', + 'SpecialInstNoSrc': exec_template_nodt_nosrc, + 'SpecialInst1Src': exec_template_nodt_1src, + 'SpecialInstNoSrcNoDest': '', +} + +############### +# +# Define code templates for the decoder cases +# +############### + +# decode template for nodt-opcode case +decode_nodt_template = ''' + case BRIG_OPCODE_$brig_opcode_upper: return $constructor(ib, obj);''' + +decode_case_prolog_class_inst = ''' + case BRIG_OPCODE_$brig_opcode_upper: + { + //const BrigOperandBase *baseOp = obj->getOperand(ib->operands[1]); + BrigType16_t type = ((BrigInstSourceType*)ib)->sourceType; + //switch (baseOp->kind) { + // case BRIG_OPERAND_REG: + // type = ((const BrigOperandReg*)baseOp)->type; + // break; + // case BRIG_OPERAND_IMMED: + // type = ((const BrigOperandImmed*)baseOp)->type; + // break; + // default: + // fatal("CLASS unrecognized kind of operand %d\\n", + // baseOp->kind); + //} + switch (type) {''' + +# common prolog for 1dt- or 2dt-opcode case: switch on data type +decode_case_prolog = ''' + case BRIG_OPCODE_$brig_opcode_upper: + { + 
switch (ib->type) {''' + +# single-level decode case entry (for 1dt opcodes) +decode_case_entry = \ +' case BRIG_TYPE_$type_name: return $constructor(ib, obj);' + +decode_store_prolog = \ +' case BRIG_TYPE_$type_name: {' + +decode_store_case_epilog = ''' + }''' + +decode_store_case_entry = \ +' return $constructor(ib, obj);' + +# common epilog for type switch +decode_case_epilog = ''' + default: fatal("$brig_opcode_upper: unrecognized type %d\\n", + ib->type); + } + } + break;''' + +# Additional templates for nested decode on a second type field (for +# compare and convert). These are used in place of the +# decode_case_entry template to create a second-level switch on on the +# second type field inside each case of the first-level type switch. +# Because the name and location of the second type can vary, the Brig +# instruction type must be provided in $brig_type, and the name of the +# second type field must be provided in $type_field. +decode_case2_prolog = ''' + case BRIG_TYPE_$type_name: + switch (((Brig$brig_type*)ib)->$type2_field) {''' + +decode_case2_entry = \ +' case BRIG_TYPE_$type2_name: return $constructor(ib, obj);' + +decode_case2_epilog = ''' + default: fatal("$brig_opcode_upper: unrecognized $type2_field %d\\n", + ((Brig$brig_type*)ib)->$type2_field); + } + break;''' + +# Figure out how many source operands an expr needs by looking for the +# highest-numbered srcN value referenced. Since sources are numbered +# starting at 0, the return value is N+1. +def num_src_operands(expr): + if expr.find('src2') != -1: + return 3 + elif expr.find('src1') != -1: + return 2 + elif expr.find('src0') != -1: + return 1 + else: + return 0 + +############### +# +# Define final code generation methods +# +# The gen_nodt, and gen_1dt, and gen_2dt methods are the interface for +# generating actual instructions. +# +############### + +# Generate class declaration, exec function, and decode switch case +# for an brig_opcode with a single-level type switch. The 'types' +# parameter is a list or tuple of types for which the instruction +# should be instantiated. +def gen(brig_opcode, types=None, expr=None, base_class='ArithInst', + type2_info=None, constructor_prefix='new ', is_store=False): + brig_opcode_upper = brig_opcode.upper() + class_name = brig_opcode + opcode = class_name.lower() + + if base_class == 'ArithInst': + # note that expr must be provided with ArithInst so we can + # derive num_srcs for the template + assert expr + + if expr: + # Derive several bits of info from expr. If expr is not used, + # this info will be irrelevant. 
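For orientation, here is a standalone sketch (editorial illustration, not part of gen.py) of the expr rewriting performed by the lines below: ArithInst/CmpInst/CvtInst expressions are rewritten to index a src_val[] array, ShiftInst and the default path keep scalar names, and any reference to dest becomes dest_val.

import re

def rewrite_expr(expr, indexed):
    # indexed mirrors the ArithInst/CmpInst/CvtInst path below; the
    # ShiftInst and default paths keep scalar names (src_val0, src_val1, ...)
    repl = r'src_val[\1]' if indexed else r'src_val\1'
    expr = re.sub(r'\bsrc(\d)\b', repl, expr)
    return re.sub(r'\bdest\b', 'dest_val', expr)

assert rewrite_expr('src0 * src1 + src2', True) == \
    'src_val[0] * src_val[1] + src_val[2]'
assert rewrite_expr('src0 << (unsigned)src1', False) == \
    'src_val0 << (unsigned)src_val1'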
+ num_srcs = num_src_operands(expr) + # if the RHS expression includes 'dest', then we're doing an RMW + # on the reg and we need to treat it like a source + dest_is_src = expr.find('dest') != -1 + dest_is_src_flag = str(dest_is_src).lower() # for C++ + if base_class in ['ShiftInst']: + expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr) + elif base_class in ['ArithInst', 'CmpInst', 'CvtInst']: + expr = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr) + else: + expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr) + expr = re.sub(r'\bdest\b', r'dest_val', expr) + + # Strip template arguments off of base class before looking up + # appropriate templates + base_class_base = re.sub(r'<.*>$', '', base_class) + header_code(header_templates[base_class_base]) + + if base_class.startswith('SpecialInst'): + exec_code(exec_templates[base_class_base]) + elif base_class.startswith('ShiftInst'): + header_code(exec_template_shift) + else: + header_code(exec_templates[base_class_base]) + + if not types or isinstance(types, str): + # Just a single type + constructor = constructor_prefix + class_name + decoder_code(decode_nodt_template) + else: + # multiple types, need at least one level of decode + if brig_opcode == 'Class': + decoder_code(decode_case_prolog_class_inst) + else: + decoder_code(decode_case_prolog) + if not type2_info: + if is_store == False: + # single list of types, to basic one-level decode + for type_name in types: + full_class_name = '%s<%s>' % (class_name, type_name.upper()) + constructor = constructor_prefix + full_class_name + decoder_code(decode_case_entry) + else: + # single list of types, to basic one-level decode + for type_name in types: + decoder_code(decode_store_prolog) + type_size = int(re.findall(r'[0-9]+', type_name)[0]) + src_size = 32 + type_type = type_name[0] + full_class_name = '%s<%s,%s>' % (class_name, \ + type_name.upper(), \ + '%s%d' % \ + (type_type.upper(), \ + type_size)) + constructor = constructor_prefix + full_class_name + decoder_code(decode_store_case_entry) + decoder_code(decode_store_case_epilog) + else: + # need secondary type switch (convert, compare) + # unpack extra info on second switch + (type2_field, types2) = type2_info + brig_type = 'Inst%s' % brig_opcode + for type_name in types: + decoder_code(decode_case2_prolog) + fmt = '%s<%s,%%s>' % (class_name, type_name.upper()) + for type2_name in types2: + full_class_name = fmt % type2_name.upper() + constructor = constructor_prefix + full_class_name + decoder_code(decode_case2_entry) + + decoder_code(decode_case2_epilog) + + decoder_code(decode_case_epilog) + +############### +# +# Generate instructions +# +############### + +# handy abbreviations for common sets of types + +# arithmetic ops are typically defined only on 32- and 64-bit sizes +arith_int_types = ('S32', 'U32', 'S64', 'U64') +arith_float_types = ('F32', 'F64') +arith_types = arith_int_types + arith_float_types + +bit_types = ('B1', 'B32', 'B64') + +all_int_types = ('S8', 'U8', 'S16', 'U16') + arith_int_types + +# I think you might be able to do 'f16' memory ops too, but we'll +# ignore them for now. 
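To make the type fan-out concrete, the following rough, self-contained sketch (using string.Template as a stand-in for m5's code_formatter, whose $-substitution it only approximates) shows the decoder cases that a call such as gen('Add', arith_types, 'src0 + src1') in the section that follows expands into, one case per entry in the type tuple.

from string import Template

decode_case = Template(
    '    case BRIG_TYPE_$type: return new $cls<$type>(ib, obj);')

# one decode case per type in arith_types
for t in ('S32', 'U32', 'S64', 'U64', 'F32', 'F64'):
    print(decode_case.substitute(type=t, cls='Add'))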
+mem_types = all_int_types + arith_float_types +mem_atom_types = all_int_types + ('B32', 'B64') + +##### Arithmetic & logical operations +gen('Add', arith_types, 'src0 + src1') +gen('Sub', arith_types, 'src0 - src1') +gen('Mul', arith_types, 'src0 * src1') +gen('Div', arith_types, 'src0 / src1') +gen('Min', arith_types, 'std::min(src0, src1)') +gen('Max', arith_types, 'std::max(src0, src1)') +gen('Gcnmin', arith_types, 'std::min(src0, src1)') + +gen('CopySign', arith_float_types, + 'src1 < 0 ? -std::abs(src0) : std::abs(src0)') +gen('Sqrt', arith_float_types, 'sqrt(src0)') +gen('Floor', arith_float_types, 'floor(src0)') + +# "fast" sqrt... same as slow for us +gen('Nsqrt', arith_float_types, 'sqrt(src0)') +gen('Nrsqrt', arith_float_types, '1.0/sqrt(src0)') +gen('Nrcp', arith_float_types, '1.0/src0') +gen('Fract', arith_float_types, + '(src0 >= 0.0)?(src0-floor(src0)):(floor(src0)-src0)') + +gen('Ncos', arith_float_types, 'cos(src0)'); +gen('Nsin', arith_float_types, 'sin(src0)'); + +gen('And', bit_types, 'src0 & src1') +gen('Or', bit_types, 'src0 | src1') +gen('Xor', bit_types, 'src0 ^ src1') + +gen('Bitselect', bit_types, '(src1 & src0) | (src2 & ~src0)') +gen('Firstbit',bit_types, 'firstbit(src0)') +gen('Popcount', ('B32', 'B64'), '__builtin_popcount(src0)') + +gen('Shl', arith_int_types, 'src0 << (unsigned)src1', 'ShiftInst') +gen('Shr', arith_int_types, 'src0 >> (unsigned)src1', 'ShiftInst') + +# gen('Mul_hi', types=('s32','u32', '??')) +# gen('Mul24', types=('s32','u32', '??')) +gen('Rem', arith_int_types, 'src0 - ((src0 / src1) * src1)') + +gen('Abs', arith_types, 'std::abs(src0)') +gen('Neg', arith_types, '-src0') + +gen('Mov', bit_types, 'src0') +gen('Not', bit_types, 'heynot(src0)') + +# mad and fma differ only in rounding behavior, which we don't emulate +# also there's an integer form of mad, but not of fma +gen('Mad', arith_types, 'src0 * src1 + src2') +gen('Fma', arith_float_types, 'src0 * src1 + src2') + +#native floating point operations +gen('Nfma', arith_float_types, 'src0 * src1 + src2') + +gen('Cmov', bit_types, 'src0 ? 
src1 : src2', 'CmovInst') +gen('BitAlign', bit_types, '(src0 << src2)|(src1 >> (32 - src2))') +gen('ByteAlign', bit_types, '(src0 << 8 * src2)|(src1 >> (32 - 8 * src2))') + +# see base/bitfield.hh +gen('BitExtract', arith_int_types, 'bits(src0, src1, src1 + src2 - 1)', + 'ExtractInsertInst') + +gen('BitInsert', arith_int_types, 'insertBits(dest, src1, src2, src0)', + 'ExtractInsertInst') + +##### Compare +gen('Cmp', ('B1', 'S32', 'U32', 'F32'), 'compare(src0, src1, this->cmpOp)', + 'CmpInst', ('sourceType', arith_types + bit_types)) +gen('Class', arith_float_types, 'fpclassify(src0,src1)','ClassInst') + +##### Conversion + +# Conversion operations are only defined on B1, not B32 or B64 +cvt_types = ('B1',) + mem_types + +gen('Cvt', cvt_types, 'src0', 'CvtInst', ('sourceType', cvt_types)) + + +##### Load & Store +gen('Lda', mem_types, base_class = 'LdInst', constructor_prefix='decode') +gen('Ld', mem_types, base_class = 'LdInst', constructor_prefix='decode') +gen('St', mem_types, base_class = 'StInst', constructor_prefix='decode', + is_store=True) +gen('Atomic', mem_atom_types, base_class='StInst', constructor_prefix='decode') +gen('AtomicNoRet', mem_atom_types, base_class='StInst', + constructor_prefix='decode') + +gen('Cbr', base_class = 'LdInst', constructor_prefix='decode') +gen('Br', base_class = 'LdInst', constructor_prefix='decode') + +##### Special operations +def gen_special(brig_opcode, expr, dest_type='U32'): + num_srcs = num_src_operands(expr) + if num_srcs == 0: + base_class = 'SpecialInstNoSrc<%s>' % dest_type + elif num_srcs == 1: + base_class = 'SpecialInst1Src<%s>' % dest_type + else: + assert false + + gen(brig_opcode, None, expr, base_class) + +gen_special('WorkItemId', 'w->workitemid[src0][lane]') +gen_special('WorkItemAbsId', + 'w->workitemid[src0][lane] + (w->workgroupid[src0] * w->workgroupsz[src0])') +gen_special('WorkGroupId', 'w->workgroupid[src0]') +gen_special('WorkGroupSize', 'w->workgroupsz[src0]') +gen_special('CurrentWorkGroupSize', 'w->workgroupsz[src0]') +gen_special('GridSize', 'w->gridsz[src0]') +gen_special('GridGroups', + 'divCeil(w->gridsz[src0],w->workgroupsz[src0])') +gen_special('LaneId', 'lane') +gen_special('WaveId', 'w->dynwaveid') +gen_special('Clock', 'w->computeUnit->shader->tick_cnt', 'U64') + +# gen_special('CU'', ') + +gen('Ret', base_class='SpecialInstNoSrcNoDest') +gen('Barrier', base_class='SpecialInstNoSrcNoDest') +gen('MemFence', base_class='SpecialInstNoSrcNoDest') + +# Map magic instructions to the BrigSyscall opcode +# Magic instructions are defined in magic.hh +# +# In the future, real HSA kernel system calls can be implemented and coexist +# with magic instructions. 
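As a quick illustration of the dispatch gen_special() performs above, this throwaway snippet (with a simplified stand-in for num_src_operands) shows the base class chosen for two of the expressions just listed: an expression with no srcN reference gets the no-source variant, while one referencing src0 gets the one-source variant.

for opcode, expr in [('LaneId', 'lane'),
                     ('WorkGroupId', 'w->workgroupid[src0]')]:
    n_srcs = 1 if 'src0' in expr else 0       # simplified num_src_operands()
    base = 'SpecialInst1Src<U32>' if n_srcs else 'SpecialInstNoSrc<U32>'
    print('%s -> %s' % (opcode, base))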
+gen('Call', base_class='SpecialInstNoSrcNoDest') + +############### +# +# Generate file epilogs +# +############### +header_code.dedent() +header_code(''' +} // namespace HsailISA +''') + +# close off main decode switch +decoder_code.dedent() +decoder_code.dedent() +decoder_code(''' + default: fatal("unrecognized Brig opcode %d\\n", ib->opcode); + } // end switch(ib->opcode) + } // end decode() +} // namespace HsailISA +''') + +exec_code.dedent() +exec_code(''' +} // namespace HsailISA +''') + +############### +# +# Output accumulated code to files +# +############### +header_code.write(sys.argv[1]) +decoder_code.write(sys.argv[2]) +exec_code.write(sys.argv[3]) diff --git a/src/arch/hsail/generic_types.cc b/src/arch/hsail/generic_types.cc new file mode 100644 index 000000000..0cd55d1d5 --- /dev/null +++ b/src/arch/hsail/generic_types.cc @@ -0,0 +1,47 @@ +#include "arch/hsail/generic_types.hh" +#include "base/misc.hh" + +using namespace Brig; + +namespace HsailISA +{ + Enums::GenericMemoryOrder + getGenericMemoryOrder(BrigMemoryOrder brig_memory_order) + { + switch(brig_memory_order) { + case BRIG_MEMORY_ORDER_NONE: + return Enums::MEMORY_ORDER_NONE; + case BRIG_MEMORY_ORDER_RELAXED: + return Enums::MEMORY_ORDER_RELAXED; + case BRIG_MEMORY_ORDER_SC_ACQUIRE: + return Enums::MEMORY_ORDER_SC_ACQUIRE; + case BRIG_MEMORY_ORDER_SC_RELEASE: + return Enums::MEMORY_ORDER_SC_RELEASE; + case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE; + default: + fatal("HsailISA::MemInst::getGenericMemoryOrder -> ", + "bad BrigMemoryOrder\n"); + } + } + + Enums::GenericMemoryScope + getGenericMemoryScope(BrigMemoryScope brig_memory_scope) + { + switch(brig_memory_scope) { + case BRIG_MEMORY_SCOPE_NONE: + return Enums::MEMORY_SCOPE_NONE; + case BRIG_MEMORY_SCOPE_WORKITEM: + return Enums::MEMORY_SCOPE_WORKITEM; + case BRIG_MEMORY_SCOPE_WORKGROUP: + return Enums::MEMORY_SCOPE_WORKGROUP; + case BRIG_MEMORY_SCOPE_AGENT: + return Enums::MEMORY_SCOPE_DEVICE; + case BRIG_MEMORY_SCOPE_SYSTEM: + return Enums::MEMORY_SCOPE_SYSTEM; + default: + fatal("HsailISA::MemInst::getGenericMemoryScope -> ", + "bad BrigMemoryScope\n"); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/generic_types.hh b/src/arch/hsail/generic_types.hh new file mode 100644 index 000000000..50e430bef --- /dev/null +++ b/src/arch/hsail/generic_types.hh @@ -0,0 +1,16 @@ +#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__ +#define __ARCH_HSAIL_GENERIC_TYPES_HH__ + +#include "arch/hsail/Brig.h" +#include "enums/GenericMemoryOrder.hh" +#include "enums/GenericMemoryScope.hh" + +namespace HsailISA +{ + Enums::GenericMemoryOrder + getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order); + Enums::GenericMemoryScope + getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope); +} // namespace HsailISA + +#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__ diff --git a/src/arch/hsail/gpu_decoder.hh b/src/arch/hsail/gpu_decoder.hh new file mode 100644 index 000000000..98a689664 --- /dev/null +++ b/src/arch/hsail/gpu_decoder.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __ARCH_HSAIL_GPU_DECODER_HH__ +#define __ARCH_HSAIL_GPU_DECODER_HH__ + +#include <vector> + +#include "arch/hsail/gpu_types.hh" + +class BrigObject; +class GPUStaticInst; + +namespace Brig +{ + class BrigInstBase; +} + +namespace HsailISA +{ + class Decoder + { + public: + GPUStaticInst* decode(MachInst machInst); + + GPUStaticInst* + decode(RawMachInst inst) + { + return inst < decodedInsts.size() ? decodedInsts.at(inst) : nullptr; + } + + RawMachInst + saveInst(GPUStaticInst *decodedInst) + { + decodedInsts.push_back(decodedInst); + + return decodedInsts.size() - 1; + } + + private: + static std::vector<GPUStaticInst*> decodedInsts; + }; +} // namespace HsailISA + +#endif // __ARCH_HSAIL_GPU_DECODER_HH__ diff --git a/src/arch/hsail/gpu_types.hh b/src/arch/hsail/gpu_types.hh new file mode 100644 index 000000000..4b3a66a9a --- /dev/null +++ b/src/arch/hsail/gpu_types.hh @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __ARCH_HSAIL_GPU_TYPES_HH__ +#define __ARCH_HSAIL_GPU_TYPES_HH__ + +#include <cstdint> + +namespace Brig +{ + class BrigInstBase; +} + +class BrigObject; + +namespace HsailISA +{ + // A raw machine instruction represents the raw bits that + // our model uses to represent an actual instruction. In + // the case of HSAIL this is just an index into a list of + // instruction objects. + typedef uint64_t RawMachInst; + + // The MachInst is a representation of an instruction + // that has more information than just the machine code. + // For HSAIL the actual machine code is a BrigInstBase + // and the BrigObject contains more pertinent + // information related to operaands, etc. + + struct MachInst + { + const Brig::BrigInstBase *brigInstBase; + const BrigObject *brigObj; + }; +} + +#endif // __ARCH_HSAIL_GPU_TYPES_HH__ diff --git a/src/arch/hsail/insts/branch.cc b/src/arch/hsail/insts/branch.cc new file mode 100644 index 000000000..d65279cc8 --- /dev/null +++ b/src/arch/hsail/insts/branch.cc @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#include "arch/hsail/insts/branch.hh" + +#include "gpu-compute/hsail_code.hh" + +namespace HsailISA +{ + GPUStaticInst* + decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // register operand. + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new BrnIndirectInst(ib, obj); + } else { + return new BrnDirectInst(ib, obj); + } + } + + GPUStaticInst* + decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // second register operand (after the condition). + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new CbrIndirectInst(ib, obj); + } else { + return new CbrDirectInst(ib, obj); + } + } + + GPUStaticInst* + decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // second register operand (after the condition). + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new BrIndirectInst(ib, obj); + } else { + return new BrDirectInst(ib, obj); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh new file mode 100644 index 000000000..54ad9a042 --- /dev/null +++ b/src/arch/hsail/insts/branch.hh @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__ +#define __ARCH_HSAIL_INSTS_BRANCH_HH__ + +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/wavefront.hh" + +namespace HsailISA +{ + + // The main difference between a direct branch and an indirect branch + // is whether the target is a register or a label, so we can share a + // lot of code if we template the base implementation on that type. + template<typename TargetType> + class BrnInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + Brig::BrigWidth8_t width; + TargetType target; + + BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "brn") + { + o_type = Enums::OT_BRANCH; + width = ((Brig::BrigInstBr*)ib)->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + bool unconditionalJumpInstruction() override { return true; } + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isScalarRegister(); + } + + bool isSrcOperand(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return true; + } + + bool isDstOperand(int operandIndex) { + return false; + } + + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.opSize(); + } + + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.regIndex(); + } + + int getNumOperands() { + return 1; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename TargetType> + void + BrnInstBase<TargetType>::generateDisassembly() + { + std::string widthClause; + + if (width != 1) { + widthClause = csprintf("_width(%d)", width); + } + + disassembly = csprintf("%s%s %s", opcode, widthClause, + target.disassemble()); + } + + template<typename TargetType> + void + BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + if (getTargetPc() == w->rpc()) { + w->popFromReconvergenceStack(); + } else { + // Rpc and execution mask remain the same + w->pc(getTargetPc()); + } + w->discardFetch(); + } + + class BrnDirectInst : public BrnInstBase<LabelOperand> + { + public: + BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrnInstBase<LabelOperand>(ib, obj) + { + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class BrnIndirectInst : public BrnInstBase<SRegOperand> + { + public: + BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrnInstBase<SRegOperand>(ib, obj) + { + } + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib, + const BrigObject *obj); + + template<typename TargetType> + class CbrInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + Brig::BrigWidth8_t width; + 
CRegOperand cond; + TargetType target; + + CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "cbr") + { + o_type = Enums::OT_BRANCH; + width = ((Brig::BrigInstBr *)ib)->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + cond.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + void execute(GPUDynInstPtr gpuDynInst); + // Assumption: Target is operand 0, Condition Register is operand 1 + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.isVectorRegister(); + else + return false; + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.isCondRegister(); + else + return true; + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return target.isScalarRegister(); + else + return false; + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == 0) + return true; + return false; + } + // both Condition Register and Target are source operands + bool isDstOperand(int operandIndex) { + return false; + } + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.opSize(); + else + return 1; + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.regIndex(); + else + return -1; + } + + // Operands = Target, Condition Register + int getNumOperands() { + return 2; + } + }; + + template<typename TargetType> + void + CbrInstBase<TargetType>::generateDisassembly() + { + std::string widthClause; + + if (width != 1) { + widthClause = csprintf("_width(%d)", width); + } + + disassembly = csprintf("%s%s %s,%s", opcode, widthClause, + cond.disassemble(), target.disassemble()); + } + + template<typename TargetType> + void + CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + const uint32_t curr_pc = w->pc(); + const uint32_t curr_rpc = w->rpc(); + const VectorMask curr_mask = w->execMask(); + + /** + * TODO: can we move this pop outside the instruction, and + * into the wavefront? 
+ */ + w->popFromReconvergenceStack(); + + // immediate post-dominator instruction + const uint32_t rpc = static_cast<uint32_t>(ipdInstNum()); + if (curr_rpc != rpc) { + w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask); + } + + // taken branch + const uint32_t true_pc = getTargetPc(); + VectorMask true_mask; + for (unsigned int lane = 0; lane < VSZ; ++lane) { + true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane]; + } + + // not taken branch + const uint32_t false_pc = curr_pc + 1; + assert(true_pc != false_pc); + if (false_pc != rpc && true_mask.count() < curr_mask.count()) { + VectorMask false_mask = curr_mask & ~true_mask; + w->pushToReconvergenceStack(false_pc, rpc, false_mask); + } + + if (true_pc != rpc && true_mask.count()) { + w->pushToReconvergenceStack(true_pc, rpc, true_mask); + } + assert(w->pc() != curr_pc); + w->discardFetch(); + } + + + class CbrDirectInst : public CbrInstBase<LabelOperand> + { + public: + CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : CbrInstBase<LabelOperand>(ib, obj) + { + } + // the source operand of a conditional branch is a Condition + // Register which is not stored in the VRF + // so we do not count it as a source-register operand + // even though, formally, it is one. + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class CbrIndirectInst : public CbrInstBase<SRegOperand> + { + public: + CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : CbrInstBase<SRegOperand>(ib, obj) + { + } + // one source operand of the conditional indirect branch is a Condition + // register which is not stored in the VRF so we do not count it + // as a source-register operand even though, formally, it is one. + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib, + const BrigObject *obj); + + template<typename TargetType> + class BrInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + ImmOperand<uint32_t> width; + TargetType target; + + BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "br") + { + o_type = Enums::OT_BRANCH; + width.init(((Brig::BrigInstBr *)ib)->width, obj); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + bool unconditionalJumpInstruction() override { return true; } + + void execute(GPUDynInstPtr gpuDynInst); + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.regIndex(); + } + int getNumOperands() { return 1; } + }; + + 
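The divergence handling in CbrInstBase::execute() above is easiest to follow on a concrete stack. The toy Python model below (editorial illustration only, with made-up pc values and a plain list standing in for the wavefront's reconvergence stack) pushes the reconvergence point, then the not-taken path, then the taken path, so the taken path executes first and both paths later rejoin at the immediate post-dominator.

def exec_cbr(stack, cond_mask, target_pc, ipd):
    # stack entries are (pc, rpc, mask); the top of the list runs next
    curr_pc, curr_rpc, curr_mask = stack.pop()

    if curr_rpc != ipd:                      # remember where to reconverge
        stack.append((ipd, curr_rpc, curr_mask))

    true_mask = [c and m for c, m in zip(cond_mask, curr_mask)]
    false_mask = [m and not c for c, m in zip(cond_mask, curr_mask)]
    false_pc = curr_pc + 1

    if false_pc != ipd and sum(true_mask) < sum(curr_mask):
        stack.append((false_pc, ipd, false_mask))    # fall-through path
    if target_pc != ipd and sum(true_mask):
        stack.append((target_pc, ipd, true_mask))    # taken path, on top

    return stack

# four lanes at pc 4, two of them taking a branch to pc 7, post-dominator pc 9
print(exec_cbr([(4, 12, [True] * 4)],
               cond_mask=[True, False, True, False], target_pc=7, ipd=9))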
template<typename TargetType> + void + BrInstBase<TargetType>::generateDisassembly() + { + std::string widthClause; + + if (width.bits != 1) { + widthClause = csprintf("_width(%d)", width.bits); + } + + disassembly = csprintf("%s%s %s", opcode, widthClause, + target.disassemble()); + } + + template<typename TargetType> + void + BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + if (getTargetPc() == w->rpc()) { + w->popFromReconvergenceStack(); + } else { + // Rpc and execution mask remain the same + w->pc(getTargetPc()); + } + w->discardFetch(); + } + + class BrDirectInst : public BrInstBase<LabelOperand> + { + public: + BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrInstBase<LabelOperand>(ib, obj) + { + } + + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class BrIndirectInst : public BrInstBase<SRegOperand> + { + public: + BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrInstBase<SRegOperand>(ib, obj) + { + } + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib, + const BrigObject *obj); +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__ diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh new file mode 100644 index 000000000..e2da501b9 --- /dev/null +++ b/src/arch/hsail/insts/decl.hh @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_DECL_HH__ +#define __ARCH_HSAIL_INSTS_DECL_HH__ + +#include <cmath> + +#include "arch/hsail/generic_types.hh" +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" +#include "debug/HSAIL.hh" +#include "enums/OpType.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +namespace HsailISA +{ + template<typename _DestOperand, typename _SrcOperand> + class HsailOperandType + { + public: + typedef _DestOperand DestOperand; + typedef _SrcOperand SrcOperand; + }; + + typedef HsailOperandType<CRegOperand, CRegOrImmOperand> CRegOperandType; + typedef HsailOperandType<SRegOperand, SRegOrImmOperand> SRegOperandType; + typedef HsailOperandType<DRegOperand, DRegOrImmOperand> DRegOperandType; + + // The IsBits parameter serves only to disambiguate tbhe B* types from + // the U* types, which otherwise would be identical (and + // indistinguishable). + template<typename _OperandType, typename _CType, Enums::MemType _memType, + vgpr_type _vgprType, int IsBits=0> + class HsailDataType + { + public: + typedef _OperandType OperandType; + typedef _CType CType; + static const Enums::MemType memType = _memType; + static const vgpr_type vgprType = _vgprType; + static const char *label; + }; + + typedef HsailDataType<CRegOperandType, bool, Enums::M_U8, VT_32, 1> B1; + typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32, 1> B8; + + typedef HsailDataType<SRegOperandType, uint16_t, + Enums::M_U16, VT_32, 1> B16; + + typedef HsailDataType<SRegOperandType, uint32_t, + Enums::M_U32, VT_32, 1> B32; + + typedef HsailDataType<DRegOperandType, uint64_t, + Enums::M_U64, VT_64, 1> B64; + + typedef HsailDataType<SRegOperandType, int8_t, Enums::M_S8, VT_32> S8; + typedef HsailDataType<SRegOperandType, int16_t, Enums::M_S16, VT_32> S16; + typedef HsailDataType<SRegOperandType, int32_t, Enums::M_S32, VT_32> S32; + typedef HsailDataType<DRegOperandType, int64_t, Enums::M_S64, VT_64> S64; + + typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32> U8; + typedef HsailDataType<SRegOperandType, uint16_t, Enums::M_U16, VT_32> U16; + typedef HsailDataType<SRegOperandType, uint32_t, Enums::M_U32, VT_32> U32; + typedef HsailDataType<DRegOperandType, uint64_t, Enums::M_U64, VT_64> U64; + + typedef HsailDataType<SRegOperandType, float, Enums::M_F32, VT_32> F32; + typedef HsailDataType<DRegOperandType, double, Enums::M_F64, VT_64> F64; + + template<typename DestOperandType, typename SrcOperandType, + int NumSrcOperands> + class CommonInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename SrcOperandType::SrcOperand src[NumSrcOperands]; + + void + generateDisassembly() + { + disassembly = csprintf("%s%s %s", opcode, opcode_suffix(), + dest.disassemble()); + + for (int i = 0; i < NumSrcOperands; ++i) { + disassembly += ","; + disassembly += src[i].disassemble(); + } + } + + virtual std::string opcode_suffix() = 0; + + public: + CommonInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + + dest.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 1); + src[i].init(op_offs, obj); + } + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return 
src[operandIndex].isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return true; + return false; + } + + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= NumSrcOperands) + return true; + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + + if (operandIndex < NumSrcOperands) + return src[operandIndex].regIndex(); + else + return dest.regIndex(); + } + int numSrcRegOperands() { + int operands = 0; + for (int i = 0; i < NumSrcOperands; i++) { + if (src[i].isVectorRegister() == true) { + operands++; + } + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return NumSrcOperands + 1; } + }; + + template<typename DataType, int NumSrcOperands> + class ArithInst : public CommonInstBase<typename DataType::OperandType, + typename DataType::OperandType, + NumSrcOperands> + { + public: + std::string opcode_suffix() { return csprintf("_%s", DataType::label); } + + ArithInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : CommonInstBase<typename DataType::OperandType, + typename DataType::OperandType, + NumSrcOperands>(ib, obj, opcode) + { + } + }; + + template<typename DestOperandType, typename Src0OperandType, + typename Src1OperandType, typename Src2OperandType> + class ThreeNonUniformSourceInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename Src0OperandType::SrcOperand src0; + typename Src1OperandType::SrcOperand src1; + typename Src2OperandType::SrcOperand src2; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s,%s,%s", opcode, dest.disassemble(), + src0.disassemble(), src1.disassemble(), + src2.disassemble()); + } + + public: + ThreeNonUniformSourceInstBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, + const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 3); + src2.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isVectorRegister(); + else if (operandIndex == 1) + return src1.isVectorRegister(); + else if (operandIndex == 2) + return src2.isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool 
isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isCondRegister(); + else if (operandIndex == 1) + return src1.isCondRegister(); + else if (operandIndex == 2) + return src2.isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isScalarRegister(); + else if (operandIndex == 1) + return src1.isScalarRegister(); + else if (operandIndex == 2) + return src2.isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < 3) + return true; + else + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= 3) + return true; + else + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.opSize(); + else if (operandIndex == 1) + return src1.opSize(); + else if (operandIndex == 2) + return src2.opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.regIndex(); + else if (operandIndex == 1) + return src1.regIndex(); + else if (operandIndex == 2) + return src2.regIndex(); + else + return dest.regIndex(); + } + + int numSrcRegOperands() { + int operands = 0; + if (src0.isVectorRegister() == true) { + operands++; + } + if (src1.isVectorRegister() == true) { + operands++; + } + if (src2.isVectorRegister() == true) { + operands++; + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 4; } + }; + + template<typename DestDataType, typename Src0DataType, + typename Src1DataType, typename Src2DataType> + class ThreeNonUniformSourceInst : + public ThreeNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType, + typename Src2DataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + typedef typename Src0DataType::CType Src0CType; + typedef typename Src1DataType::CType Src1CType; + typedef typename Src2DataType::CType Src2CType; + + ThreeNonUniformSourceInst(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : ThreeNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType, + typename Src2DataType::OperandType>(ib, + obj, opcode) + { + } + }; + + template<typename DataType> + class CmovInst : public ThreeNonUniformSourceInst<DataType, B1, + DataType, DataType> + { + public: + CmovInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : ThreeNonUniformSourceInst<DataType, B1, DataType, + DataType>(ib, obj, opcode) + { + } + }; + + template<typename DataType> + class ExtractInsertInst : public ThreeNonUniformSourceInst<DataType, + DataType, U32, + U32> + { + public: + ExtractInsertInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : ThreeNonUniformSourceInst<DataType, DataType, U32, + U32>(ib, obj, opcode) + { + } + }; + + template<typename DestOperandType, typename Src0OperandType, + 
typename Src1OperandType> + class TwoNonUniformSourceInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename Src0OperandType::SrcOperand src0; + typename Src1OperandType::SrcOperand src1; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s,%s", opcode, dest.disassemble(), + src0.disassemble(), src1.disassemble()); + } + + + public: + TwoNonUniformSourceInstBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + } + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isVectorRegister(); + else if (operandIndex == 1) + return src1.isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isCondRegister(); + else if (operandIndex == 1) + return src1.isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isScalarRegister(); + else if (operandIndex == 1) + return src1.isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < 2) + return true; + else + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= 2) + return true; + else + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.opSize(); + else if (operandIndex == 1) + return src1.opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.regIndex(); + else if (operandIndex == 1) + return src1.regIndex(); + else + return dest.regIndex(); + } + + int numSrcRegOperands() { + int operands = 0; + if (src0.isVectorRegister() == true) { + operands++; + } + if (src1.isVectorRegister() == true) { + operands++; + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 3; } + }; + + template<typename DestDataType, typename Src0DataType, + typename Src1DataType> + class TwoNonUniformSourceInst : + public TwoNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + typedef typename Src0DataType::CType Src0CType; + typedef typename Src1DataType::CType Src1CType; + + TwoNonUniformSourceInst(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : TwoNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType>(ib, + obj, opcode) + { + } + }; + + // helper function for ClassInst + template<typename T> + bool + fpclassify(T src0, 
uint32_t src1) + { + int fpclass = std::fpclassify(src0); + + if ((src1 & 0x3) && (fpclass == FP_NAN)) { + return true; + } + + if (src0 <= -0.0) { + if ((src1 & 0x4) && fpclass == FP_INFINITE) + return true; + if ((src1 & 0x8) && fpclass == FP_NORMAL) + return true; + if ((src1 & 0x10) && fpclass == FP_SUBNORMAL) + return true; + if ((src1 & 0x20) && fpclass == FP_ZERO) + return true; + } else { + if ((src1 & 0x40) && fpclass == FP_ZERO) + return true; + if ((src1 & 0x80) && fpclass == FP_SUBNORMAL) + return true; + if ((src1 & 0x100) && fpclass == FP_NORMAL) + return true; + if ((src1 & 0x200) && fpclass == FP_INFINITE) + return true; + } + return false; + } + + template<typename DataType> + class ClassInst : public TwoNonUniformSourceInst<B1, DataType, U32> + { + public: + ClassInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : TwoNonUniformSourceInst<B1, DataType, U32>(ib, obj, opcode) + { + } + }; + + template<typename DataType> + class ShiftInst : public TwoNonUniformSourceInst<DataType, DataType, U32> + { + public: + ShiftInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : TwoNonUniformSourceInst<DataType, DataType, U32>(ib, obj, opcode) + { + } + }; + + // helper function for CmpInst + template<typename T> + bool + compare(T src0, T src1, Brig::BrigCompareOperation cmpOp) + { + using namespace Brig; + + switch (cmpOp) { + case BRIG_COMPARE_EQ: + case BRIG_COMPARE_EQU: + case BRIG_COMPARE_SEQ: + case BRIG_COMPARE_SEQU: + return (src0 == src1); + + case BRIG_COMPARE_NE: + case BRIG_COMPARE_NEU: + case BRIG_COMPARE_SNE: + case BRIG_COMPARE_SNEU: + return (src0 != src1); + + case BRIG_COMPARE_LT: + case BRIG_COMPARE_LTU: + case BRIG_COMPARE_SLT: + case BRIG_COMPARE_SLTU: + return (src0 < src1); + + case BRIG_COMPARE_LE: + case BRIG_COMPARE_LEU: + case BRIG_COMPARE_SLE: + case BRIG_COMPARE_SLEU: + return (src0 <= src1); + + case BRIG_COMPARE_GT: + case BRIG_COMPARE_GTU: + case BRIG_COMPARE_SGT: + case BRIG_COMPARE_SGTU: + return (src0 > src1); + + case BRIG_COMPARE_GE: + case BRIG_COMPARE_GEU: + case BRIG_COMPARE_SGE: + case BRIG_COMPARE_SGEU: + return (src0 >= src1); + + case BRIG_COMPARE_NUM: + case BRIG_COMPARE_SNUM: + return (src0 == src0) || (src1 == src1); + + case BRIG_COMPARE_NAN: + case BRIG_COMPARE_SNAN: + return (src0 != src0) || (src1 != src1); + + default: + fatal("Bad cmpOp value %d\n", (int)cmpOp); + } + } + + template<typename T> + int32_t + firstbit(T src0) + { + if (!src0) + return -1; + + //handle positive and negative numbers + T tmp = (src0 < 0) ? 
(~src0) : (src0); + + //the starting pos is MSB + int pos = 8 * sizeof(T) - 1; + int cnt = 0; + + //search the first bit set to 1 + while (!(tmp & (1 << pos))) { + ++cnt; + --pos; + } + return cnt; + } + + const char* cmpOpToString(Brig::BrigCompareOperation cmpOp); + + template<typename DestOperandType, typename SrcOperandType> + class CmpInstBase : public CommonInstBase<DestOperandType, SrcOperandType, + 2> + { + protected: + Brig::BrigCompareOperation cmpOp; + + public: + CmpInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CommonInstBase<DestOperandType, SrcOperandType, 2>(ib, obj, + _opcode) + { + assert(ib->base.kind == Brig::BRIG_KIND_INST_CMP); + Brig::BrigInstCmp *i = (Brig::BrigInstCmp*)ib; + cmpOp = (Brig::BrigCompareOperation)i->compare; + } + }; + + template<typename DestDataType, typename SrcDataType> + class CmpInst : public CmpInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType> + { + public: + std::string + opcode_suffix() + { + return csprintf("_%s_%s_%s", cmpOpToString(this->cmpOp), + DestDataType::label, SrcDataType::label); + } + + CmpInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CmpInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType>(ib, obj, _opcode) + { + } + }; + + template<typename DestDataType, typename SrcDataType> + class CvtInst : public CommonInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType, 1> + { + public: + std::string opcode_suffix() + { + return csprintf("_%s_%s", DestDataType::label, SrcDataType::label); + } + + CvtInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CommonInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType, + 1>(ib, obj, _opcode) + { + } + }; + + class SpecialInstNoSrcNoDest : public HsailGPUStaticInst + { + public: + SpecialInstNoSrcNoDest(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + } + + bool isVectorRegister(int operandIndex) { return false; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return -1; } + + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + int getNumOperands() { return 0; } + }; + + template<typename DestOperandType> + class SpecialInstNoSrcBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + + void generateDisassembly() + { + disassembly = csprintf("%s %s", opcode, dest.disassemble()); + } + + public: + SpecialInstNoSrcBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isScalarRegister(); + } + 
bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return true; } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.regIndex(); + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 1; } + }; + + template<typename DestDataType> + class SpecialInstNoSrc : + public SpecialInstNoSrcBase<typename DestDataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + + SpecialInstNoSrc(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : SpecialInstNoSrcBase<typename DestDataType::OperandType>(ib, obj, + _opcode) + { + } + }; + + template<typename DestOperandType> + class SpecialInst1SrcBase : public HsailGPUStaticInst + { + protected: + typedef int SrcCType; // used in execute() template + + typename DestOperandType::DestOperand dest; + ImmOperand<SrcCType> src0; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s", opcode, dest.disassemble(), + src0.disassemble()); + } + + public: + SpecialInst1SrcBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + } + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return true; } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.regIndex(); + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 1; } + }; + + template<typename DestDataType> + class SpecialInst1Src : + public SpecialInst1SrcBase<typename DestDataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + + SpecialInst1Src(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : SpecialInst1SrcBase<typename DestDataType::OperandType>(ib, obj, + _opcode) + { + } + }; + + class Ret : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + + Ret(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "ret") + { + o_type = Enums::OT_RET; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + class Barrier : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + uint8_t width; + + Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "barrier") + { + o_type = Enums::OT_BARRIER; + assert(ib->base.kind == Brig::BRIG_KIND_INST_BR); + width = 
(uint8_t)((Brig::BrigInstBr*)ib)->width; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + class MemFence : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + + Brig::BrigMemoryOrder memFenceMemOrder; + Brig::BrigMemoryScope memFenceScopeSegGroup; + Brig::BrigMemoryScope memFenceScopeSegGlobal; + Brig::BrigMemoryScope memFenceScopeSegImage; + + MemFence(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "memfence") + { + assert(ib->base.kind == Brig::BRIG_KIND_INST_MEM_FENCE); + + memFenceScopeSegGlobal = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->globalSegmentMemoryScope; + + memFenceScopeSegGroup = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->groupSegmentMemoryScope; + + memFenceScopeSegImage = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->imageSegmentMemoryScope; + + memFenceMemOrder = (Brig::BrigMemoryOrder) + ((Brig::BrigInstMemFence*)ib)->memoryOrder; + + // set o_type based on scopes + if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE && + memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_BOTH_MEMFENCE; + } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_GLOBAL_MEMFENCE; + } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_SHARED_MEMFENCE; + } else { + fatal("MemFence constructor: bad scope specifiers\n"); + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) + { + Wavefront *wave = gpuDynInst->wavefront(); + wave->computeUnit->injectGlobalMemFence(gpuDynInst); + } + + void + execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + // 2 cases: + // * memfence to a sequentially consistent memory (e.g., LDS). + // These can be handled as no-ops. + // * memfence to a relaxed consistency cache (e.g., Hermes, Viper, + // etc.). We send a packet, tagged with the memory order and + // scope, and let the GPU coalescer handle it. 
+ + if (o_type == Enums::OT_GLOBAL_MEMFENCE || + o_type == Enums::OT_BOTH_MEMFENCE) { + gpuDynInst->simdId = w->simdId; + gpuDynInst->wfSlotId = w->wfSlotId; + gpuDynInst->wfDynId = w->wfDynId; + gpuDynInst->kern_id = w->kern_id; + gpuDynInst->cu_id = w->computeUnit->cu_id; + + gpuDynInst->memoryOrder = + getGenericMemoryOrder(memFenceMemOrder); + gpuDynInst->scope = + getGenericMemoryScope(memFenceScopeSegGlobal); + gpuDynInst->useContinuation = false; + GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe); + gmp->getGMReqFIFO().push(gpuDynInst); + + w->wr_gm_reqs_in_pipe--; + w->rd_gm_reqs_in_pipe--; + w->mem_reqs_in_pipe--; + w->outstanding_reqs++; + } else if (o_type == Enums::OT_SHARED_MEMFENCE) { + // no-op + } else { + fatal("MemFence execute: bad o_type\n"); + } + } + }; + + class Call : public HsailGPUStaticInst + { + public: + // private helper functions + void calcAddr(Wavefront* w, GPUDynInstPtr m); + + void + generateDisassembly() + { + if (dest.disassemble() == "") { + disassembly = csprintf("%s %s (%s)", opcode, src0.disassemble(), + src1.disassemble()); + } else { + disassembly = csprintf("%s %s (%s) (%s)", opcode, + src0.disassemble(), dest.disassemble(), + src1.disassemble()); + } + } + + bool + isPseudoOp() + { + std::string func_name = src0.disassemble(); + if (func_name.find("__gem5_hsail_op") != std::string::npos) { + return true; + } + return false; + } + + // member variables + ListOperand dest; + FunctionRefOperand src0; + ListOperand src1; + HsailCode *func_ptr; + + // exec function for pseudo instructions mapped on top of call opcode + void execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst); + + // user-defined pseudo instructions + void MagicPrintLane(Wavefront *w); + void MagicPrintLane64(Wavefront *w); + void MagicPrintWF32(Wavefront *w); + void MagicPrintWF64(Wavefront *w); + void MagicPrintWFFloat(Wavefront *w); + void MagicSimBreak(Wavefront *w); + void MagicPrefixSum(Wavefront *w); + void MagicReduction(Wavefront *w); + void MagicMaskLower(Wavefront *w); + void MagicMaskUpper(Wavefront *w); + void MagicJoinWFBar(Wavefront *w); + void MagicWaitWFBar(Wavefront *w); + void MagicPanic(Wavefront *w); + + void MagicAtomicNRAddGlobalU32Reg(Wavefront *w, + GPUDynInstPtr gpuDynInst); + + void MagicAtomicNRAddGroupU32Reg(Wavefront *w, + GPUDynInstPtr gpuDynInst); + + void MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst); + + void MagicXactCasLd(Wavefront *w); + void MagicMostSigThread(Wavefront *w); + void MagicMostSigBroadcast(Wavefront *w); + + void MagicPrintWF32ID(Wavefront *w); + void MagicPrintWFID64(Wavefront *w); + + Call(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "call") + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + func_ptr = nullptr; + std::string func_name = src0.disassemble(); + if (!isPseudoOp()) { + func_ptr = dynamic_cast<HsailCode*>(obj-> + getFunction(func_name)); + + if (!func_ptr) + fatal("call::exec cannot find function: %s\n", func_name); + } + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { return false; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } 
+ int getRegisterIndex(int operandIndex) { return -1; } + + void + execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + std::string func_name = src0.disassemble(); + if (isPseudoOp()) { + execPseudoInst(w, gpuDynInst); + } else { + fatal("Native HSAIL functions are not yet implemented: %s\n", + func_name); + } + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + int getNumOperands() { return 2; } + }; + + template<typename T> T heynot(T arg) { return ~arg; } + template<> inline bool heynot<bool>(bool arg) { return !arg; } +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_DECL_HH__ diff --git a/src/arch/hsail/insts/gpu_static_inst.cc b/src/arch/hsail/insts/gpu_static_inst.cc new file mode 100644 index 000000000..bbaeb13e6 --- /dev/null +++ b/src/arch/hsail/insts/gpu_static_inst.cc @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "arch/hsail/insts/gpu_static_inst.hh" + +#include "gpu-compute/brig_object.hh" + +namespace HsailISA +{ + HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj, + const std::string &opcode) + : GPUStaticInst(opcode), hsailCode(obj->currentCode) + { + } + + void + HsailGPUStaticInst::generateDisassembly() + { + disassembly = opcode; + } + + const std::string& + HsailGPUStaticInst::disassemble() + { + if (disassembly.empty()) { + generateDisassembly(); + assert(!disassembly.empty()); + } + + return disassembly; + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/gpu_static_inst.hh b/src/arch/hsail/insts/gpu_static_inst.hh new file mode 100644 index 000000000..29aab1f70 --- /dev/null +++ b/src/arch/hsail/insts/gpu_static_inst.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ +#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ + +/* + * @file gpu_static_inst.hh + * + * Defines the base class representing HSAIL GPU static instructions. + */ + +#include "gpu-compute/gpu_static_inst.hh" + +class BrigObject; +class HsailCode; + +namespace HsailISA +{ + class HsailGPUStaticInst : public GPUStaticInst + { + public: + HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode); + void generateDisassembly(); + const std::string &disassemble(); + uint32_t instSize() { return 4; } + + protected: + HsailCode *hsailCode; + }; +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc new file mode 100644 index 000000000..4e70bf46a --- /dev/null +++ b/src/arch/hsail/insts/main.cc @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/insts/decl.hh" +#include "debug/GPUExec.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/simple_pool_manager.hh" + +namespace HsailISA +{ + template<> const char *B1::label = "b1"; + template<> const char *B8::label = "b8"; + template<> const char *B16::label = "b16"; + template<> const char *B32::label = "b32"; + template<> const char *B64::label = "b64"; + + template<> const char *S8::label = "s8"; + template<> const char *S16::label = "s16"; + template<> const char *S32::label = "s32"; + template<> const char *S64::label = "s64"; + + template<> const char *U8::label = "u8"; + template<> const char *U16::label = "u16"; + template<> const char *U32::label = "u32"; + template<> const char *U64::label = "u64"; + + template<> const char *F32::label = "f32"; + template<> const char *F64::label = "f64"; + + const char* + cmpOpToString(Brig::BrigCompareOperation cmpOp) + { + using namespace Brig; + + switch (cmpOp) { + case BRIG_COMPARE_EQ: + return "eq"; + case BRIG_COMPARE_NE: + return "ne"; + case BRIG_COMPARE_LT: + return "lt"; + case BRIG_COMPARE_LE: + return "le"; + case BRIG_COMPARE_GT: + return "gt"; + case BRIG_COMPARE_GE: + return "ge"; + case BRIG_COMPARE_EQU: + return "equ"; + case BRIG_COMPARE_NEU: + return "neu"; + case BRIG_COMPARE_LTU: + return "ltu"; + case BRIG_COMPARE_LEU: + return "leu"; + case BRIG_COMPARE_GTU: + return "gtu"; + case BRIG_COMPARE_GEU: + return "geu"; + case BRIG_COMPARE_NUM: + return "num"; + case BRIG_COMPARE_NAN: + return "nan"; + case BRIG_COMPARE_SEQ: + return "seq"; + case BRIG_COMPARE_SNE: + return "sne"; + case BRIG_COMPARE_SLT: + return "slt"; + case BRIG_COMPARE_SLE: + return "sle"; + case BRIG_COMPARE_SGT: + return "sgt"; + case BRIG_COMPARE_SGE: + return "sge"; + case BRIG_COMPARE_SGEU: + return "sgeu"; + case BRIG_COMPARE_SEQU: + return "sequ"; + case BRIG_COMPARE_SNEU: + return "sneu"; + case BRIG_COMPARE_SLTU: + return "sltu"; + case BRIG_COMPARE_SLEU: + return "sleu"; + case BRIG_COMPARE_SNUM: + return "snum"; + case BRIG_COMPARE_SNAN: + return "snan"; + case BRIG_COMPARE_SGTU: + return "sgtu"; + default: + return "unknown"; + } + } + + void + Ret::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + // mask off completed work-items + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->init_mask[lane] = 0; + } + + } + + // delete extra instructions fetched for completed work-items + w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, + w->instructionBuffer.end()); + if (w->pendingFetch) { + w->dropFetch = true; + } + + // if all work-items have completed, then wave-front is done + 
if (w->init_mask.none()) { + w->status = Wavefront::S_STOPPED; + + int32_t refCount = w->computeUnit->getLds(). + decreaseRefCounter(w->dispatchid, w->wg_id); + + DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", + w->computeUnit->cu_id, w->wg_id, refCount); + + // free the vector registers of the completed wavefront + w->computeUnit->vectorRegsReserved[w->simdId] -= + w->reservedVectorRegs; + + assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0); + + uint32_t endIndex = (w->startVgprIndex + + w->reservedVectorRegs - 1) % + w->computeUnit->vrf[w->simdId]->numRegs(); + + w->computeUnit->vrf[w->simdId]->manager-> + freeRegion(w->startVgprIndex, endIndex); + + w->reservedVectorRegs = 0; + w->startVgprIndex = 0; + w->computeUnit->completedWfs++; + + DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId); + + if (!refCount) { + // Notify Memory System of Kernel Completion + // Kernel End = isKernel + isRelease + w->status = Wavefront::S_RETURNING; + GPUDynInstPtr local_mempacket = gpuDynInst; + local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE; + local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM; + local_mempacket->useContinuation = false; + local_mempacket->simdId = w->simdId; + local_mempacket->wfSlotId = w->wfSlotId; + local_mempacket->wfDynId = w->wfDynId; + w->computeUnit->injectGlobalMemFence(local_mempacket, true); + } else { + w->computeUnit->shader->dispatcher->scheduleDispatch(); + } + } + } + + void + Barrier::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + assert(w->barrier_cnt == w->old_barrier_cnt); + w->barrier_cnt = w->old_barrier_cnt + 1; + w->stalledAtBarrier = true; + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc new file mode 100644 index 000000000..97d4c902b --- /dev/null +++ b/src/arch/hsail/insts/mem.cc @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/insts/mem.hh" + +#include "arch/hsail/Brig.h" +#include "enums/OpType.hh" + +using namespace Brig; + +namespace HsailISA +{ + const char* atomicOpToString(BrigAtomicOperation brigOp); + + Enums::MemOpType + brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp) + { + if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return Enums::MO_AAND; + case BRIG_ATOMIC_OR: + return Enums::MO_AOR; + case BRIG_ATOMIC_XOR: + return Enums::MO_AXOR; + case BRIG_ATOMIC_CAS: + return Enums::MO_ACAS; + case BRIG_ATOMIC_EXCH: + return Enums::MO_AEXCH; + case BRIG_ATOMIC_ADD: + return Enums::MO_AADD; + case BRIG_ATOMIC_WRAPINC: + return Enums::MO_AINC; + case BRIG_ATOMIC_WRAPDEC: + return Enums::MO_ADEC; + case BRIG_ATOMIC_MIN: + return Enums::MO_AMIN; + case BRIG_ATOMIC_MAX: + return Enums::MO_AMAX; + case BRIG_ATOMIC_SUB: + return Enums::MO_ASUB; + default: + fatal("Bad BrigAtomicOperation code %d\n", brigOp); + } + } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return Enums::MO_ANRAND; + case BRIG_ATOMIC_OR: + return Enums::MO_ANROR; + case BRIG_ATOMIC_XOR: + return Enums::MO_ANRXOR; + case BRIG_ATOMIC_CAS: + return Enums::MO_ANRCAS; + case BRIG_ATOMIC_EXCH: + return Enums::MO_ANREXCH; + case BRIG_ATOMIC_ADD: + return Enums::MO_ANRADD; + case BRIG_ATOMIC_WRAPINC: + return Enums::MO_ANRINC; + case BRIG_ATOMIC_WRAPDEC: + return Enums::MO_ANRDEC; + case BRIG_ATOMIC_MIN: + return Enums::MO_ANRMIN; + case BRIG_ATOMIC_MAX: + return Enums::MO_ANRMAX; + case BRIG_ATOMIC_SUB: + return Enums::MO_ANRSUB; + default: + fatal("Bad BrigAtomicOperation code %d\n", brigOp); + } + } else { + fatal("Bad BrigAtomicOpcode %d\n", brigOpCode); + } + } + + const char* + atomicOpToString(BrigAtomicOperation brigOp) + { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return "and"; + case BRIG_ATOMIC_OR: + return "or"; + case BRIG_ATOMIC_XOR: + return "xor"; + case BRIG_ATOMIC_CAS: + return "cas"; + case BRIG_ATOMIC_EXCH: + return "exch"; + case BRIG_ATOMIC_ADD: + return "add"; + case BRIG_ATOMIC_WRAPINC: + return "inc"; + case BRIG_ATOMIC_WRAPDEC: + return "dec"; + case BRIG_ATOMIC_MIN: + return "min"; + case BRIG_ATOMIC_MAX: + return "max"; + case BRIG_ATOMIC_SUB: + return "sub"; + default: + return "unknown"; + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh new file mode 100644 index 000000000..d3ce76dee --- /dev/null +++ b/src/arch/hsail/insts/mem.hh @@ -0,0 +1,1629 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_MEM_HH__ +#define __ARCH_HSAIL_INSTS_MEM_HH__ + +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" + +namespace HsailISA +{ + class MemInst + { + public: + MemInst() : size(0), addr_operand(nullptr) { } + + MemInst(Enums::MemType m_type) + { + if (m_type == Enums::M_U64 || + m_type == Enums::M_S64 || + m_type == Enums::M_F64) { + size = 8; + } else if (m_type == Enums::M_U32 || + m_type == Enums::M_S32 || + m_type == Enums::M_F32) { + size = 4; + } else if (m_type == Enums::M_U16 || + m_type == Enums::M_S16 || + m_type == Enums::M_F16) { + size = 2; + } else { + size = 1; + } + + addr_operand = nullptr; + } + + void + init_addr(AddrOperandBase *_addr_operand) + { + addr_operand = _addr_operand; + } + + private: + int size; + AddrOperandBase *addr_operand; + + public: + int getMemOperandSize() { return size; } + AddrOperandBase *getAddressOperand() { return addr_operand; } + }; + + template<typename DestOperandType, typename AddrOperandType> + class LdaInstBase : public HsailGPUStaticInst + { + public: + typename DestOperandType::DestOperand dest; + AddrOperandType addr; + + LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + int numSrcRegOperands() { return(this->addr.isVectorRegister()); } + int numDstRegOperands() { return dest.isVectorRegister(); } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isVectorRegister() : + this->addr.isVectorRegister()); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? 
dest.isCondRegister() : + this->addr.isCondRegister()); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isScalarRegister() : + this->addr.isScalarRegister()); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex > 0) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return(operandIndex == 0); + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.opSize() : + this->addr.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.regIndex() : + this->addr.regIndex()); + } + int getNumOperands() + { + if (this->addr.isVectorRegister()) + return 2; + return 1; + } + }; + + template<typename DestDataType, typename AddrOperandType> + class LdaInst : + public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>, + public MemInst + { + public: + void generateDisassembly(); + + LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : LdaInstBase<typename DestDataType::OperandType, + AddrOperandType>(ib, obj, _opcode) + { + init_addr(&this->addr); + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename DataType> + GPUStaticInst* + decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj); + + if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas"); + } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (regDataType.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas"); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas"); + default: + fatal("Bad ldas register operand type %d\n", regDataType.type); + } + } else { + fatal("Bad ldas register operand kind %d\n", regDataType.kind); + } + } + + template<typename MemOperandType, typename DestOperandType, + typename AddrOperandType> + class LdInstBase : public HsailGPUStaticInst + { + public: + Brig::BrigWidth8_t width; + typename DestOperandType::DestOperand dest; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryOrder memoryOrder; + Brig::BrigMemoryScope memoryScope; + unsigned int equivClass; + bool isArgLoad() + { + return segment == Brig::BRIG_SEGMENT_KERNARG || + segment == Brig::BRIG_SEGMENT_ARG; + } + void + initLd(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstMem *ldst = (const BrigInstMem*)ib; + + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_READ; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_READ; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_READ; + break; + + case BRIG_SEGMENT_READONLY: + o_type = 
Enums::OT_READONLY_READ; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_READ; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_READ; + break; + + case BRIG_SEGMENT_KERNARG: + o_type = Enums::OT_KERN_READ; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("Ld: segment %d not supported\n", segment); + } + + width = ldst->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + void + initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + memoryScope = (BrigMemoryScope)at->memoryScope; + equivClass = 0; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_READ; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_READ; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_READ; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_READ; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_READ; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_READ; + break; + + case BRIG_SEGMENT_KERNARG: + o_type = Enums::OT_KERN_READ; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("Ld: segment %d not supported\n", segment); + } + + width = BRIG_WIDTH_1; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands,1); + addr.init(op_offs, obj); + } + + LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + if (ib->opcode == BRIG_OPCODE_LD) { + initLd(ib, obj, _opcode); + } else { + initAtomicLd(ib, obj, _opcode); + } + } + + int numSrcRegOperands() { return(this->addr.isVectorRegister()); } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() + { + if (this->addr.isVectorRegister()) + return 2; + else + return 1; + } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isVectorRegister() : + this->addr.isVectorRegister()); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isCondRegister() : + this->addr.isCondRegister()); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? 
dest.isScalarRegister() : + this->addr.isScalarRegister()); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex > 0) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return(operandIndex == 0); + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.opSize() : + this->addr.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.regIndex() : + this->addr.regIndex()); + } + }; + + template<typename MemDataType, typename DestDataType, + typename AddrOperandType> + class LdInst : + public LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, AddrOperandType>, + public MemInst + { + typename DestDataType::OperandType::DestOperand dest_vect[4]; + uint16_t num_dest_operands; + void generateDisassembly(); + + public: + LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>(ib, obj, _opcode), + MemInst(MemDataType::memType) + { + init_addr(&this->addr); + + unsigned op_offs = obj->getOperandPtr(ib->operands,0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + + if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)brigOp; + + num_dest_operands = + *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; + + assert(num_dest_operands <= 4); + } else { + num_dest_operands = 1; + } + + if (num_dest_operands > 1) { + assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + + for (int i = 0; i < num_dest_operands; ++i) { + dest_vect[i].init_from_vect(op_offs, obj, i); + } + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + typedef typename MemDataType::CType c0; + + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + if (num_dest_operands > 1) { + for (int i = 0; i < VSZ; ++i) + if (gpuDynInst->exec_mask[i]) + gpuDynInst->statusVector.push_back(num_dest_operands); + else + gpuDynInst->statusVector.push_back(0); + } + + for (int k = 0; k < num_dest_operands; ++k) { + + c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); + + if (isLocalMem()) { + // load from shared memory + *d = gpuDynInst->wavefront()->ldsChunk-> + read<c0>(vaddr); + } else { + Request *req = new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->dataStatic(d); + + if (gpuDynInst->computeUnit()->shader-> + separate_acquire_release && + gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + // if this load has acquire semantics, + // set the response continuation function + // to perform an Acquire request + gpuDynInst->execContinuation = + &GPUStaticInst::execLdAcq; + + gpuDynInst->useContinuation = true; + } else { + // the request will be finished when + // the load completes + gpuDynInst->useContinuation = false; + } + // translation is performed in sendRequest() + 
gpuDynInst->computeUnit()->sendRequest(gpuDynInst, + i, pkt); + } + } + ++d; + } + } + + gpuDynInst->updateStats(); + } + + private: + void + execLdAcq(GPUDynInstPtr gpuDynInst) override + { + // after the load has completed and if the load has acquire + // semantics, issue an acquire request. + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->useContinuation = false; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::ACQUIRE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + } + } + } + + public: + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isVectorRegister()); + if (num_dest_operands > 1) { + return dest_vect[operandIndex].isVectorRegister(); + } + else if (num_dest_operands == 1) { + return LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.isVectorRegister(); + } + return false; + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isCondRegister()); + if (num_dest_operands > 1) + return dest_vect[operandIndex].isCondRegister(); + else if (num_dest_operands == 1) + return LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.isCondRegister(); + return false; + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isScalarRegister()); + if (num_dest_operands > 1) + return dest_vect[operandIndex].isScalarRegister(); + else if (num_dest_operands == 1) + return LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.isScalarRegister(); + return false; + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return false; + return true; + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.opSize()); + if (num_dest_operands > 1) + return(dest_vect[operandIndex].opSize()); + else if (num_dest_operands == 1) + return(LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.opSize()); + return 0; + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if 
((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.regIndex()); + if (num_dest_operands > 1) + return(dest_vect[operandIndex].regIndex()); + else if (num_dest_operands == 1) + return(LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.regIndex()); + return -1; + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return(num_dest_operands+1); + else + return(num_dest_operands); + } + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename MemDT, typename DestDT> + GPUStaticInst* + decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands,1); + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld"); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER || + tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new LdInst<MemDT, DestDT, + SRegAddrOperand>(ib, obj, "ld"); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new LdInst<MemDT, DestDT, + DRegAddrOperand>(ib, obj, "ld"); + default: + fatal("Bad ld register operand type %d\n", tmp.regKind); + } + } else { + fatal("Bad ld register operand kind %d\n", tmp.kind); + } + } + + template<typename MemDT> + GPUStaticInst* + decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands,0); + BrigRegOperandInfo dest = findRegDataType(op_offs, obj); + + assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER || + dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + switch(dest.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + switch (ib->type) { + case Brig::BRIG_TYPE_B8: + case Brig::BRIG_TYPE_B16: + case Brig::BRIG_TYPE_B32: + return decodeLd2<MemDT, B32>(ib, obj); + case Brig::BRIG_TYPE_U8: + case Brig::BRIG_TYPE_U16: + case Brig::BRIG_TYPE_U32: + return decodeLd2<MemDT, U32>(ib, obj); + case Brig::BRIG_TYPE_S8: + case Brig::BRIG_TYPE_S16: + case Brig::BRIG_TYPE_S32: + return decodeLd2<MemDT, S32>(ib, obj); + case Brig::BRIG_TYPE_F16: + case Brig::BRIG_TYPE_F32: + return decodeLd2<MemDT, U32>(ib, obj); + default: + fatal("Bad ld register operand type %d, %d\n", + dest.regKind, ib->type); + }; + case Brig::BRIG_REGISTER_KIND_DOUBLE: + switch (ib->type) { + case Brig::BRIG_TYPE_B64: + return decodeLd2<MemDT, B64>(ib, obj); + case Brig::BRIG_TYPE_U64: + return decodeLd2<MemDT, U64>(ib, obj); + case Brig::BRIG_TYPE_S64: + return decodeLd2<MemDT, S64>(ib, obj); + case Brig::BRIG_TYPE_F64: + return decodeLd2<MemDT, U64>(ib, obj); + default: + fatal("Bad ld register operand type %d, %d\n", + dest.regKind, ib->type); + }; + default: + fatal("Bad ld register operand type %d, %d\n", dest.regKind, + ib->type); + } + } + + template<typename MemDataType, typename SrcOperandType, + typename AddrOperandType> + class StInstBase : public HsailGPUStaticInst + { + public: + typename SrcOperandType::SrcOperand src; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryScope memoryScope; + Brig::BrigMemoryOrder memoryOrder; + unsigned int equivClass; + + void + initSt(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstMem *ldst = (const BrigInstMem*)ib; + + segment = (BrigSegment)ldst->segment; + 
memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_WRITE; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_WRITE; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_WRITE; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_WRITE; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_WRITE; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_WRITE; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("St: segment %d not supported\n", segment); + } + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const BrigOperand *baseOp = obj->getOperand(op_offs); + + if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || + (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { + src.init(op_offs, obj); + } + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + void + initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + equivClass = 0; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_WRITE; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_WRITE; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_WRITE; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_WRITE; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_WRITE; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_WRITE; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("St: segment %d not supported\n", segment); + } + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src.init(op_offs, obj); + } + + StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + if (ib->opcode == BRIG_OPCODE_ST) { + initSt(ib, obj, _opcode); + } else { + initAtomicSt(ib, obj, _opcode); + } + } + + int numDstRegOperands() { return 0; } + int numSrcRegOperands() + { + return src.isVectorRegister() + this->addr.isVectorRegister(); + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return 2; + else + return 1; + } + bool isVectorRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.isVectorRegister() : + this->addr.isVectorRegister(); + } + bool isCondRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.isCondRegister() : + this->addr.isCondRegister(); + } + bool isScalarRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? 
src.isScalarRegister() : + this->addr.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.opSize() : this->addr.opSize(); + } + int getRegisterIndex(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.regIndex() : this->addr.regIndex(); + } + }; + + + template<typename MemDataType, typename SrcDataType, + typename AddrOperandType> + class StInst : + public StInstBase<MemDataType, typename SrcDataType::OperandType, + AddrOperandType>, + public MemInst + { + public: + typename SrcDataType::OperandType::SrcOperand src_vect[4]; + uint16_t num_src_operands; + void generateDisassembly(); + + StInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode, int srcIdx) + : StInstBase<MemDataType, typename SrcDataType::OperandType, + AddrOperandType>(ib, obj, _opcode), + MemInst(SrcDataType::memType) + { + init_addr(&this->addr); + + BrigRegOperandInfo rinfo; + unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx); + const Brig::BrigOperand *baseOp = obj->getOperand(op_offs); + + if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { + const Brig::BrigOperandConstantBytes *op = + (Brig::BrigOperandConstantBytes*)baseOp; + + rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind, + Brig::BRIG_TYPE_NONE); + } else { + rinfo = findRegDataType(op_offs, obj); + } + + if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)baseOp; + + num_src_operands = + *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; + + assert(num_src_operands <= 4); + } else { + num_src_operands = 1; + } + + if (num_src_operands > 1) { + assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + + for (int i = 0; i < num_src_operands; ++i) { + src_vect[i].init_from_vect(op_offs, obj, i); + } + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + // before performing a store, check if this store has + // release semantics, and if so issue a release first + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_RELEASE) { + + gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->execContinuation = &GPUStaticInst::execSt; + gpuDynInst->useContinuation = true; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::RELEASE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + + return; + } + } + + // if there is no release semantic, perform stores immediately + execSt(gpuDynInst); + } + + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + private: + // execSt may be called through a continuation + // if the store had release semantics. 
see comment for + // execSt in gpu_static_inst.hh + void + execSt(GPUDynInstPtr gpuDynInst) override + { + typedef typename MemDataType::CType c0; + + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + if (num_src_operands > 1) { + for (int i = 0; i < VSZ; ++i) + if (gpuDynInst->exec_mask[i]) + gpuDynInst->statusVector.push_back(num_src_operands); + else + gpuDynInst->statusVector.push_back(0); + } + + for (int k = 0; k < num_src_operands; ++k) { + c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); + + if (isLocalMem()) { + //store to shared memory + gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr, + *d); + } else { + Request *req = + new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->dataStatic<c0>(d); + + // translation is performed in sendRequest() + // the request will be finished when the store completes + gpuDynInst->useContinuation = false; + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, + i, pkt); + + } + } + ++d; + } + } + + gpuDynInst->updateStats(); + } + + public: + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isVectorRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isVectorRegister(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.isVectorRegister(); + return false; + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isCondRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isCondRegister(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.isCondRegister(); + return false; + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isScalarRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isScalarRegister(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.isScalarRegister(); + return false; + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.opSize(); + if (num_src_operands > 1) + return src_vect[operandIndex].opSize(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.opSize(); + return 0; + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.regIndex(); + if (num_src_operands > 1) + return src_vect[operandIndex].regIndex(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + 
AddrOperandType>::src.regIndex(); + return -1; + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return num_src_operands + 1; + else + return num_src_operands; + } + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename DataType, typename SrcDataType> + GPUStaticInst* + decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + int srcIdx = 0; + int destIdx = 1; + if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC || + ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) { + srcIdx = 1; + destIdx = 0; + } + unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx); + + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new StInst<DataType, SrcDataType, + NoRegAddrOperand>(ib, obj, "st", srcIdx); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new StInst<DataType, SrcDataType, + SRegAddrOperand>(ib, obj, "st", srcIdx); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new StInst<DataType, SrcDataType, + DRegAddrOperand>(ib, obj, "st", srcIdx); + default: + fatal("Bad st register operand type %d\n", tmp.type); + } + } else { + fatal("Bad st register operand kind %d\n", tmp.kind); + } + } + + Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode, + Brig::BrigAtomicOperation brigOp); + + template<typename OperandType, typename AddrOperandType, int NumSrcOperands, + bool HasDst> + class AtomicInstBase : public HsailGPUStaticInst + { + public: + typename OperandType::DestOperand dest; + typename OperandType::SrcOperand src[NumSrcOperands]; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryOrder memoryOrder; + Brig::BrigAtomicOperation atomicOperation; + Brig::BrigMemoryScope memoryScope; + Brig::BrigOpcode opcode; + Enums::MemOpType opType; + + AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + atomicOperation = (BrigAtomicOperation)at->atomicOperation; + opcode = (BrigOpcode)ib->opcode; + opType = brigAtomicToMemOpType(opcode, atomicOperation); + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_ATOMIC; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_ATOMIC; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_ATOMIC; + break; + + default: + panic("Atomic: segment %d not supported\n", segment); + } + + if (HasDst) { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 2); + src[i].init(op_offs, obj); + } + } else { + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 1); + src[i].init(op_offs, obj); + } + } + } + + int numSrcRegOperands() + { + int operands = 0; + for (int i = 0; i < NumSrcOperands; i++) { + if (src[i].isVectorRegister() == true) { + operands++; + } + } + if (addr.isVectorRegister()) + operands++; + return operands; + } + int 
numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() + { + if (addr.isVectorRegister()) + return(NumSrcOperands + 2); + return(NumSrcOperands + 1); + } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isVectorRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isVectorRegister()); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isCondRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isCondRegister()); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isScalarRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isScalarRegister()); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return true; + else if (operandIndex == NumSrcOperands) + return(addr.isVectorRegister()); + else + return false; + } + bool isDstOperand(int operandIndex) + { + if (operandIndex <= NumSrcOperands) + return false; + else + return true; + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return(src[operandIndex].opSize()); + else if (operandIndex == NumSrcOperands) + return(addr.opSize()); + else + return(dest.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return(src[operandIndex].regIndex()); + else if (operandIndex == NumSrcOperands) + return(addr.regIndex()); + else + return(dest.regIndex()); + return -1; + } + }; + + template<typename MemDataType, typename AddrOperandType, int NumSrcOperands, + bool HasDst> + class AtomicInst : + public AtomicInstBase<typename MemDataType::OperandType, + AddrOperandType, NumSrcOperands, HasDst>, + public MemInst + { + public: + void generateDisassembly(); + + AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType, + NumSrcOperands, HasDst> + (ib, obj, _opcode), + MemInst(MemDataType::memType) + { + init_addr(&this->addr); + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + // before doing the RMW, check if this atomic has + // release semantics, and if so issue a release first + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && (gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) { + + gpuDynInst->statusBitVector = VectorMask(1); + + gpuDynInst->execContinuation = &GPUStaticInst::execAtomic; + gpuDynInst->useContinuation = true; + + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::RELEASE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + + return; + } + } + + // if there is no release semantic, execute the 
RMW immediately + execAtomic(gpuDynInst); + + } + + void execute(GPUDynInstPtr gpuDynInst); + + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + private: + // execAtomic may be called through a continuation + // if the RMW had release semantics. see comment for + // execContinuation in gpu_dyn_inst.hh + void + execAtomic(GPUDynInstPtr gpuDynInst) override + { + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + typedef typename MemDataType::CType c0; + + c0 *d = &((c0*) gpuDynInst->d_data)[0]; + c0 *e = &((c0*) gpuDynInst->a_data)[0]; + c0 *f = &((c0*) gpuDynInst->x_data)[0]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i]; + + if (isLocalMem()) { + Wavefront *wavefront = gpuDynInst->wavefront(); + *d = wavefront->ldsChunk->read<c0>(vaddr); + + switch (this->opType) { + case Enums::MO_AADD: + case Enums::MO_ANRADD: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) + (*e)); + break; + case Enums::MO_ASUB: + case Enums::MO_ANRSUB: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) - (*e)); + break; + case Enums::MO_AMAX: + case Enums::MO_ANRMAX: + wavefront->ldsChunk->write<c0>(vaddr, + std::max(wavefront->ldsChunk->read<c0>(vaddr), + (*e))); + break; + case Enums::MO_AMIN: + case Enums::MO_ANRMIN: + wavefront->ldsChunk->write<c0>(vaddr, + std::min(wavefront->ldsChunk->read<c0>(vaddr), + (*e))); + break; + case Enums::MO_AAND: + case Enums::MO_ANRAND: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) & (*e)); + break; + case Enums::MO_AOR: + case Enums::MO_ANROR: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) | (*e)); + break; + case Enums::MO_AXOR: + case Enums::MO_ANRXOR: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) ^ (*e)); + break; + case Enums::MO_AINC: + case Enums::MO_ANRINC: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) + 1); + break; + case Enums::MO_ADEC: + case Enums::MO_ANRDEC: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) - 1); + break; + case Enums::MO_AEXCH: + case Enums::MO_ANREXCH: + wavefront->ldsChunk->write<c0>(vaddr, (*e)); + break; + case Enums::MO_ACAS: + case Enums::MO_ANRCAS: + wavefront->ldsChunk->write<c0>(vaddr, + (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ? 
+ (*f) : wavefront->ldsChunk->read<c0>(vaddr)); + break; + default: + fatal("Unrecognized or invalid HSAIL atomic op " + "type.\n"); + break; + } + } else { + Request *req = + new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i, + gpuDynInst->makeAtomicOpFunctor<c0>(e, + f, this->opType)); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::SwapReq); + pkt->dataStatic(d); + + if (gpuDynInst->computeUnit()->shader-> + separate_acquire_release && + (gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE)) { + // if this atomic has acquire semantics, + // schedule the continuation to perform an + // acquire after the RMW completes + gpuDynInst->execContinuation = + &GPUStaticInst::execAtomicAcq; + + gpuDynInst->useContinuation = true; + } else { + // the request will be finished when the RMW completes + gpuDynInst->useContinuation = false; + } + // translation is performed in sendRequest() + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i, + pkt); + } + } + + ++d; + ++e; + ++f; + } + + gpuDynInst->updateStats(); + } + + // execAtomicACq will always be called through a continuation. + // see comment for execContinuation in gpu_dyn_inst.hh + void + execAtomicAcq(GPUDynInstPtr gpuDynInst) override + { + // after performing the RMW, check to see if this instruction + // has acquire semantics, and if so, issue an acquire + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->statusBitVector = VectorMask(1); + + // the request will be finished when + // the acquire completes + gpuDynInst->useContinuation = false; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::ACQUIRE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + } + } + } + }; + + template<typename DataType, typename AddrOperandType, int NumSrcOperands> + GPUStaticInst* + constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + + if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) { + return decodeLd<DataType>(ib, obj); + } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) { + switch (ib->type) { + case Brig::BRIG_TYPE_B8: + return decodeSt<S8,S8>(ib, obj); + case Brig::BRIG_TYPE_B16: + return decodeSt<S8,S16>(ib, obj); + case Brig::BRIG_TYPE_B32: + return decodeSt<S8,S32>(ib, obj); + case Brig::BRIG_TYPE_B64: + return decodeSt<S8,S64>(ib, obj); + default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type); + } + } else { + if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) + return new AtomicInst<DataType, AddrOperandType, + NumSrcOperands, false>(ib, obj, "atomicnoret"); + else + return new AtomicInst<DataType, AddrOperandType, + NumSrcOperands, true>(ib, obj, "atomic"); + } + } + + template<typename DataType, int NumSrcOperands> + GPUStaticInst* + decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned addrIndex = (Brig::BrigOpcode)ib->opcode == + Brig::BRIG_OPCODE_ATOMICNORET ? 
0 : 1; + + unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex); + + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return constructAtomic<DataType, NoRegAddrOperand, + NumSrcOperands>(ib, obj); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return constructAtomic<DataType, SRegAddrOperand, + NumSrcOperands>(ib, obj); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return constructAtomic<DataType, DRegAddrOperand, + NumSrcOperands>(ib, obj); + default: + fatal("Bad atomic register operand type %d\n", tmp.type); + } + } else { + fatal("Bad atomic register operand kind %d\n", tmp.kind); + } + } + + + template<typename DataType> + GPUStaticInst* + decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + + if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { + return decodeAtomicHelper<DataType, 2>(ib, obj); + } else { + return decodeAtomicHelper<DataType, 1>(ib, obj); + } + } + + template<typename DataType> + GPUStaticInst* + decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { + return decodeAtomicHelper<DataType, 2>(ib, obj); + } else { + return decodeAtomicHelper<DataType, 1>(ib, obj); + } + } +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_MEM_HH__ diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh new file mode 100644 index 000000000..94f0cd6aa --- /dev/null +++ b/src/arch/hsail/insts/mem_impl.hh @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/generic_types.hh" +#include "gpu-compute/hsail_code.hh" + +// defined in code.cc, but not worth sucking in all of code.h for this +// at this point +extern const char *segmentNames[]; + +namespace HsailISA +{ + template<typename DestDataType, typename AddrRegOperandType> + void + LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly() + { + this->disassembly = csprintf("%s_%s %s,%s", this->opcode, + DestDataType::label, + this->dest.disassemble(), + this->addr.disassemble()); + } + + template<typename DestDataType, typename AddrRegOperandType> + void + LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename DestDataType::CType CType M5_VAR_USED; + const VectorMask &mask = w->get_pred(); + uint64_t addr_vec[VSZ]; + this->addr.calcVector(w, addr_vec); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + this->dest.set(w, lane, addr_vec[lane]); + } + } + } + + template<typename MemDataType, typename DestDataType, + typename AddrRegOperandType> + void + LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly() + { + switch (num_dest_operands) { + case 1: + this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest.disassemble(), + this->addr.disassemble()); + break; + case 2: + this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest_vect[0].disassemble(), + this->dest_vect[1].disassemble(), + this->addr.disassemble()); + break; + case 4: + this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s", + this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest_vect[0].disassemble(), + this->dest_vect[1].disassemble(), + this->dest_vect[2].disassemble(), + this->dest_vect[3].disassemble(), + this->addr.disassemble()); + break; + default: + fatal("Bad ld register dest operand, num vector operands: %d \n", + num_dest_operands); + break; + } + } + + static Addr + calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i) + { + // what is the size of the object we are accessing?? + // NOTE: the compiler doesn't generate enough information + // to do this yet..have to just line up all the private + // work-item spaces back to back for now + /* + StorageElement* se = + i->parent->findSymbol(Brig::BrigPrivateSpace, addr); + assert(se); + + return w->wfSlotId * w->privSizePerItem * VSZ + + se->offset * VSZ + + lane * se->size; + */ + + // addressing strategy: interleave the private spaces of + // work-items in a wave-front on 8 byte granularity. + // this won't be perfect coalescing like the spill space + // strategy, but it's better than nothing. The spill space + // strategy won't work with private because the same address + // may be accessed by different sized loads/stores. + + // Note: I'm assuming that the largest load/store to private + // is 8 bytes. 
If it is larger, the stride will have to increase + + Addr addr_div8 = addr / 8; + Addr addr_mod8 = addr % 8; + + Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase; + + assert(ret < w->privBase + (w->privSizePerItem * VSZ)); + + return ret; + } + + template<typename MemDataType, typename DestDataType, + typename AddrRegOperandType> + void + LdInst<MemDataType, DestDataType, + AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename MemDataType::CType MemCType; + const VectorMask &mask = w->get_pred(); + + // Kernarg references are handled uniquely for now (no Memory Request + // is used), so special-case them up front. Someday we should + // make this more realistic, at which we should get rid of this + // block and fold this case into the switch below. + if (this->segment == Brig::BRIG_SEGMENT_KERNARG) { + MemCType val; + + // I assume no vector ld for kernargs + assert(num_dest_operands == 1); + + // assuming for the moment that we'll never do register + // offsets into kernarg space... just to make life simpler + uint64_t address = this->addr.calcUniform(); + + val = *(MemCType*)&w->kernelArgs[address]; + + DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + this->dest.set(w, lane, val); + } + } + + return; + } else if (this->segment == Brig::BRIG_SEGMENT_ARG) { + uint64_t address = this->addr.calcUniform(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + MemCType val = w->readCallArgMem<MemCType>(lane, address); + + DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address, + (unsigned long long)val); + + this->dest.set(w, lane, val); + } + } + + return; + } + + GPUDynInstPtr m = gpuDynInst; + + this->addr.calcVector(w, m->addr); + + m->m_op = Enums::MO_LD; + m->m_type = MemDataType::memType; + m->v_type = DestDataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = this->equivClass; + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + if (num_dest_operands == 1) { + m->dst_reg = this->dest.regIndex(); + m->n_reg = 1; + } else { + m->n_reg = num_dest_operands; + for (int i = 0; i < num_dest_operands; ++i) { + m->dst_reg_vec[i] = this->dest_vect[i].regIndex(); + } + } + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + // this is a complete hack to get around a compiler bug + // (the compiler currently generates global access for private + // addresses (starting from 0). We need to add the private offset) + for (int lane = 0; lane < VSZ; ++lane) { + if (m->addr[lane] < w->privSizePerItem) { + if (mask[lane]) { + // what is the size of the object we are accessing? 
+ // find base for for this wavefront + + // calcPrivAddr will fail if accesses are unaligned + assert(!((sizeof(MemCType) - 1) & m->addr[lane])); + + Addr privAddr = calcPrivAddr(m->addr[lane], w, lane, + this); + + m->addr[lane] = privAddr; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_SPILL: + assert(num_dest_operands == 1); + m->s_type = SEG_SPILL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + // note: this calculation will NOT WORK if the compiler + // ever generates loads/stores to the same address with + // different widths (e.g., a ld_u32 addr and a ld_u16 addr) + if (mask[lane]) { + assert(m->addr[lane] < w->spillSizePerItem); + + m->addr[lane] = m->addr[lane] * w->spillWidth + + lane * sizeof(MemCType) + w->spillBase; + + w->last_addr[lane] = m->addr[lane]; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_rd_lm++; + w->rd_lm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_READONLY: + m->s_type = SEG_READONLY; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] + sizeof(MemCType) <= w->roSize); + m->addr[lane] += w->roBase; + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_PRIVATE: + m->s_type = SEG_PRIVATE; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->privSizePerItem); + + m->addr[lane] = m->addr[lane] + + lane * sizeof(MemCType) + w->privBase; + } + } + } + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + default: + fatal("Load to unsupported segment %d %llxe\n", this->segment, + m->addr[0]); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + template<typename OperationType, typename SrcDataType, + typename AddrRegOperandType> + void + StInst<OperationType, SrcDataType, + AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename OperationType::CType CType; + + const VectorMask &mask = w->get_pred(); + + // arg references are handled uniquely for now (no Memory Request + // is used), so special-case them up front. Someday we should + // make this more realistic, at which we should get rid of this + // block and fold this case into the switch below. 
+ if (this->segment == Brig::BRIG_SEGMENT_ARG) { + uint64_t address = this->addr.calcUniform(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType data = this->src.template get<CType>(w, lane); + DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data); + w->writeCallArgMem<CType>(lane, address, data); + } + } + + return; + } + + GPUDynInstPtr m = gpuDynInst; + + m->exec_mask = w->execMask(); + + this->addr.calcVector(w, m->addr); + + if (num_src_operands == 1) { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + ((CType*)m->d_data)[lane] = + this->src.template get<CType>(w, lane); + } + } + } else { + for (int k= 0; k < num_src_operands; ++k) { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + ((CType*)m->d_data)[k * VSZ + lane] = + this->src_vect[k].template get<CType>(w, lane); + } + } + } + } + + m->m_op = Enums::MO_ST; + m->m_type = OperationType::memType; + m->v_type = OperationType::vgprType; + + m->statusBitVector = 0; + m->equiv = this->equivClass; + + if (num_src_operands == 1) { + m->n_reg = 1; + } else { + m->n_reg = num_src_operands; + } + + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + // this is a complete hack to get around a compiler bug + // (the compiler currently generates global access for private + // addresses (starting from 0). We need to add the private offset) + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + if (m->addr[lane] < w->privSizePerItem) { + + // calcPrivAddr will fail if accesses are unaligned + assert(!((sizeof(CType)-1) & m->addr[lane])); + + Addr privAddr = calcPrivAddr(m->addr[lane], w, lane, + this); + + m->addr[lane] = privAddr; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_SPILL: + assert(num_src_operands == 1); + m->s_type = SEG_SPILL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->spillSizePerItem); + + m->addr[lane] = m->addr[lane] * w->spillWidth + + lane * sizeof(CType) + w->spillBase; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_wr_lm++; + w->wr_lm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_PRIVATE: + m->s_type = SEG_PRIVATE; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->privSizePerItem); + m->addr[lane] = m->addr[lane] + lane * + sizeof(CType)+w->privBase; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + default: + fatal("Store to unsupported segment %d\n", 
this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + template<typename OperationType, typename SrcDataType, + typename AddrRegOperandType> + void + StInst<OperationType, SrcDataType, + AddrRegOperandType>::generateDisassembly() + { + switch (num_src_operands) { + case 1: + this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src.disassemble(), + this->addr.disassemble()); + break; + case 2: + this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->addr.disassemble()); + break; + case 4: + this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s", + this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->src_vect[2].disassemble(), + this->src_vect[3].disassemble(), + this->addr.disassemble()); + break; + default: fatal("Bad ld register src operand, num vector operands: " + "%d \n", num_src_operands); + break; + } + } + + template<typename DataType, typename AddrRegOperandType, int NumSrcOperands, + bool HasDst> + void + AtomicInst<DataType, AddrRegOperandType, NumSrcOperands, + HasDst>::execute(GPUDynInstPtr gpuDynInst) + { + typedef typename DataType::CType CType; + + Wavefront *w = gpuDynInst->wavefront(); + + GPUDynInstPtr m = gpuDynInst; + + this->addr.calcVector(w, m->addr); + + for (int lane = 0; lane < VSZ; ++lane) { + ((CType *)m->a_data)[lane] = + this->src[0].template get<CType>(w, lane); + } + + // load second source operand for CAS + if (NumSrcOperands > 1) { + for (int lane = 0; lane < VSZ; ++lane) { + ((CType*)m->x_data)[lane] = + this->src[1].template get<CType>(w, lane); + } + } + + assert(NumSrcOperands <= 2); + + m->m_op = this->opType; + m->m_type = DataType::memType; + m->v_type = DataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + if (HasDst) { + m->dst_reg = this->dest.regIndex(); + } + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->latency.set(w->computeUnit->shader->ticks(64)); + m->pipeId = GLBMEM_PIPE; + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_wr_lm++; + w->wr_lm_reqs_in_pipe--; + w->outstanding_reqs_rd_lm++; + w->rd_lm_reqs_in_pipe--; + break; + + default: + fatal("Atomic op to unsupported segment %d\n", + this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp); + + template<typename DataType, typename AddrRegOperandType, int NumSrcOperands, + bool HasDst> + void + AtomicInst<DataType, AddrRegOperandType, NumSrcOperands, + 
HasDst>::generateDisassembly() + { + if (HasDst) { + this->disassembly = + csprintf("%s_%s_%s_%s %s,%s", this->opcode, + atomicOpToString(this->atomicOperation), + segmentNames[this->segment], + DataType::label, this->dest.disassemble(), + this->addr.disassemble()); + } else { + this->disassembly = + csprintf("%s_%s_%s_%s %s", this->opcode, + atomicOpToString(this->atomicOperation), + segmentNames[this->segment], + DataType::label, this->addr.disassemble()); + } + + for (int i = 0; i < NumSrcOperands; ++i) { + this->disassembly += ","; + this->disassembly += this->src[i].disassemble(); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc new file mode 100644 index 000000000..9506a80ab --- /dev/null +++ b/src/arch/hsail/insts/pseudo_inst.cc @@ -0,0 +1,787 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Marc Orr + */ + +#include <csignal> + +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/mem.hh" + +namespace HsailISA +{ + // Pseudo (or magic) instructions are overloaded on the hsail call + // instruction, because of its flexible parameter signature. + + // To add a new magic instruction: + // 1. Add an entry to the enum. + // 2. Implement it in the switch statement below (Call::exec). + // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h, + // so its easy to call from an OpenCL kernel. 
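+    // As a rough illustration of steps 1 and 2 above (the names below are
+    // hypothetical and do not exist in this file), a new magic instruction
+    // would add one enum value and one case to the switch in
+    // Call::execPseudoInst:
+    //
+    //     MAGIC_MY_NEW_OP,           // step 1: new entry at the end of the enum
+    //
+    //     case MAGIC_MY_NEW_OP:      // step 2: dispatch to its handler
+    //         MagicMyNewOp(w);
+    //         break;
+    //
+    // Step 3 is a small wrapper in hsa/hsail-gpu-compute/util/magicinst.h
+    // that emits the call with the new opcode as element 0 of src1, which is
+    // where execPseudoInst reads the opcode from.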
+ + // This enum should be identical to the enum in + // hsa/hsail-gpu-compute/util/magicinst.h + enum + { + MAGIC_PRINT_WF_32 = 0, + MAGIC_PRINT_WF_64, + MAGIC_PRINT_LANE, + MAGIC_PRINT_LANE_64, + MAGIC_PRINT_WF_FLOAT, + MAGIC_SIM_BREAK, + MAGIC_PREF_SUM, + MAGIC_REDUCTION, + MAGIC_MASKLANE_LOWER, + MAGIC_MASKLANE_UPPER, + MAGIC_JOIN_WF_BAR, + MAGIC_WAIT_WF_BAR, + MAGIC_PANIC, + MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG, + MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG, + MAGIC_LOAD_GLOBAL_U32_REG, + MAGIC_XACT_CAS_LD, + MAGIC_MOST_SIG_THD, + MAGIC_MOST_SIG_BROADCAST, + MAGIC_PRINT_WFID_32, + MAGIC_PRINT_WFID_64 + }; + + void + Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + const VectorMask &mask = w->get_pred(); + + int op = 0; + bool got_op = false; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val0 = src1.get<int>(w, lane, 0); + if (got_op) { + if (src_val0 != op) { + fatal("Multiple magic instructions per PC not " + "supported\n"); + } + } else { + op = src_val0; + got_op = true; + } + } + } + + switch(op) { + case MAGIC_PRINT_WF_32: + MagicPrintWF32(w); + break; + case MAGIC_PRINT_WF_64: + MagicPrintWF64(w); + break; + case MAGIC_PRINT_LANE: + MagicPrintLane(w); + break; + case MAGIC_PRINT_LANE_64: + MagicPrintLane64(w); + break; + case MAGIC_PRINT_WF_FLOAT: + MagicPrintWFFloat(w); + break; + case MAGIC_SIM_BREAK: + MagicSimBreak(w); + break; + case MAGIC_PREF_SUM: + MagicPrefixSum(w); + break; + case MAGIC_REDUCTION: + MagicReduction(w); + break; + case MAGIC_MASKLANE_LOWER: + MagicMaskLower(w); + break; + case MAGIC_MASKLANE_UPPER: + MagicMaskUpper(w); + break; + case MAGIC_JOIN_WF_BAR: + MagicJoinWFBar(w); + break; + case MAGIC_WAIT_WF_BAR: + MagicWaitWFBar(w); + break; + case MAGIC_PANIC: + MagicPanic(w); + break; + + // atomic instructions + case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG: + MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG: + MagicAtomicNRAddGroupU32Reg(w, gpuDynInst); + break; + + case MAGIC_LOAD_GLOBAL_U32_REG: + MagicLoadGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_XACT_CAS_LD: + MagicXactCasLd(w); + break; + + case MAGIC_MOST_SIG_THD: + MagicMostSigThread(w); + break; + + case MAGIC_MOST_SIG_BROADCAST: + MagicMostSigBroadcast(w); + break; + + case MAGIC_PRINT_WFID_32: + MagicPrintWF32ID(w); + break; + + case MAGIC_PRINT_WFID_64: + MagicPrintWFID64(w); + break; + + default: fatal("unrecognized magic instruction: %d\n", op); + } + } + + void + Call::MagicPrintLane(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintLane64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): 
CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintWF32(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWF32ID(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + src_val3 = src1.get<int>(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWF64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + } + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWFID64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + src_val3 = src1.get<int>(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + } + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWFFloat(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = 
csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + float src_val1 = src1.get<float>(w, lane, 1); + res_str += csprintf("%08f", src_val1); + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + // raises a signal that GDB will catch + // when done with the break, type "signal 0" in gdb to continue + void + Call::MagicSimBreak(Wavefront *w) + { + std::string res_str; + // print out state for this wavefront and then break + res_str = csprintf("Breakpoint encountered for wavefront %i\n", + w->wfSlotId); + + res_str += csprintf(" Kern ID: %i\n", w->kern_id); + res_str += csprintf(" Phase ID: %i\n", w->simdId); + res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id); + res_str += csprintf(" Exec mask: "); + + for (int i = VSZ - 1; i >= 0; --i) { + if (w->execMask(i)) + res_str += "1"; + else + res_str += "0"; + + if ((i & 7) == 7) + res_str += " "; + } + + res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong()); + + res_str += "\nHelpful debugging hints:\n"; + res_str += " Check out w->s_reg / w->d_reg for register state\n"; + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + fflush(stdout); + + raise(SIGTRAP); + } + + void + Call::MagicPrefixSum(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + dest.set<int>(w, lane, res); + res += src_val1; + } + } + } + + void + Call::MagicReduction(Wavefront *w) + { + // reduction magic instruction + // The reduction instruction takes up to 64 inputs (one from + // each thread in a WF) and sums them. It returns the sum to + // each thread in the WF. 
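+        // For instance, if only lanes 0, 1 and 2 are active and their
+        // inputs are 1, 2 and 3, res accumulates to 6 and 6 is written
+        // back to the dest operand of each of those three lanes.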
+ const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + res += src_val1; + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicMaskLower(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + + if (src_val1) { + if (lane < (VSZ/2)) { + res = res | ((uint32_t)(1) << lane); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicMaskUpper(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + + if (src_val1) { + if (lane >= (VSZ/2)) { + res = res | ((uint32_t)(1) << (lane - (VSZ/2))); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicJoinWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]++; + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + } + + if (max_cnt > w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + } + + void + Call::MagicWaitWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]--; + } + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + + if (max_cnt < w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + + w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, + w->instructionBuffer.end()); + if (w->pendingFetch) + w->dropFetch = true; + } + + void + Call::MagicPanic(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + panic("OpenCL Code failed assertion #%d. 
Triggered by lane %s", + src_val1, lane); + } + } + } + + void + Call::calcAddr(Wavefront *w, GPUDynInstPtr m) + { + // the address is in src1 | src2 + for (int lane = 0; lane < VSZ; ++lane) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2); + + m->addr[lane] = addr; + } + + } + + void + Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3); + } + + m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1); + } + + m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + // calculate the address + calcAddr(w, m); + + m->m_op = Enums::MO_LD; + m->m_type = U32::memType; //MemDataType::memType; + m->v_type = U32::vgprType; //DestDataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + // FIXME + //m->dst_reg = this->dest.regIndex(); + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicXactCasLd(Wavefront *w) 
+ { + const VectorMask &mask = w->get_pred(); + int src_val1 = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + src_val1 = src1.get<int>(w, lane, 1); + break; + } + } + + if (!w->computeUnit->xactCasLoadMap.count(src_val1)) { + w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue(); + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear(); + } + + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue + .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId)); + } + + void + Call::MagicMostSigThread(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + unsigned mst = true; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + dest.set<int>(w, lane, mst); + mst = false; + } + } + } + + void + Call::MagicMostSigBroadcast(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + bool got_res = false; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + if (!got_res) { + res = src1.get<int>(w, lane, 1); + got_res = true; + } + dest.set<int>(w, lane, res); + } + } + } + +} // namespace HsailISA diff --git a/src/arch/hsail/operand.cc b/src/arch/hsail/operand.cc new file mode 100644 index 000000000..d0e6c5541 --- /dev/null +++ b/src/arch/hsail/operand.cc @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/operand.hh" + +using namespace Brig; + +bool +BaseRegOperand::init(unsigned opOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar) +{ + regFileChar = _regFileChar; + const BrigOperand *brigOp = obj->getOperand(opOffset); + + if (brigOp->kind != BRIG_KIND_OPERAND_REGISTER) + return false; + + const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)brigOp; + + regIdx = brigRegOp->regNum; + + DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx, + brigRegOp->regKind); + + maxRegIdx = std::max(maxRegIdx, regIdx); + + return true; +} + +void +ListOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *brigOp = (const BrigOperand*)obj->getOperand(opOffset); + + switch (brigOp->kind) { + case BRIG_KIND_OPERAND_CODE_LIST: + { + const BrigOperandCodeList *opList = + (const BrigOperandCodeList*)brigOp; + + const Brig::BrigData *oprnd_data = + obj->getBrigBaseData(opList->elements); + + // Note: for calls Dest list of operands could be size of 0. + elementCount = oprnd_data->byteCount / 4; + + DPRINTF(GPUReg, "Operand Code List: # elements: %d\n", + elementCount); + + for (int i = 0; i < elementCount; ++i) { + unsigned *data_offset = + (unsigned*)obj->getData(opList->elements + 4 * (i + 1)); + + const BrigDirectiveVariable *p = + (const BrigDirectiveVariable*)obj-> + getCodeSectionEntry(*data_offset); + + StorageElement *se = obj->currentCode->storageMap-> + findSymbol(BRIG_SEGMENT_ARG, p); + + assert(se); + callArgs.push_back(se); + } + } + break; + default: + fatal("ListOperand: bad operand kind %d\n", brigOp->kind); + } +} + +std::string +ListOperand::disassemble() +{ + std::string res_str(""); + + for (auto it : callArgs) { + res_str += csprintf("%s ", it->name.c_str()); + } + + return res_str; +} + +void +FunctionRefOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *baseOp = obj->getOperand(opOffset); + + if (baseOp->kind != BRIG_KIND_OPERAND_CODE_REF) { + fatal("FunctionRefOperand: bad operand kind %d\n", baseOp->kind); + } + + const BrigOperandCodeRef *brigOp = (const BrigOperandCodeRef*)baseOp; + + const BrigDirectiveExecutable *p = + (const BrigDirectiveExecutable*)obj->getCodeSectionEntry(brigOp->ref); + + func_name = obj->getString(p->name); +} + +std::string +FunctionRefOperand::disassemble() +{ + DPRINTF(GPUReg, "Operand Func-ref name: %s\n", func_name); + + return csprintf("%s", func_name); +} + +bool +BaseRegOperand::init_from_vect(unsigned opOffset, const BrigObject *obj, + int at, unsigned &maxRegIdx, char _regFileChar) +{ + regFileChar = _regFileChar; + const BrigOperand *brigOp = obj->getOperand(opOffset); + + if (brigOp->kind != BRIG_KIND_OPERAND_OPERAND_LIST) + return false; + + + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)brigOp; + + unsigned *data_offset = + (unsigned*)obj->getData(brigRegVecOp->elements + 4 * (at + 1)); + + const BrigOperand *p = + (const BrigOperand*)obj->getOperand(*data_offset); + if (p->kind != BRIG_KIND_OPERAND_REGISTER) { + return false; + } + + const BrigOperandRegister *brigRegOp =(const BrigOperandRegister*)p; + + regIdx = brigRegOp->regNum; + + DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d \n", regIdx, + brigRegOp->regKind); + + maxRegIdx = std::max(maxRegIdx, regIdx); + + return true; +} + +void +BaseRegOperand::initWithStrOffset(unsigned strOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar) +{ + const char *name = 
obj->getString(strOffset); + char *endptr; + regIdx = strtoul(name + 2, &endptr, 10); + + if (name[0] != '$' || name[1] != _regFileChar) { + fatal("register operand parse error on \"%s\"\n", name); + } + + maxRegIdx = std::max(maxRegIdx, regIdx); +} + +unsigned SRegOperand::maxRegIdx; +unsigned DRegOperand::maxRegIdx; +unsigned CRegOperand::maxRegIdx; + +std::string +SRegOperand::disassemble() +{ + return csprintf("$s%d", regIdx); +} + +std::string +DRegOperand::disassemble() +{ + return csprintf("$d%d", regIdx); +} + +std::string +CRegOperand::disassemble() +{ + return csprintf("$c%d", regIdx); +} + +BrigRegOperandInfo +findRegDataType(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *baseOp = obj->getOperand(opOffset); + + switch (baseOp->kind) { + case BRIG_KIND_OPERAND_REGISTER: + { + const BrigOperandRegister *op = (BrigOperandRegister*)baseOp; + + return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, + (BrigRegisterKind)op->regKind); + } + break; + + case BRIG_KIND_OPERAND_OPERAND_LIST: + { + const BrigOperandOperandList *op = + (BrigOperandOperandList*)baseOp; + const BrigData *data_p = (BrigData*)obj->getData(op->elements); + + + int num_operands = 0; + BrigRegisterKind reg_kind = (BrigRegisterKind)0; + for (int offset = 0; offset < data_p->byteCount; offset += 4) { + const BrigOperand *op_p = (const BrigOperand *) + obj->getOperand(((int *)data_p->bytes)[offset/4]); + + if (op_p->kind == BRIG_KIND_OPERAND_REGISTER) { + const BrigOperandRegister *brigRegOp = + (const BrigOperandRegister*)op_p; + reg_kind = (BrigRegisterKind)brigRegOp->regKind; + } else if (op_p->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) { + uint16_t num_bytes = + ((Brig::BrigOperandConstantBytes*)op_p)->base.byteCount + - sizeof(BrigBase); + if (num_bytes == sizeof(uint32_t)) { + reg_kind = BRIG_REGISTER_KIND_SINGLE; + } else if (num_bytes == sizeof(uint64_t)) { + reg_kind = BRIG_REGISTER_KIND_DOUBLE; + } else { + fatal("OperandList: bad operand size %d\n", num_bytes); + } + } else { + fatal("OperandList: bad operand kind %d\n", op_p->kind); + } + + num_operands++; + } + assert(baseOp->kind == BRIG_KIND_OPERAND_OPERAND_LIST); + + return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, reg_kind); + } + break; + + case BRIG_KIND_OPERAND_ADDRESS: + { + const BrigOperandAddress *op = (BrigOperandAddress*)baseOp; + + if (!op->reg) { + BrigType type = BRIG_TYPE_NONE; + + if (op->symbol) { + const BrigDirective *dir = (BrigDirective*) + obj->getCodeSectionEntry(op->symbol); + + assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE); + + const BrigDirectiveVariable *sym = + (const BrigDirectiveVariable*)dir; + + type = (BrigType)sym->type; + } + return BrigRegOperandInfo(BRIG_KIND_OPERAND_ADDRESS, + (BrigType)type); + } else { + const BrigOperandAddress *b = (const BrigOperandAddress*)baseOp; + const BrigOperand *reg = obj->getOperand(b->reg); + const BrigOperandRegister *rop = (BrigOperandRegister*)reg; + + return BrigRegOperandInfo(BRIG_KIND_OPERAND_REGISTER, + (BrigRegisterKind)rop->regKind); + } + } + break; + + default: + fatal("AddrOperand: bad operand kind %d\n", baseOp->kind); + break; + } +} + +void +AddrOperandBase::parseAddr(const BrigOperandAddress *op, const BrigObject *obj) +{ + assert(op->base.kind == BRIG_KIND_OPERAND_ADDRESS); + + const BrigDirective *d = + (BrigDirective*)obj->getCodeSectionEntry(op->symbol); + + assert(d->kind == BRIG_KIND_DIRECTIVE_VARIABLE); + const BrigDirectiveVariable *sym = (BrigDirectiveVariable*)d; + name = obj->getString(sym->name); + + if (sym->segment != 
BRIG_SEGMENT_ARG) { + storageElement = + obj->currentCode->storageMap->findSymbol(sym->segment, name); + assert(storageElement); + offset = 0; + } else { + // sym->name does not work for BRIG_SEGMENT_ARG for the following case: + // + // void foo(int a); + // void bar(double a); + // + // foo(...) --> arg_u32 %param_p0; + // st_arg_u32 $s0, [%param_p0]; + // call &foo (%param_p0); + // bar(...) --> arg_f64 %param_p0; + // st_arg_u64 $d0, [%param_p0]; + // call &foo (%param_p0); + // + // Both functions use the same variable name (param_p0)!!! + // + // Maybe this is a bug in the compiler (I don't know). + // + // Solution: + // Use directive pointer (BrigDirectiveVariable) to differentiate 2 + // versions of param_p0. + // + // Note this solution is kind of stupid, because we are pulling stuff + // out of the brig binary via the directive pointer and putting it into + // the symbol table, but now we are indexing the symbol table by the + // brig directive pointer! It makes the symbol table sort of pointless. + // But I don't want to mess with the rest of the infrastructure, so + // let's go with this for now. + // + // When we update the compiler again, we should see if this problem goes + // away. If so, we can fold some of this functionality into the code for + // kernel arguments. If not, maybe we can index the symbol name on a + // hash of the variable AND function name + storageElement = obj->currentCode-> + storageMap->findSymbol((Brig::BrigSegment)sym->segment, sym); + + assert(storageElement); + } +} + +uint64_t +AddrOperandBase::calcUniformBase() +{ + // start with offset, will be 0 if not specified + uint64_t address = offset; + + // add in symbol value if specified + if (storageElement) { + address += storageElement->offset; + } + + return address; +} + +std::string +AddrOperandBase::disassemble(std::string reg_disassembly) +{ + std::string disasm; + + if (offset || reg_disassembly != "") { + disasm += "["; + + if (reg_disassembly != "") { + disasm += reg_disassembly; + + if (offset > 0) { + disasm += "+"; + } + } + + if (offset) { + disasm += csprintf("%d", offset); + } + + disasm += "]"; + } else if (name) { + disasm += csprintf("[%s]", name); + } + + return disasm; +} + +void +NoRegAddrOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *baseOp = obj->getOperand(opOffset); + + if (baseOp->kind == BRIG_KIND_OPERAND_ADDRESS) { + BrigOperandAddress *addrOp = (BrigOperandAddress*)baseOp; + parseAddr(addrOp, obj); + offset = (uint64_t(addrOp->offset.hi) << 32) | + uint64_t(addrOp->offset.lo); + } else { + fatal("NoRegAddrOperand: bad operand kind %d\n", baseOp->kind); + } + +} + +std::string +NoRegAddrOperand::disassemble() +{ + return AddrOperandBase::disassemble(std::string("")); +} + +void +LabelOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperandCodeRef *op = + (const BrigOperandCodeRef*)obj->getOperand(opOffset); + + assert(op->base.kind == BRIG_KIND_OPERAND_CODE_REF); + + const BrigDirective *dir = + (const BrigDirective*)obj->getCodeSectionEntry(op->ref); + + assert(dir->kind == BRIG_KIND_DIRECTIVE_LABEL); + label = obj->currentCode->refLabel((BrigDirectiveLabel*)dir, obj); +} + +uint32_t +LabelOperand::getTarget(Wavefront *w, int lane) +{ + return label->get(); +} + +std::string +LabelOperand::disassemble() +{ + return label->name; +} diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh new file mode 100644 index 000000000..e3d275b10 --- /dev/null +++ b/src/arch/hsail/operand.hh @@ -0,0 +1,768 @@ +/* + * 
Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_OPERAND_HH__ +#define __ARCH_HSAIL_OPERAND_HH__ + +/** + * @file operand.hh + * + * Defines classes encapsulating HSAIL instruction operands. 
+ */ + +#include <string> + +#include "arch/hsail/Brig.h" +#include "base/trace.hh" +#include "base/types.hh" +#include "debug/GPUReg.hh" +#include "enums/RegisterType.hh" +#include "gpu-compute/brig_object.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/hsail_code.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +class Label; +class StorageElement; + +class BaseOperand +{ + public: + Enums::RegisterType registerType; + uint32_t regOperandSize; + BaseOperand() { registerType = Enums::RT_NONE; regOperandSize = 0; } + bool isVectorRegister() { return registerType == Enums::RT_VECTOR; } + bool isScalarRegister() { return registerType == Enums::RT_SCALAR; } + bool isCondRegister() { return registerType == Enums::RT_CONDITION; } + unsigned int regIndex() { return 0; } + uint32_t opSize() { return regOperandSize; } + virtual ~BaseOperand() { } +}; + +class BrigRegOperandInfo +{ + public: + Brig::BrigKind16_t kind; + Brig::BrigType type; + Brig::BrigRegisterKind regKind; + + BrigRegOperandInfo(Brig::BrigKind16_t _kind, + Brig::BrigRegisterKind _regKind) + : kind(_kind), regKind(_regKind) + { + } + + BrigRegOperandInfo(Brig::BrigKind16_t _kind, Brig::BrigType _type) + : kind(_kind), type(_type) + { + } + + BrigRegOperandInfo() : kind(Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES), + type(Brig::BRIG_TYPE_NONE) + { + } +}; + +BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj); + +class BaseRegOperand : public BaseOperand +{ + public: + unsigned regIdx; + char regFileChar; + + bool init(unsigned opOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar); + + bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at, + unsigned &maxRegIdx, char _regFileChar); + + void initWithStrOffset(unsigned strOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar); + unsigned int regIndex() { return regIdx; } +}; + +class SRegOperand : public BaseRegOperand +{ + public: + static unsigned maxRegIdx; + + bool + init(unsigned opOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init(opOffset, obj, maxRegIdx, 's'); + } + + bool + init_from_vect(unsigned opOffset, const BrigObject *obj, int at) + { + regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx, + 's'); + } + + void + initWithStrOffset(unsigned strOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, + 's'); + } + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + assert(sizeof(OperandType) <= sizeof(uint32_t)); + assert(regIdx < w->maxSpVgprs); + // if OperandType is smaller than 32-bit, we truncate the value + OperandType ret; + uint32_t vgprIdx; + + switch (sizeof(OperandType)) { + case 1: // 1 byte operand + vgprIdx = w->remap(regIdx, 1, 1); + ret = (w->computeUnit->vrf[w->simdId]-> + read<uint32_t>(vgprIdx, lane)) & 0xff; + break; + case 2: // 2 byte operand + vgprIdx = w->remap(regIdx, 2, 1); + ret = (w->computeUnit->vrf[w->simdId]-> + read<uint32_t>(vgprIdx, lane)) & 0xffff; + break; + case 4: // 4 byte operand + vgprIdx = w->remap(regIdx,sizeof(OperandType), 1); + ret = w->computeUnit->vrf[w->simdId]-> + read<OperandType>(vgprIdx, lane); + break; + default: + panic("Bad 
OperandType\n"); + break; + } + + return (OperandType)ret; + } + + // special get method for compatibility with LabelOperand + uint32_t + getTarget(Wavefront *w, int lane) + { + return get<uint32_t>(w, lane); + } + + template<typename OperandType> + void set(Wavefront *w, int lane, OperandType &val); + std::string disassemble(); +}; + +template<typename OperandType> +void +SRegOperand::set(Wavefront *w, int lane, OperandType &val) +{ + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val); + + assert(sizeof(OperandType) == sizeof(uint32_t)); + assert(regIdx < w->maxSpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1); + w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane); +} + +template<> +inline void +SRegOperand::set(Wavefront *w, int lane, uint64_t &val) +{ + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val); + + assert(regIdx < w->maxSpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(uint32_t), 1); + w->computeUnit->vrf[w->simdId]->write<uint32_t>(vgprIdx, val, lane); +} + +class DRegOperand : public BaseRegOperand +{ + public: + static unsigned maxRegIdx; + + bool + init(unsigned opOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'd'); + } + + bool + init_from_vect(unsigned opOffset, const BrigObject *obj, int at) + { + regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx, + 'd'); + } + + void + initWithStrOffset(unsigned strOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, + 'd'); + } + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + assert(sizeof(OperandType) <= sizeof(uint64_t)); + // TODO: this check is valid only for HSAIL + assert(regIdx < w->maxDpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1); + + return w->computeUnit->vrf[w->simdId]->read<OperandType>(vgprIdx,lane); + } + + template<typename OperandType> + void + set(Wavefront *w, int lane, OperandType &val) + { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $d%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, + val); + + assert(sizeof(OperandType) <= sizeof(uint64_t)); + // TODO: this check is valid only for HSAIL + assert(regIdx < w->maxDpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1); + w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane); + } + + std::string disassemble(); +}; + +class CRegOperand : public BaseRegOperand +{ + public: + static unsigned maxRegIdx; + + bool + init(unsigned opOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint8_t); + registerType = Enums::RT_CONDITION; + + return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'c'); + } + + bool + init_from_vect(unsigned opOffset, const BrigObject *obj, int at) + { + regOperandSize = sizeof(uint8_t); + registerType = Enums::RT_CONDITION; + + return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx, + 'c'); + } + + void + initWithStrOffset(unsigned strOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint8_t); + registerType = Enums::RT_CONDITION; + + return 
BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, + 'c'); + } + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + assert(regIdx < w->condRegState->numRegs()); + + return w->condRegState->read<OperandType>((int)regIdx, lane); + } + + template<typename OperandType> + void + set(Wavefront *w, int lane, OperandType &val) + { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $c%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, + val); + + assert(regIdx < w->condRegState->numRegs()); + w->condRegState->write<OperandType>(regIdx,lane,val); + } + + std::string disassemble(); +}; + +template<typename T> +class ImmOperand : public BaseOperand +{ + public: + T bits; + + bool init(unsigned opOffset, const BrigObject *obj); + bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at); + std::string disassemble(); + + template<typename OperandType> + OperandType + get() + { + assert(sizeof(OperandType) <= sizeof(T)); + + return *(OperandType*)&bits; + } + + // This version of get() takes a WF* and a lane id for + // compatibility with the register-based get() methods. + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + return get<OperandType>(); + } +}; + +template<typename T> +bool +ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj) +{ + const Brig::BrigOperand *brigOp = obj->getOperand(opOffset); + + switch (brigOp->kind) { + // this is immediate operand + case Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES: + { + DPRINTF(GPUReg, "sizeof(T): %lu, byteCount: %d\n", sizeof(T), + brigOp->byteCount); + + auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp; + + bits = *((T*)(obj->getData(cbptr->bytes + 4))); + + return true; + } + break; + + case Brig::BRIG_KIND_OPERAND_WAVESIZE: + bits = VSZ; + return true; + + default: + return false; + } +} + +template <typename T> +bool +ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at) +{ + const Brig::BrigOperand *brigOp = obj->getOperand(opOffset); + + if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + return false; + } + + + const Brig::BrigOperandOperandList *brigVecOp = + (const Brig::BrigOperandOperandList *)brigOp; + + unsigned *data_offset = + (unsigned *)obj->getData(brigVecOp->elements + 4 * (at + 1)); + + const Brig::BrigOperand *p = + (const Brig::BrigOperand *)obj->getOperand(*data_offset); + + if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { + return false; + } + + return init(*data_offset, obj); +} +template<typename T> +std::string +ImmOperand<T>::disassemble() +{ + return csprintf("0x%08x", bits); +} + +template<typename RegOperand, typename T> +class RegOrImmOperand : public BaseOperand +{ + private: + bool is_imm; + + public: + void setImm(const bool value) { is_imm = value; } + + ImmOperand<T> imm_op; + RegOperand reg_op; + + RegOrImmOperand() { is_imm = false; } + void init(unsigned opOffset, const BrigObject *obj); + void init_from_vect(unsigned opOffset, const BrigObject *obj, int at); + std::string disassemble(); + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + return is_imm ? 
imm_op.template get<OperandType>() : + reg_op.template get<OperandType>(w, lane); + } + + uint32_t + opSize() + { + if (!is_imm) { + return reg_op.opSize(); + } + + return 0; + } + + bool + isVectorRegister() + { + if (!is_imm) { + return reg_op.registerType == Enums::RT_VECTOR; + } + return false; + } + + bool + isCondRegister() + { + if (!is_imm) { + return reg_op.registerType == Enums::RT_CONDITION; + } + + return false; + } + + bool + isScalarRegister() + { + if (!is_imm) { + return reg_op.registerType == Enums::RT_SCALAR; + } + + return false; + } + + unsigned int + regIndex() + { + if (!is_imm) { + return reg_op.regIndex(); + } + return 0; + } +}; + +template<typename RegOperand, typename T> +void +RegOrImmOperand<RegOperand, T>::init(unsigned opOffset, const BrigObject *obj) +{ + is_imm = false; + + if (reg_op.init(opOffset, obj)) { + return; + } + + if (imm_op.init(opOffset, obj)) { + is_imm = true; + return; + } + + fatal("RegOrImmOperand::init(): bad operand kind %d\n", + obj->getOperand(opOffset)->kind); +} + +template<typename RegOperand, typename T> +void +RegOrImmOperand<RegOperand, T>::init_from_vect(unsigned opOffset, + const BrigObject *obj, int at) +{ + if (reg_op.init_from_vect(opOffset, obj, at)) { + is_imm = false; + + return; + } + + if (imm_op.init_from_vect(opOffset, obj, at)) { + is_imm = true; + + return; + } + + fatal("RegOrImmOperand::init(): bad operand kind %d\n", + obj->getOperand(opOffset)->kind); +} + +template<typename RegOperand, typename T> +std::string +RegOrImmOperand<RegOperand, T>::disassemble() +{ + return is_imm ? imm_op.disassemble() : reg_op.disassemble(); +} + +typedef RegOrImmOperand<SRegOperand, uint32_t> SRegOrImmOperand; +typedef RegOrImmOperand<DRegOperand, uint64_t> DRegOrImmOperand; +typedef RegOrImmOperand<CRegOperand, bool> CRegOrImmOperand; + +class AddrOperandBase : public BaseOperand +{ + protected: + // helper function for init() + void parseAddr(const Brig::BrigOperandAddress *op, const BrigObject *obj); + + // helper function for disassemble() + std::string disassemble(std::string reg_disassembly); + uint64_t calcUniformBase(); + + public: + virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0; + virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0; + + uint64_t offset; + const char *name = nullptr; + StorageElement *storageElement; +}; + +template<typename RegOperandType> +class RegAddrOperand : public AddrOperandBase +{ + public: + RegOperandType reg; + void init(unsigned opOffset, const BrigObject *obj); + uint64_t calcUniform(); + void calcVector(Wavefront *w, uint64_t *addrVec); + uint64_t calcLane(Wavefront *w, int lane=0); + uint32_t opSize() { return reg.opSize(); } + bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; } + bool isCondRegister() { return reg.registerType == Enums::RT_CONDITION; } + bool isScalarRegister() { return reg.registerType == Enums::RT_SCALAR; } + unsigned int regIndex() { return reg.regIndex(); } + std::string disassemble(); +}; + +template<typename RegOperandType> +void +RegAddrOperand<RegOperandType>::init(unsigned opOffset, const BrigObject *obj) +{ + using namespace Brig; + + const BrigOperand *baseOp = obj->getOperand(opOffset); + + switch (baseOp->kind) { + case BRIG_KIND_OPERAND_ADDRESS: + { + const BrigOperandAddress *op = (BrigOperandAddress*)baseOp; + storageElement = nullptr; + + offset = (uint64_t(op->offset.hi) << 32) | uint64_t(op->offset.lo); + reg.init(op->reg, obj); + + if (reg.regFileChar == 's') { + reg.regOperandSize = sizeof(uint32_t); + 
registerType = Enums::RT_VECTOR; + } + else if (reg.regFileChar == 'd') { + reg.regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + } + } + break; + + default: + fatal("RegAddrOperand: bad operand kind %d\n", baseOp->kind); + break; + } +} + +template<typename RegOperandType> +uint64_t +RegAddrOperand<RegOperandType>::calcUniform() +{ + fatal("can't do calcUniform() on register-based address\n"); + + return 0; +} + +template<typename RegOperandType> +void +RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec) +{ + Addr address = calcUniformBase(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (w->execMask(lane)) { + if (reg.regFileChar == 's') { + addrVec[lane] = address + reg.template get<uint32_t>(w, lane); + } else { + addrVec[lane] = address + reg.template get<Addr>(w, lane); + } + } + } +} + +template<typename RegOperandType> +uint64_t +RegAddrOperand<RegOperandType>::calcLane(Wavefront *w, int lane) +{ + Addr address = calcUniformBase(); + + return address + reg.template get<Addr>(w, lane); +} + +template<typename RegOperandType> +std::string +RegAddrOperand<RegOperandType>::disassemble() +{ + return AddrOperandBase::disassemble(reg.disassemble()); +} + +typedef RegAddrOperand<SRegOperand> SRegAddrOperand; +typedef RegAddrOperand<DRegOperand> DRegAddrOperand; + +class NoRegAddrOperand : public AddrOperandBase +{ + public: + void init(unsigned opOffset, const BrigObject *obj); + uint64_t calcUniform(); + void calcVector(Wavefront *w, uint64_t *addrVec); + uint64_t calcLane(Wavefront *w, int lane=0); + std::string disassemble(); +}; + +inline uint64_t +NoRegAddrOperand::calcUniform() +{ + return AddrOperandBase::calcUniformBase(); +} + +inline uint64_t +NoRegAddrOperand::calcLane(Wavefront *w, int lane) +{ + return calcUniform(); +} + +inline void +NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec) +{ + uint64_t address = calcUniformBase(); + + for (int lane = 0; lane < VSZ; ++lane) + addrVec[lane] = address; +} + +class LabelOperand : public BaseOperand +{ + public: + Label *label; + + void init(unsigned opOffset, const BrigObject *obj); + std::string disassemble(); + + // special get method for compatibility with SRegOperand + uint32_t getTarget(Wavefront *w, int lane); + +}; + +class ListOperand : public BaseOperand +{ + public: + int elementCount; + std::vector<StorageElement*> callArgs; + + int + getSrcOperand(int idx) + { + DPRINTF(GPUReg, "getSrcOperand, idx: %d, sz_args: %d\n", idx, + callArgs.size()); + + return callArgs.at(idx)->offset; + } + + void init(unsigned opOffset, const BrigObject *obj); + + std::string disassemble(); + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane, int arg_idx) + { + return w->readCallArgMem<OperandType>(lane, getSrcOperand(arg_idx)); + } + + template<typename OperandType> + void + set(Wavefront *w, int lane, OperandType val) + { + w->writeCallArgMem<OperandType>(lane, getSrcOperand(0), val); + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: arg[%d] <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, + getSrcOperand(0), val); + } +}; + +class FunctionRefOperand : public BaseOperand +{ + public: + const char *func_name; + + void init(unsigned opOffset, const BrigObject *obj); + std::string disassemble(); +}; + +#endif // __ARCH_HSAIL_OPERAND_HH__ diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py new file mode 100644 index 000000000..bd95f6335 --- /dev/null +++ b/src/gpu-compute/GPU.py @@ -0,0 +1,310 @@ +# +# Copyright (c) 2015 Advanced Micro 
Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Steve Reinhardt +# + +from ClockedObject import ClockedObject +from Device import DmaDevice +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject +from MemObject import MemObject +from Process import EmulatedDriver +from Bridge import Bridge +from LdsState import LdsState + +class PrefetchType(Enum): vals = [ + 'PF_CU', + 'PF_PHASE', + 'PF_WF', + 'PF_STRIDE', + 'PF_END', + ] + +class VectorRegisterFile(SimObject): + type = 'VectorRegisterFile' + cxx_class = 'VectorRegisterFile' + cxx_header = 'gpu-compute/vector_register_file.hh' + + simd_id = Param.Int(0, 'SIMD ID associated with this VRF') + num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') + min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') + +class Wavefront(SimObject): + type = 'Wavefront' + cxx_class = 'Wavefront' + cxx_header = 'gpu-compute/wavefront.hh' + + simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') + wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') + +class ComputeUnit(MemObject): + type = 'ComputeUnit' + cxx_class = 'ComputeUnit' + cxx_header = 'gpu-compute/compute_unit.hh' + + wavefronts = VectorParam.Wavefront('Number of wavefronts') + wfSize = Param.Int(64, 'Wavefront size (in work items)') + num_SIMDs = Param.Int(4, 'number of SIMD units per CU') + + spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\ + 'latency') + + dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\ + 'latency') + + issue_period = Param.Int(4, 'number of cycles per issue period') + num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU') + num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. 
"\ + "Represents the pipeline to reach the TCP and "\ + "specified in GPU clock cycles") + mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\ + "cu. Represents the pipeline between the TCP "\ + "and cu as well as TCP data array access. "\ + "Specified in GPU clock cycles") + system = Param.System(Parent.any, "system object") + cu_id = Param.Int('CU id') + vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\ + "in bytes") + coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\ + "in bytes") + + memory_port = VectorMasterPort("Port to the memory system") + translation_port = VectorMasterPort('Port to the TLB hierarchy') + sqc_port = MasterPort("Port to the SQC (I-cache") + sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)") + perLaneTLB = Param.Bool(False, "enable per-lane TLB") + prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\ + "(0 turns off prefetching)") + prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)") + prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\ + "from last mem req in lane of "\ + "CU|Phase|Wavefront") + execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy"); + xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr."); + debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") + functionalTLB = Param.Bool(False, "Assume TLB causes no delay") + + localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\ + "kernel end") + + countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\ + "and how many times") + global_mem_queue_size = Param.Int(256, "Number of entries in the global " + "memory pipeline's queues") + local_mem_queue_size = Param.Int(256, "Number of entries in the local " + "memory pipeline's queues") + ldsBus = Bridge() # the bridge between the CU and its LDS + ldsPort = MasterPort("The port that goes to the LDS") + localDataStore = Param.LdsState("the LDS for this CU") + + vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ + "file") + +class Shader(ClockedObject): + type = 'Shader' + cxx_class = 'Shader' + cxx_header = 'gpu-compute/shader.hh' + + CUs = VectorParam.ComputeUnit('Number of compute units') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into + ruby at kernel boundaries""") + separate_acquire_release = Param.Bool(False, + """Do ld_acquire/st_release generate separate requests for the + acquire and release?""") + globalmem = Param.MemorySize('64kB', 'Memory size') + timing = Param.Bool(False, 'timing memory accesses') + + cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") + translation = Param.Bool(False, "address translation"); + +class ClDriver(EmulatedDriver): + type = 'ClDriver' + cxx_header = 'gpu-compute/cl_driver.hh' + codefile = VectorParam.String('code file name(s)') + +class GpuDispatcher(DmaDevice): + type = 'GpuDispatcher' + cxx_header = 'gpu-compute/dispatcher.hh' + # put at 8GB line for now + pio_addr = Param.Addr(0x200000000, "Device Address") + pio_latency = Param.Latency('1ns', "Programmed IO latency") + shader_pointer = Param.Shader('pointer to shader') + translation_port = MasterPort('Port to the dispatcher TLB') + cpu = Param.BaseCPU("CPU to wake up on kernel completion") + + cl_driver = Param.ClDriver('pointer to driver') + +class OpType(Enum): vals = [ + 'OT_NULL', + 'OT_ALU', + 
'OT_SPECIAL', + 'OT_GLOBAL_READ', + 'OT_GLOBAL_WRITE', + 'OT_GLOBAL_ATOMIC', + 'OT_GLOBAL_HIST', + 'OT_GLOBAL_LDAS', + 'OT_SHARED_READ', + 'OT_SHARED_WRITE', + 'OT_SHARED_ATOMIC', + 'OT_SHARED_HIST', + 'OT_SHARED_LDAS', + 'OT_PRIVATE_READ', + 'OT_PRIVATE_WRITE', + 'OT_PRIVATE_ATOMIC', + 'OT_PRIVATE_HIST', + 'OT_PRIVATE_LDAS', + 'OT_SPILL_READ', + 'OT_SPILL_WRITE', + 'OT_SPILL_ATOMIC', + 'OT_SPILL_HIST', + 'OT_SPILL_LDAS', + 'OT_READONLY_READ', + 'OT_READONLY_WRITE', + 'OT_READONLY_ATOMIC', + 'OT_READONLY_HIST', + 'OT_READONLY_LDAS', + 'OT_FLAT_READ', + 'OT_FLAT_WRITE', + 'OT_FLAT_ATOMIC', + 'OT_FLAT_HIST', + 'OT_FLAT_LDAS', + 'OT_KERN_READ', + 'OT_BRANCH', + + # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version + # of the compiler. + 'OT_SHARED_MEMFENCE', + 'OT_GLOBAL_MEMFENCE', + 'OT_BOTH_MEMFENCE', + + 'OT_BARRIER', + 'OT_PRINT', + 'OT_RET', + 'OT_NOP', + 'OT_ARG' + ] + +class MemType(Enum): vals = [ + 'M_U8', + 'M_U16', + 'M_U32', + 'M_U64', + 'M_S8', + 'M_S16', + 'M_S32', + 'M_S64', + 'M_F16', + 'M_F32', + 'M_F64', + ] + +class MemOpType(Enum): vals = [ + 'MO_LD', + 'MO_ST', + 'MO_LDAS', + 'MO_LDA', + 'MO_AAND', + 'MO_AOR', + 'MO_AXOR', + 'MO_ACAS', + 'MO_AEXCH', + 'MO_AADD', + 'MO_ASUB', + 'MO_AINC', + 'MO_ADEC', + 'MO_AMAX', + 'MO_AMIN', + 'MO_ANRAND', + 'MO_ANROR', + 'MO_ANRXOR', + 'MO_ANRCAS', + 'MO_ANREXCH', + 'MO_ANRADD', + 'MO_ANRSUB', + 'MO_ANRINC', + 'MO_ANRDEC', + 'MO_ANRMAX', + 'MO_ANRMIN', + 'MO_HAND', + 'MO_HOR', + 'MO_HXOR', + 'MO_HCAS', + 'MO_HEXCH', + 'MO_HADD', + 'MO_HSUB', + 'MO_HINC', + 'MO_HDEC', + 'MO_HMAX', + 'MO_HMIN', + 'MO_UNDEF' + ] + +class StorageClassType(Enum): vals = [ + 'SC_SPILL', + 'SC_GLOBAL', + 'SC_SHARED', + 'SC_PRIVATE', + 'SC_READONLY', + 'SC_KERNARG', + 'SC_NONE', + ] + +class RegisterType(Enum): vals = [ + 'RT_VECTOR', + 'RT_SCALAR', + 'RT_CONDITION', + 'RT_HARDWARE', + 'RT_NONE', + ] + +class GenericMemoryOrder(Enum): vals = [ + 'MEMORY_ORDER_NONE', + 'MEMORY_ORDER_RELAXED', + 'MEMORY_ORDER_SC_ACQUIRE', + 'MEMORY_ORDER_SC_RELEASE', + 'MEMORY_ORDER_SC_ACQUIRE_RELEASE', + ] + +class GenericMemoryScope(Enum): vals = [ + 'MEMORY_SCOPE_NONE', + 'MEMORY_SCOPE_WORKITEM', + 'MEMORY_SCOPE_WAVEFRONT', + 'MEMORY_SCOPE_WORKGROUP', + 'MEMORY_SCOPE_DEVICE', + 'MEMORY_SCOPE_SYSTEM', + ] diff --git a/src/gpu-compute/LdsState.py b/src/gpu-compute/LdsState.py new file mode 100644 index 000000000..6ea9f6427 --- /dev/null +++ b/src/gpu-compute/LdsState.py @@ -0,0 +1,51 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Joe Gross +# + +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * + +from MemObject import MemObject + +class LdsState(MemObject): + type = 'LdsState' + cxx_class = 'LdsState' + cxx_header = 'gpu-compute/lds_state.hh' + size = Param.Int(65536, 'the size of the LDS') + range = Param.AddrRange('64kB', "address space of the LDS") + bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '\ + 'accessing data') + banks = Param.Int(32, 'Number of LDS banks') + cuPort = SlavePort("port that goes to the compute unit") diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript new file mode 100644 index 000000000..2de96df24 --- /dev/null +++ b/src/gpu-compute/SConscript @@ -0,0 +1,99 @@ +# -*- mode:python -*- + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Anthony Gutierrez +# + +Import('*') + +if not env['BUILD_GPU']: + Return() + +SimObject('GPU.py') +SimObject('LdsState.py') +SimObject('X86GPUTLB.py') + +if env['TARGET_GPU_ISA'] == 'hsail': + Source('brig_object.cc') + Source('hsail_code.cc') + +Source('cl_driver.cc') +Source('compute_unit.cc') +Source('condition_register_state.cc') +Source('dispatcher.cc') +Source('exec_stage.cc') +Source('fetch_stage.cc') +Source('fetch_unit.cc') +Source('global_memory_pipeline.cc') +Source('gpu_dyn_inst.cc') +Source('gpu_exec_context.cc') +Source('gpu_static_inst.cc') +Source('gpu_tlb.cc') +Source('hsa_object.cc') +Source('kernel_cfg.cc') +Source('lds_state.cc') +Source('local_memory_pipeline.cc') +Source('of_scheduling_policy.cc') +Source('pool_manager.cc') +Source('rr_scheduling_policy.cc') +Source('schedule_stage.cc') +Source('scheduler.cc') +Source('scoreboard_check_stage.cc') +Source('shader.cc') +Source('simple_pool_manager.cc') +Source('tlb_coalescer.cc') +Source('vector_register_file.cc') +Source('vector_register_state.cc') +Source('wavefront.cc') + +DebugFlag('BRIG') +DebugFlag('GPUCoalescer') +DebugFlag('GPUDisp') +DebugFlag('GPUExec') +DebugFlag('GPUFetch') +DebugFlag('GPUHsailCFInfo') +DebugFlag('GPUMem') +DebugFlag('GPUPort') +DebugFlag('GPUPrefetch') +DebugFlag('GPUReg') +DebugFlag('GPUSync') +DebugFlag('GPUTLB') +DebugFlag('HSALoader') +DebugFlag('HSAIL') +DebugFlag('HSAILObject') +DebugFlag('Predictor') +DebugFlag('WavefrontStack') + +CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch', + 'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL']) diff --git a/src/gpu-compute/X86GPUTLB.py b/src/gpu-compute/X86GPUTLB.py new file mode 100644 index 000000000..51f8e514e --- /dev/null +++ b/src/gpu-compute/X86GPUTLB.py @@ -0,0 +1,77 @@ +# +# Copyright (c) 2011-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Lisa Hsu +# + +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * + +from m5.objects.MemObject import MemObject + +if buildEnv['FULL_SYSTEM']: + class X86PagetableWalker(MemObject): + type = 'X86PagetableWalker' + cxx_class = 'X86ISA::Walker' + port = SlavePort("Port for the hardware table walker") + system = Param.System(Parent.any, "system object") + +class X86GPUTLB(MemObject): + type = 'X86GPUTLB' + cxx_class = 'X86ISA::GpuTLB' + cxx_header = 'gpu-compute/gpu_tlb.hh' + size = Param.Int(64, "TLB size (number of entries)") + assoc = Param.Int(64, "TLB associativity") + + if buildEnv['FULL_SYSTEM']: + walker = Param.X86PagetableWalker(X86PagetableWalker(), + "page table walker") + + hitLatency = Param.Int(2, "Latency of a TLB hit") + missLatency1 = Param.Int(5, "Latency #1 of a TLB miss") + missLatency2 = Param.Int(100, "Latency #2 of a TLB miss") + maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests") + slave = VectorSlavePort("Port on side closer to CPU/CU") + master = VectorMasterPort("Port on side closer to memory") + allocationPolicy = Param.Bool(True, "Allocate on an access") + accessDistance = Param.Bool(False, "print accessDistance stats") + +class TLBCoalescer(MemObject): + type = 'TLBCoalescer' + cxx_class = 'TLBCoalescer' + cxx_header = 'gpu-compute/tlb_coalescer.hh' + probesPerCycle = Param.Int(2, "Number of TLB probes per cycle") + coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks") + slave = VectorSlavePort("Port on side closer to CPU/CU") + master = VectorMasterPort("Port on side closer to memory") + disableCoalescing = Param.Bool(False,"Dispable Coalescing") diff --git a/src/gpu-compute/brig_object.cc b/src/gpu-compute/brig_object.cc new file mode 100644 index 000000000..7cc9b7cc4 --- /dev/null +++ b/src/gpu-compute/brig_object.cc @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#include "gpu-compute/brig_object.hh" + +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <unistd.h> + +#include <cassert> +#include <cstddef> +#include <cstdlib> + +#include "arch/hsail/Brig.h" +#include "base/misc.hh" +#include "base/trace.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "debug/HSALoader.hh" + +using namespace Brig; + +std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>> + HsaObject::tryFileFuncs = { BrigObject::tryFile }; + +extern int getBrigDataTypeBytes(BrigType16_t t); + +const char *BrigObject::sectionNames[] = +{ + "hsa_data", + "hsa_code", + "hsa_operand", + ".shstrtab" +}; + +const char *segmentNames[] = +{ + "none", + "flat", + "global", + "readonly", + "kernarg", + "group", + "private", + "spill", + "args" +}; + +const uint8_t* +BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const +{ + // allow offs == size for dummy end pointers + assert(offs <= sectionInfo[sec].size); + + return sectionInfo[sec].ptr + offs; +} + +const char* +BrigObject::getString(int offs) const +{ + return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4); +} + +const BrigBase* +BrigObject::getCodeSectionEntry(int offs) const +{ + return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs); +} + +const BrigData* +BrigObject::getBrigBaseData(int offs) const +{ + return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs)); +} + +const uint8_t* +BrigObject::getData(int offs) const +{ + return getSectionOffset(DataSectionIndex, offs); +} + +const BrigOperand* +BrigObject::getOperand(int offs) const +{ + return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs); +} + +unsigned +BrigObject::getOperandPtr(int offs, int index) const +{ + unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1))); + + return *op_offs; +} + +const BrigInstBase* +BrigObject::getInst(int offs) const +{ + return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs); +} + +HsaCode* +BrigObject::getKernel(const std::string &name) const +{ + return nullptr; +} + +HsaCode* +BrigObject::getFunction(const std::string &name) const +{ + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + return functions[i]; + } + } + + return nullptr; +} + +void +BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr, + StorageMap *storageMap) +{ + while (dirPtr < endPtr) { + if (!dirPtr->byteCount) { + fatal("Bad directive size 0\n"); + } + + // calculate next pointer now so we can override it if needed + const BrigBase *nextDirPtr = brigNext(dirPtr); + + DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n", + dirPtr->kind, dirPtr->byteCount); + + switch (dirPtr->kind) { + case BRIG_KIND_DIRECTIVE_FUNCTION: + { + const BrigDirectiveExecutable *p M5_VAR_USED = + reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: " + "%d next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + if (p->firstCodeBlockEntry != p->nextModuleEntry) { + panic("Function calls are not fully supported yet!!: %s\n", + getString(p->name)); + + const char *name = getString(p->name); + + HsailCode *code_obj = nullptr; + + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + code_obj = functions[i]; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local 
symbols + code_obj = new HsailCode(name, p, this, + new StorageMap(storageMap)); + functions.push_back(code_obj); + } else { + panic("Multiple definition of Function!!: %s\n", + getString(p->name)); + } + + } + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_KERNEL: + { + const BrigDirectiveExecutable *p = + reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d count: " + "next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + const char *name = getString(p->name); + + if (name[0] == '&') + name++; + + std::string str = name; + char *temp; + int len = str.length(); + + if (str[len - 1] >= 'a' && str[len - 1] <= 'z') { + temp = new char[str.size() + 1]; + std::copy(str.begin(), str.end() , temp); + temp[str.size()] = '\0'; + } else { + temp = new char[str.size()]; + std::copy(str.begin(), str.end() - 1 , temp); + temp[str.size() - 1 ] = '\0'; + } + + std::string kernel_name = temp; + delete[] temp; + + HsailCode *code_obj = nullptr; + + for (const auto &kernel : kernels) { + if (kernel->name() == kernel_name) { + code_obj = kernel; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local symbols + code_obj = new HsailCode(kernel_name, p, this, + new StorageMap(storageMap)); + + kernels.push_back(code_obj); + } + + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *p = + reinterpret_cast<const BrigDirectiveVariable*>(dirPtr); + + uint64_t readonlySize_old = + storageMap->getSize(BRIG_SEGMENT_READONLY); + + StorageElement* se = storageMap->addSymbol(p, this); + + DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n", + getString(p->name)); + + if (p->segment == BRIG_SEGMENT_READONLY) { + // readonly memory has initialization data + uint8_t* readonlyData_old = readonlyData; + + readonlyData = + new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)]; + + if (p->init) { + if ((p->type == BRIG_TYPE_ROIMG) || + (p->type == BRIG_TYPE_WOIMG) || + (p->type == BRIG_TYPE_SAMP) || + (p->type == BRIG_TYPE_SIG32) || + (p->type == BRIG_TYPE_SIG64)) { + panic("Read only data type not supported: %s\n", + getString(p->name)); + } + + const BrigOperand *brigOp = getOperand(p->init); + assert(brigOp->kind == + BRIG_KIND_OPERAND_CONSTANT_BYTES); + + const Brig::BrigData *operand_data M5_VAR_USED = + getBrigBaseData(((BrigOperandConstantBytes*) + brigOp)->bytes); + + assert((operand_data->byteCount / 4) > 0); + + uint8_t *symbol_data = + (uint8_t*)getData(((BrigOperandConstantBytes*) + brigOp)->bytes + 4); + + // copy the old data and add the new data + if (readonlySize_old > 0) { + memcpy(readonlyData, readonlyData_old, + readonlySize_old); + } + + memcpy(readonlyData + se->offset, symbol_data, + se->size); + + delete[] readonlyData_old; + } + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel M5_VAR_USED *p = + reinterpret_cast<const BrigDirectiveLabel*>(dirPtr); + + panic("Label directives cannot be at the module level: %s\n", + getString(p->name)); + + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + const BrigDirectiveComment M5_VAR_USED *p = + reinterpret_cast<const BrigDirectiveComment*>(dirPtr); + + DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_LOC: + { + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n"); + } + break; + + case 
BRIG_KIND_DIRECTIVE_MODULE: + { + const BrigDirectiveModule M5_VAR_USED *p = + reinterpret_cast<const BrigDirectiveModule*>(dirPtr); + + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_CONTROL: + { + DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_EXTENSION: + { + DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n"); + } + break; + default: + if (dirPtr->kind >= BRIG_KIND_INST_BEGIN && + dirPtr->kind <= BRIG_KIND_INST_END) + break; + + if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + dirPtr->kind <= BRIG_KIND_OPERAND_END) + break; + + warn("Unknown Brig directive kind: %d\n", dirPtr->kind); + break; + } + + dirPtr = nextDirPtr; + } +} + +HsaObject* +BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData) +{ + const char *brig_ident = "HSA BRIG"; + + if (memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH)) + return nullptr; + + return new BrigObject(fname, len, fileData); +} + +BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData) + : HsaObject(fname), storageMap(new StorageMap()) +{ + const char *brig_ident = "HSA BRIG"; + BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData; + + fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH), + "%s is not a BRIG file\n", fname); + + if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR || + mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) { + fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n", + fname, mod_hdr->brigMajor, mod_hdr->brigMinor, + BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR); + } + + fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section " + "count (%d) != expected value (%d)\n", fname, + mod_hdr->sectionCount, NumSectionIndices); + + for (int i = 0; i < NumSectionIndices; ++i) { + sectionInfo[i].ptr = nullptr; + } + + uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex); + for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) { + uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx]; + BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr; + + // It doesn't look like cprintf supports string precision values, + // but if this breaks, the right answer is to fix that + DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength, + sec_hdr->name); + + sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount]; + memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount); + sectionInfo[sec_idx].size = sec_hdr->byteCount; + } + + BrigSectionHeader *code_hdr = + (BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr; + + DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, " + "name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount, + code_hdr->nameLength); + + // start at offset 4 to skip initial null entry (see Brig spec) + processDirectives(getCodeSectionEntry(code_hdr->headerByteCount), + getCodeSectionEntry(sectionInfo[CodeSectionIndex].size), + storageMap); + + delete[] fileData; + + DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname); +} + +BrigObject::~BrigObject() +{ + for (int i = 0; i < NumSectionIndices; ++i) + if (sectionInfo[i].ptr) + 
delete[] sectionInfo[i].ptr; +} diff --git a/src/gpu-compute/brig_object.hh b/src/gpu-compute/brig_object.hh new file mode 100644 index 000000000..59a585914 --- /dev/null +++ b/src/gpu-compute/brig_object.hh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#ifndef __BRIG_OBJECT_HH__ +#define __BRIG_OBJECT_HH__ + +#include <cassert> +#include <cstdint> +#include <string> +#include <vector> + +#include "arch/hsail/Brig.h" +#include "gpu-compute/hsa_object.hh" +#include "gpu-compute/hsail_code.hh" + +class LabelMap; +class StorageMap; + +/* @class BrigObject + * this class implements the BRIG loader object, and + * is used when the simulator directly executes HSAIL. + * this class is responsible for extracting all + * information about kernels contained in BRIG format + * and converts them to HsailCode objects that are + * usable by the simulator and emulated runtime. 
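+ *
+ * The loader copies the data, code and operand sections of the BRIG
+ * module into sectionInfo[]; strings, directives, instructions and
+ * operands are then resolved as byte offsets into those copies via
+ * getSectionOffset().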
+ */ + +class BrigObject final : public HsaObject +{ + public: + enum SectionIndex + { + DataSectionIndex, + CodeSectionIndex, + OperandsSectionIndex, + NumSectionIndices + }; + + static const char *sectionNames[]; + + struct SectionInfo + { + uint8_t *ptr; + int size; + }; + + static HsaObject* tryFile(const std::string &fname, int len, + uint8_t *fileData); + + SectionInfo sectionInfo[NumSectionIndices]; + const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const; + + std::vector<HsailCode*> kernels; + std::vector<HsailCode*> functions; + std::string kern_block_name; + + void processDirectives(const Brig::BrigBase *dirPtr, + const Brig::BrigBase *endPtr, + StorageMap *storageMap); + + BrigObject(const std::string &fname, int len, uint8_t *fileData); + ~BrigObject(); + + // eventually these will need to be per-kernel not per-object-file + StorageMap *storageMap; + LabelMap *labelMap; + + const char* getString(int offs) const; + const Brig::BrigData* getBrigBaseData(int offs) const; + const uint8_t* getData(int offs) const; + const Brig::BrigBase* getCodeSectionEntry(int offs) const; + const Brig::BrigOperand* getOperand(int offs) const; + unsigned getOperandPtr(int offs, int index) const; + const Brig::BrigInstBase* getInst(int offs) const; + + HsaCode* getKernel(const std::string &name) const override; + HsaCode* getFunction(const std::string &name) const override; + + int numKernels() const override { return kernels.size(); } + + HsaCode* getKernel(int i) const override { return kernels[i]; } + + // pointer to the current kernel/function we're processing, so elements + // under construction can reference it. kinda ugly, but easier + // than passing it all over for the few places it's needed. + mutable HsailCode *currentCode; +}; + +// Utility function to bump Brig item pointer to next element given +// item size in bytes. Really just an add but with lots of casting. +template<typename T> +T* +brigNext(T *ptr) +{ + Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr; + int size = base_ptr->byteCount; + assert(size); + + return (T*)((uint8_t*)ptr + size); +} + +#endif // __BRIG_OBJECT_HH__ diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc new file mode 100644 index 000000000..3b3291c03 --- /dev/null +++ b/src/gpu-compute/cl_driver.cc @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/cl_driver.hh" + +#include "base/intmath.hh" +#include "cpu/thread_context.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/hsa_object.hh" +#include "params/ClDriver.hh" +#include "sim/process.hh" +#include "sim/syscall_emul_buf.hh" + +ClDriver::ClDriver(ClDriverParams *p) + : EmulatedDriver(p), hsaCode(0) +{ + for (const auto &codeFile : p->codefile) + codeFiles.push_back(&codeFile); + + maxFuncArgsSize = 0; + + for (int i = 0; i < codeFiles.size(); ++i) { + HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]); + + for (int k = 0; k < obj->numKernels(); ++k) { + assert(obj->getKernel(k)); + kernels.push_back(obj->getKernel(k)); + kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData); + int kern_funcargs_size = kernels.back()->funcarg_size; + maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ? + kern_funcargs_size : maxFuncArgsSize; + } + } + + int name_offs = 0; + int code_offs = 0; + + for (int i = 0; i < kernels.size(); ++i) { + kernelInfo.push_back(HsaKernelInfo()); + HsaCode *k = kernels[i]; + + k->generateHsaKernelInfo(&kernelInfo[i]); + + kernelInfo[i].name_offs = name_offs; + kernelInfo[i].code_offs = code_offs; + + name_offs += k->name().size() + 1; + code_offs += k->numInsts() * sizeof(GPUStaticInst*); + } +} + +void +ClDriver::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; + dispatcher->setFuncargsSize(maxFuncArgsSize); +} + +int +ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags) +{ + int fd = p->allocFD(-1, filename, 0, 0, false); + FDEntry *fde = p->getFDEntry(fd); + fde->driver = this; + + return fd; +} + +int +ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req) +{ + int index = 2; + Addr buf_addr = process->getSyscallArg(tc, index); + + switch (req) { + case HSA_GET_SIZES: + { + TypedBufferArg<HsaDriverSizes> sizes(buf_addr); + sizes->num_kernels = kernels.size(); + sizes->string_table_size = 0; + sizes->code_size = 0; + sizes->readonly_size = 0; + + if (kernels.size() > 0) { + // all kernels will share the same read-only memory + sizes->readonly_size = + kernels[0]->getSize(HsaCode::MemorySegment::READONLY); + // check our assumption + for (int i = 1; i<kernels.size(); ++i) { + assert(sizes->readonly_size == + kernels[i]->getSize(HsaCode::MemorySegment::READONLY)); + } + } + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + // add one for terminating '\0' + sizes->string_table_size += k->name().size() + 1; + sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*); + } + + sizes.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_KINFO: + { + TypedBufferArg<HsaKernelInfo> + kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size()); + + for (int i = 0; i < kernels.size(); ++i) { + HsaKernelInfo *ki = &kinfo[i]; + ki->name_offs = kernelInfo[i].name_offs; + ki->code_offs = 
kernelInfo[i].code_offs; + ki->sRegCount = kernelInfo[i].sRegCount; + ki->dRegCount = kernelInfo[i].dRegCount; + ki->cRegCount = kernelInfo[i].cRegCount; + ki->static_lds_size = kernelInfo[i].static_lds_size; + ki->private_mem_size = kernelInfo[i].private_mem_size; + ki->spill_mem_size = kernelInfo[i].spill_mem_size; + } + + kinfo.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_STRINGS: + { + int string_table_size = 0; + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + string_table_size += k->name().size() + 1; + } + + BufferArg buf(buf_addr, string_table_size); + char *bufp = (char*)buf.bufferPtr(); + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + const char *n = k->name().c_str(); + + // idiomatic string copy + while ((*bufp++ = *n++)); + } + + assert(bufp - (char *)buf.bufferPtr() == string_table_size); + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_READONLY_DATA: + { + // we can pick any kernel --- they share the same + // readonly segment (this assumption is checked in GET_SIZES) + uint64_t size = + kernels.back()->getSize(HsaCode::MemorySegment::READONLY); + BufferArg data(buf_addr, size); + char *datap = (char *)data.bufferPtr(); + memcpy(datap, + kernels.back()->readonly_data, + size); + data.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CODE: + { + // set hsaCode pointer + hsaCode = buf_addr; + int code_size = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst); + } + + TypedBufferArg<TheGpuISA::RawMachInst> buf(buf_addr, code_size); + TheGpuISA::RawMachInst *bufp = buf; + + int buf_idx = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + + for (int j = 0; j < k->numInsts(); ++j) { + bufp[buf_idx] = k->insts()->at(j); + ++buf_idx; + } + } + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CU_CNT: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs(); + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_VSZ: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = VSZ; + buf.copyOut(tc->getMemProxy()); + } + break; + + default: + fatal("ClDriver: bad ioctl %d\n", req); + } + + return 0; +} + +const char* +ClDriver::codeOffToKernelName(uint64_t code_ptr) +{ + assert(hsaCode); + uint32_t code_offs = code_ptr - hsaCode; + + for (int i = 0; i < kernels.size(); ++i) { + if (code_offs == kernelInfo[i].code_offs) { + return kernels[i]->name().c_str(); + } + } + + return nullptr; +} + +ClDriver* +ClDriverParams::create() +{ + return new ClDriver(this); +} diff --git a/src/gpu-compute/cl_driver.hh b/src/gpu-compute/cl_driver.hh new file mode 100644 index 000000000..03567bab5 --- /dev/null +++ b/src/gpu-compute/cl_driver.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __CL_DRIVER_HH__ +#define __CL_DRIVER_HH__ + +#include <vector> + +#include "gpu-compute/hsa_kernel_info.hh" +#include "sim/emul_driver.hh" + +class GpuDispatcher; +class HsaCode; +class LiveProcess; +class ThreadContext; + +struct ClDriverParams; + +class ClDriver final : public EmulatedDriver +{ + public: + ClDriver(ClDriverParams *p); + void handshake(GpuDispatcher *_dispatcher); + int open(LiveProcess *p, ThreadContext *tc, int mode, int flags); + int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req); + const char* codeOffToKernelName(uint64_t code_ptr); + + private: + GpuDispatcher *dispatcher; + + std::vector<const std::string*> codeFiles; + + // All the kernels we know about + std::vector<HsaCode*> kernels; + std::vector<HsaCode*> functions; + + std::vector<HsaKernelInfo> kernelInfo; + + // maximum size necessary for function arguments + int maxFuncArgsSize; + // The host virtual address for the kernel code + uint64_t hsaCode; +}; + +#endif // __CL_DRIVER_HH__ diff --git a/src/gpu-compute/cl_event.hh b/src/gpu-compute/cl_event.hh new file mode 100644 index 000000000..75297a2d2 --- /dev/null +++ b/src/gpu-compute/cl_event.hh @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Marc Orr + */ + +#ifndef __GPU_CL_EVENT_HH__ +#define __GPU_CL_EVENT_HH__ + +struct HsaQueueEntry; + +class _cl_event { + public: + _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { } + + volatile bool done; + HsaQueueEntry *hsaTaskPtr; + uint64_t start; + uint64_t end; +}; + +#endif // __GPU_CL_EVENT_HH__ diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh new file mode 100644 index 000000000..126cf6c50 --- /dev/null +++ b/src/gpu-compute/code_enums.hh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __CODE_ENUMS_HH__ +#define __CODE_ENUMS_HH__ + +#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \ + && (a)<=Enums::OT_GLOBAL_LDAS) +#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \ + && (a)<=Enums::OT_SHARED_LDAS) +#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \ + && (a)<=Enums::OT_PRIVATE_LDAS) +#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \ + && (a)<=Enums::OT_SPILL_LDAS) +#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \ + && (a)<=Enums::OT_READONLY_LDAS) +#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS) + +#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \ + ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \ + ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS) + +#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \ + ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \ + ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ) + +#define IS_OT_READ_GM(a) \ + ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \ + ||(a)==Enums::OT_READONLY_READ) + +#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ) + +#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ) + +#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ) + +#define IS_OT_WRITE(a) \ + ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \ + ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \ + ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE) + +#define IS_OT_WRITE_GM(a) \ + ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \ + ||(a)==Enums::OT_READONLY_WRITE) + +#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE) + +#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE) + +#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ + ||(a)==Enums::OT_SHARED_ATOMIC \ + ||(a)==Enums::OT_PRIVATE_ATOMIC \ + ||(a)==Enums::OT_SPILL_ATOMIC \ + ||(a)==Enums::OT_READONLY_ATOMIC \ + ||(a)==Enums::OT_FLAT_ATOMIC) + +#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ + ||(a)==Enums::OT_SPILL_ATOMIC \ + ||(a)==Enums::OT_READONLY_ATOMIC \ + ||(a)==Enums::OT_GLOBAL_MEMFENCE \ + ||(a)==Enums::OT_BOTH_MEMFENCE) + +#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \ + ||(a)==Enums::OT_SHARED_MEMFENCE \ + ||(a)==Enums::OT_BOTH_MEMFENCE) + +#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC) + +#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \ + ||(a)==Enums::OT_SHARED_HIST \ + ||(a)==Enums::OT_PRIVATE_HIST \ + ||(a)==Enums::OT_SPILL_HIST \ + ||(a)==Enums::OT_READONLY_HIST \ + ||(a)==Enums::OT_FLAT_HIST) + +#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \ + ||(a)==Enums::OT_SPILL_HIST \ + ||(a)==Enums::OT_READONLY_HIST) + +#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST) + +#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST) + +#endif // __CODE_ENUMS_HH__ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc new file mode 100644 index 000000000..d3622007a --- /dev/null +++ b/src/gpu-compute/compute_unit.cc @@ -0,0 +1,1817 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Anthony Gutierrez + */ + +#include "gpu-compute/compute_unit.hh" + +#include "base/output.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUExec.hh" +#include "debug/GPUFetch.hh" +#include "debug/GPUMem.hh" +#include "debug/GPUPort.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUSync.hh" +#include "debug/GPUTLB.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/page_table.hh" +#include "sim/process.hh" + +ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), + scoreboardCheckStage(p), scheduleStage(p), execStage(p), + globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0), + cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs), + spBypassPipeLength(p->spbypass_pipe_length), + dpBypassPipeLength(p->dpbypass_pipe_length), + issuePeriod(p->issue_period), + numGlbMemUnits(p->num_global_mem_pipes), + numLocMemUnits(p->num_shared_mem_pipes), + perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth), + prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type), + xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault), + functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier), + countPages(p->countPages), barrier_id(0), + vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), + coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), + req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), + resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), + _masterId(p->system->getMasterId(name() + ".ComputeUnit")), + lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) +{ + // this check will be eliminated once we have wavefront size support added + fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ"); + // calculate how many cycles a vector load or store will need to transfer + // its data over the corresponding buses + numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t)) + / 
(double)vrfToCoalescerBusWidth); + + numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t)) + / coalescerToVrfBusWidth; + + lastVaddrWF.resize(numSIMDs); + wfList.resize(numSIMDs); + + for (int j = 0; j < numSIMDs; ++j) { + lastVaddrWF[j].resize(p->n_wf); + + for (int i = 0; i < p->n_wf; ++i) { + lastVaddrWF[j][i].resize(VSZ); + + wfList[j].push_back(p->wavefronts[j * p->n_wf + i]); + wfList[j][i]->setParent(this); + + for (int k = 0; k < VSZ; ++k) { + lastVaddrWF[j][i][k] = 0; + } + } + } + + lastVaddrPhase.resize(numSIMDs); + + for (int i = 0; i < numSIMDs; ++i) { + lastVaddrPhase[i] = LastVaddrWave(); + } + + lastVaddrCU = LastVaddrWave(); + + lds.setParent(this); + + if (p->execPolicy == "OLDEST-FIRST") { + exec_policy = EXEC_POLICY::OLDEST; + } else if (p->execPolicy == "ROUND-ROBIN") { + exec_policy = EXEC_POLICY::RR; + } else { + fatal("Invalid WF execution policy (CU)\n"); + } + + memPort.resize(VSZ); + + // resize the tlbPort vectorArray + int tlbPort_width = perLaneTLB ? VSZ : 1; + tlbPort.resize(tlbPort_width); + + cuExitCallback = new CUExitCallback(this); + registerExitCallback(cuExitCallback); + + xactCasLoadMap.clear(); + lastExecCycle.resize(numSIMDs, 0); + + for (int i = 0; i < vrf.size(); ++i) { + vrf[i]->setParent(this); + } + + numVecRegsPerSimd = vrf[0]->numRegs(); +} + +ComputeUnit::~ComputeUnit() +{ + // Delete wavefront slots + + for (int j = 0; j < numSIMDs; ++j) + for (int i = 0; i < shader->n_wf; ++i) { + delete wfList[j][i]; + } + + readyList.clear(); + waveStatusList.clear(); + dispatchList.clear(); + vectorAluInstAvail.clear(); + delete cuExitCallback; + delete ldsPort; +} + +void +ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr) +{ + w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount); + + w->workgroupsz[0] = ndr->q.wgSize[0]; + w->workgroupsz[1] = ndr->q.wgSize[1]; + w->workgroupsz[2] = ndr->q.wgSize[2]; + w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2]; + w->gridsz[0] = ndr->q.gdSize[0]; + w->gridsz[1] = ndr->q.gdSize[1]; + w->gridsz[2] = ndr->q.gdSize[2]; + w->kernelArgs = ndr->q.args; + w->privSizePerItem = ndr->q.privMemPerItem; + w->spillSizePerItem = ndr->q.spillMemPerItem; + w->roBase = ndr->q.roMemStart; + w->roSize = ndr->q.roMemTotal; +} + +void +ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, + int trueWgSize[], int trueWgSizeTotal, + LdsChunk *ldsChunk, uint64_t origSpillMemStart) +{ + wfCtx->cnt = cnt; + + VectorMask init_mask; + init_mask.reset(); + + for (int k = 0; k < VSZ; ++k) { + if (k + cnt * VSZ < trueWgSizeTotal) + init_mask[k] = 1; + } + + wfCtx->init_mask = init_mask.to_ullong(); + wfCtx->exec_mask = init_mask.to_ullong(); + + for (int i = 0; i < VSZ; ++i) { + wfCtx->bar_cnt[i] = 0; + } + + wfCtx->max_bar_cnt = 0; + wfCtx->old_barrier_cnt = 0; + wfCtx->barrier_cnt = 0; + + wfCtx->privBase = ndr->q.privMemStart; + ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ; + + wfCtx->spillBase = ndr->q.spillMemStart; + ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ; + + wfCtx->pc = 0; + wfCtx->rpc = UINT32_MAX; + + // set the wavefront context to have a pointer to this section of the LDS + wfCtx->ldsChunk = ldsChunk; + + // WG state + wfCtx->wg_id = ndr->globalWgId; + wfCtx->barrier_id = barrier_id; + + // Kernel wide state + wfCtx->ndr = ndr; +} + +void +ComputeUnit::updateEvents() { + + if (!timestampVec.empty()) { + uint32_t vecSize = timestampVec.size(); + uint32_t i = 0; + while (i < vecSize) { + if (timestampVec[i] <= shader->tick_cnt) { + std::pair<uint32_t, 
uint32_t> regInfo = regIdxVec[i]; + vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t), + statusVec[i]); + timestampVec.erase(timestampVec.begin() + i); + regIdxVec.erase(regIdxVec.begin() + i); + statusVec.erase(statusVec.begin() + i); + --vecSize; + --i; + } + ++i; + } + } + + for (int i = 0; i< numSIMDs; ++i) { + vrf[i]->updateEvents(); + } +} + + +void +ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], + int trueWgSizeTotal) +{ + static int _n_wave = 0; + int cnt = wfCtx->cnt; + NDRange *ndr = wfCtx->ndr; + + // Fill in Kernel state + FillKernelState(w, ndr); + + w->kern_id = ndr->dispatchId; + w->dynwaveid = cnt; + w->init_mask = wfCtx->init_mask; + + for (int k = 0; k < VSZ; ++k) { + w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0]; + w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1]; + w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]); + + w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * + trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + + w->workitemid[0][k]; + } + + w->old_barrier_cnt = wfCtx->old_barrier_cnt; + w->barrier_cnt = wfCtx->barrier_cnt; + w->barrier_slots = divCeil(trueWgSizeTotal, VSZ); + + for (int i = 0; i < VSZ; ++i) { + w->bar_cnt[i] = wfCtx->bar_cnt[i]; + } + + w->max_bar_cnt = wfCtx->max_bar_cnt; + w->privBase = wfCtx->privBase; + w->spillBase = wfCtx->spillBase; + + w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask); + + // WG state + w->wg_id = wfCtx->wg_id; + w->dispatchid = wfCtx->ndr->dispatchId; + w->workgroupid[0] = w->wg_id % ndr->numWg[0]; + w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1]; + w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]); + + w->barrier_id = wfCtx->barrier_id; + w->stalledAtBarrier = false; + + // move this from the context into the actual wavefront + w->ldsChunk = wfCtx->ldsChunk; + + int32_t refCount M5_VAR_USED = + lds.increaseRefCounter(w->dispatchid, w->wg_id); + DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", + cu_id, w->wg_id, refCount); + + w->instructionBuffer.clear(); + + if (w->pendingFetch) + w->dropFetch = true; + + // is this the last wavefront in the workgroup + // if set the spillWidth to be the remaining work-items + // so that the vector access is correct + if ((cnt + 1) * VSZ >= trueWgSizeTotal) { + w->spillWidth = trueWgSizeTotal - (cnt * VSZ); + } else { + w->spillWidth = VSZ; + } + + DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " + "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); + + w->start(++_n_wave, ndr->q.code_ptr); +} + +void +ComputeUnit::StartWorkgroup(NDRange *ndr) +{ + // reserve the LDS capacity allocated to the work group + // disambiguated by the dispatch ID and workgroup ID, which should be + // globally unique + LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId, + ndr->q.ldsSize); + + // Send L1 cache acquire + // isKernel + isAcquire = Kernel Begin + if (shader->impl_kern_boundary_sync) { + GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr, + nullptr, + nullptr, 0); + + gpuDynInst->useContinuation = false; + gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; + gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; + injectGlobalMemFence(gpuDynInst, true); + } + + // Get true size of workgroup (after clamping to grid size) + int trueWgSize[3]; + int trueWgSizeTotal = 1; + + for (int d = 0; d < 3; ++d) { + trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - + 
ndr->wgId[d] * ndr->q.wgSize[d]); + + trueWgSizeTotal *= trueWgSize[d]; + } + + uint64_t origSpillMemStart = ndr->q.spillMemStart; + // calculate the number of 32-bit vector registers required by wavefront + int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); + int cnt = 0; + + // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time + for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { + Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; + // Check if this wavefront slot is available: + // It must be stopped and not waiting + // for a release to complete S_RETURNING + if (w->status == Wavefront::S_STOPPED) { + // if we have scheduled all work items then stop + // scheduling wavefronts + if (cnt * VSZ >= trueWgSizeTotal) + break; + + // reserve vector registers for the scheduled wavefront + assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd); + uint32_t normSize = 0; + + w->startVgprIndex = vrf[m % numSIMDs]->manager-> + allocateRegion(vregDemand, &normSize); + + w->reservedVectorRegs = normSize; + vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; + + WFContext wfCtx; + + InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal, + ldsChunk, origSpillMemStart); + + StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal); + ++cnt; + } + } + ++barrier_id; +} + +int +ComputeUnit::ReadyWorkgroup(NDRange *ndr) +{ + // Get true size of workgroup (after clamping to grid size) + int trueWgSize[3]; + int trueWgSizeTotal = 1; + + for (int d = 0; d < 3; ++d) { + trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - + ndr->wgId[d] * ndr->q.wgSize[d]); + + trueWgSizeTotal *= trueWgSize[d]; + DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]); + } + + DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal); + + // calculate the number of 32-bit vector registers required by each + // work item of the work group + int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount); + bool vregAvail = true; + int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ; + int freeWfSlots = 0; + // check if the total number of VGPRs required by all WFs of the WG + // fit in the VRFs of all SIMD units + assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd)); + int numMappedWfs = 0; + std::vector<int> numWfsPerSimd; + numWfsPerSimd.resize(numSIMDs, 0); + // find how many free WF slots we have across all SIMDs + for (int j = 0; j < shader->n_wf; ++j) { + for (int i = 0; i < numSIMDs; ++i) { + if (wfList[i][j]->status == Wavefront::S_STOPPED) { + // count the number of free WF slots + ++freeWfSlots; + if (numMappedWfs < numWfs) { + // count the WFs to be assigned per SIMD + numWfsPerSimd[i]++; + } + numMappedWfs++; + } + } + } + + // if there are enough free WF slots then find if there are enough + // free VGPRs per SIMD based on the WF->SIMD mapping + if (freeWfSlots >= numWfs) { + for (int j = 0; j < numSIMDs; ++j) { + // find if there are enough free VGPR regions in the SIMD's VRF + // to accommodate the WFs of the new WG that would be mapped to + // this SIMD unit + vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j], + vregDemandPerWI); + + // stop searching if there is at least one SIMD + // whose VRF does not have enough free VGPR pools. 
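+            // (canAllocate() is asked whether numWfsPerSimd[j] regions of
+            // vregDemandPerWI registers each fit in this SIMD's VRF.)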
+ // This is because a WG is scheduled only if ALL + // of its WFs can be scheduled + if (!vregAvail) + break; + } + } + + DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n", + freeWfSlots, vregAvail); + + if (!vregAvail) { + ++numTimesWgBlockedDueVgprAlloc; + } + + // Return true if enough WF slots to submit workgroup and if there are + // enough VGPRs to schedule all WFs to their SIMD units + if (!lds.canReserve(ndr->q.ldsSize)) { + wgBlockedDueLdsAllocation++; + } + + // Return true if (a) there are enough free WF slots to submit + // workgrounp and (b) if there are enough VGPRs to schedule all WFs to their + // SIMD units and (c) if there is enough space in LDS + return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize); +} + +int +ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) +{ + DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id); + int ccnt = 0; + + for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) { + for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) { + Wavefront *w = wfList[i_simd][i_wf]; + + if (w->status == Wavefront::S_RUNNING) { + DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf); + + DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n", + w->barrier_id, _barrier_id); + + DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n", + w->barrier_cnt, bcnt); + } + + if (w->status == Wavefront::S_RUNNING && + w->barrier_id == _barrier_id && w->barrier_cnt == bcnt && + !w->outstanding_reqs) { + ++ccnt; + + DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to " + "%d\n", i_simd, i_wf, ccnt); + } + } + } + + DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n", + cu_id, ccnt, bslots); + + return ccnt == bslots; +} + +// Check if the current wavefront is blocked on additional resources. +bool +ComputeUnit::cedeSIMD(int simdId, int wfSlotId) +{ + bool cede = false; + + // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld + // magic instructions will impact the scheduling of wavefronts + if (xact_cas_mode) { + /* + * When a wavefront calls xact_cas_ld, it adds itself to a per address + * queue. All per address queues are managed by the xactCasLoadMap. + * + * A wavefront is not blocked if: it is not in ANY per address queue or + * if it is at the head of a per address queue. + */ + for (auto itMap : xactCasLoadMap) { + std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue; + + if (!curWaveIDQueue.empty()) { + for (auto it : curWaveIDQueue) { + waveIdentifier cur_wave = it; + + if (cur_wave.simdId == simdId && + cur_wave.wfSlotId == wfSlotId) { + // 2 possibilities + // 1: this WF has a green light + // 2: another WF has a green light + waveIdentifier owner_wave = curWaveIDQueue.front(); + + if (owner_wave.simdId != cur_wave.simdId || + owner_wave.wfSlotId != cur_wave.wfSlotId) { + // possibility 2 + cede = true; + break; + } else { + // possibility 1 + break; + } + } + } + } + } + } + + return cede; +} + +// Execute one clock worth of work on the ComputeUnit. 
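+// The stages below are walked back to front, so within a single call each
+// stage consumes the state its upstream neighbour produced on the previous
+// cycle; this is what models the pipeline latency.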
+void +ComputeUnit::exec() +{ + updateEvents(); + // Execute pipeline stages in reverse order to simulate + // the pipeline latency + globalMemoryPipe.exec(); + localMemoryPipe.exec(); + execStage.exec(); + scheduleStage.exec(); + scoreboardCheckStage.exec(); + fetchStage.exec(); + + totalCycles++; +} + +void +ComputeUnit::init() +{ + // Initialize CU Bus models + glbMemToVrfBus.init(&shader->tick_cnt, 1); + locMemToVrfBus.init(&shader->tick_cnt, 1); + nextGlbMemBus = 0; + nextLocMemBus = 0; + fatal_if(numGlbMemUnits > 1, + "No support for multiple Global Memory Pipelines exists!!!"); + vrfToGlobalMemPipeBus.resize(numGlbMemUnits); + for (int j = 0; j < numGlbMemUnits; ++j) { + vrfToGlobalMemPipeBus[j] = WaitClass(); + vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, 1); + } + + fatal_if(numLocMemUnits > 1, + "No support for multiple Local Memory Pipelines exists!!!"); + vrfToLocalMemPipeBus.resize(numLocMemUnits); + for (int j = 0; j < numLocMemUnits; ++j) { + vrfToLocalMemPipeBus[j] = WaitClass(); + vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, 1); + } + vectorRegsReserved.resize(numSIMDs, 0); + aluPipe.resize(numSIMDs); + wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits); + + for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) { + wfWait[i] = WaitClass(); + wfWait[i].init(&shader->tick_cnt, 1); + } + + for (int i = 0; i < numSIMDs; ++i) { + aluPipe[i] = WaitClass(); + aluPipe[i].init(&shader->tick_cnt, 1); + } + + // Setup space for call args + for (int j = 0; j < numSIMDs; ++j) { + for (int i = 0; i < shader->n_wf; ++i) { + wfList[j][i]->initCallArgMem(shader->funcargs_size); + } + } + + // Initializing pipeline resources + readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits); + waveStatusList.resize(numSIMDs); + + for (int j = 0; j < numSIMDs; ++j) { + for (int i = 0; i < shader->n_wf; ++i) { + waveStatusList[j].push_back( + std::make_pair(wfList[j][i], BLOCKED)); + } + } + + for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) { + dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY)); + } + + fetchStage.init(this); + scoreboardCheckStage.init(this); + scheduleStage.init(this); + execStage.init(this); + globalMemoryPipe.init(this); + localMemoryPipe.init(this); + // initialize state for statistics calculation + vectorAluInstAvail.resize(numSIMDs, false); + shrMemInstAvail = 0; + glbMemInstAvail = 0; +} + +bool +ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) +{ + // Ruby has completed the memory op. 
Schedule the mem_resp_event at the + // appropriate cycle to process the timing memory response + // This delay represents the pipeline delay + SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState); + int index = sender_state->port_index; + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + + // Is the packet returned a Kernel End or Barrier + if (pkt->req->isKernel() && pkt->req->isRelease()) { + Wavefront *w = + computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + + // Check if we are waiting on Kernel End Release + if (w->status == Wavefront::S_RETURNING) { + DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n", + computeUnit->cu_id, w->simdId, w->wfSlotId, + w->wfDynId, w->kern_id); + + computeUnit->shader->dispatcher->notifyWgCompl(w); + w->status = Wavefront::S_STOPPED; + } else { + w->outstanding_reqs--; + } + + DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, w->barrier_cnt); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } + + ComputeUnit::DataPort::MemRespEvent *mem_resp_event = + new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index], + pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + index, pkt->req->getPaddr()); + + computeUnit->schedule(mem_resp_event, + curTick() + computeUnit->resp_tick_latency); + return true; +} + +void +ComputeUnit::DataPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second; + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr()); + + /** Currently Ruby can return false due to conflicts for the particular + * cache block or address. Thus other requests should be allowed to + * pass and the data port should expect multiple retries. 
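+         * The retry loop below therefore stops at the first packet that is
+         * refused again; that packet and everything queued behind it wait
+         * for the next recvReqRetry() callback.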
*/ + if (!sendTimingReq(pkt)) { + DPRINTF(GPUMem, "failed again!\n"); + break; + } else { + DPRINTF(GPUMem, "successful!\n"); + retries.pop_front(); + } + } +} + +bool +ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) +{ + computeUnit->fetchStage.processFetchReturn(pkt); + + return true; +} + +void +ComputeUnit::SQCPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + Wavefront *wavefront M5_VAR_USED = retries.front().second; + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + if (!sendTimingReq(pkt)) { + DPRINTF(GPUFetch, "failed again!\n"); + break; + } else { + DPRINTF(GPUFetch, "successful!\n"); + retries.pop_front(); + } + } +} + +void +ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + // There must be a way around this check to do the globalMemStart... + Addr tmp_vaddr = pkt->req->getVaddr(); + + updatePageDivergenceDist(tmp_vaddr); + + pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(), + pkt->req->getFlags(), pkt->req->masterId(), + pkt->req->getPC()); + + // figure out the type of the request to set read/write + BaseTLB::Mode TLB_mode; + assert(pkt->isRead() || pkt->isWrite()); + + // Check write before read for atomic operations + // since atomic operations should use BaseTLB::Write + if (pkt->isWrite()){ + TLB_mode = BaseTLB::Write; + } else if (pkt->isRead()) { + TLB_mode = BaseTLB::Read; + } else { + fatal("pkt is not a read nor a write\n"); + } + + tlbCycles -= curTick(); + ++tlbRequests; + + int tlbPort_index = perLaneTLB ? index : 0; + + if (shader->timingSim) { + if (debugSegFault) { + Process *p = shader->gpuTc->getProcessPtr(); + Addr vaddr = pkt->req->getVaddr(); + unsigned size = pkt->getSize(); + + if ((vaddr + size - 1) % 64 < vaddr % 64) { + panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr); + } + + Addr paddr; + + if (!p->pTable->translate(vaddr, paddr)) { + if (!p->fixupStackFault(vaddr)) { + panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + vaddr); + } + } + } + + // This is the SenderState needed upon return + pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index); + + // This is the senderState needed by the TLB hierarchy to function + TheISA::GpuTLB::TranslationState *translation_state = + new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false, + pkt->senderState); + + pkt->senderState = translation_state; + + if (functionalTLB) { + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + assert(hit_level != -1); + hitsPerTLBLevel[hit_level]++; + + // New SenderState for the memory access + X86ISA::GpuTLB::TranslationState *sender_state = + safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state->saved; + delete sender_state; + + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + uint8_t *tmpData = pkt->getPtr<uint8_t>(); + + // this is necessary because the GPU TLB receives packets instead + // of requests. when the translation is complete, all relevent + // fields in the request will be populated, but not in the packet. + // here we create the new packet so we can set the size, addr, + // and proper flags. 
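+            // Note that the request is reused as-is (it already holds the
+            // translated physical address); only the packet is rebuilt, and
+            // the saved data pointer is re-attached via dataStatic().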
+ PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + pkt->dataStatic(tmpData); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, + index, nullptr); + + gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); + gpuDynInst->tlbHitLevel[index] = hit_level; + + + // translation is done. Schedule the mem_req_event at the + // appropriate cycle to send the timing memory request to ruby + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data " + "scheduled\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, index, pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } else if (tlbPort[tlbPort_index]->isStalled()) { + assert(tlbPort[tlbPort_index]->retries.size() > 0); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet will be issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. + tlbPort[tlbPort_index]->stallPort(); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, + "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); + } + } else { + if (pkt->cmd == MemCmd::MemFenceReq) { + gpuDynInst->statusBitVector = VectorMask(0); + } else { + gpuDynInst->statusBitVector &= (~(1ll << index)); + } + + // New SenderState for the memory access + delete pkt->senderState; + + // Because it's atomic operation, only need TLB translation state + pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, + shader->gpuTc); + + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // the addr of the packet is not modified, so we need to create a new + // packet, or otherwise the memory access will have the old virtual + // address sent in the translation packet, instead of the physical + // address returned by the translation. + PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd); + new_pkt->dataStatic(pkt->getPtr<uint8_t>()); + + // Translation is done. It is safe to send the packet to memory. 
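+        // In this non-timing path the access completes immediately through
+        // data port 0; the translation state, the original packet and its
+        // request are all freed once the functional access returns.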
+ memPort[0]->sendFunctional(new_pkt); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, + gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + new_pkt->req->getPaddr()); + + // safe_cast the senderState + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete new_pkt; + delete pkt->senderState; + delete pkt->req; + delete pkt; + } +} + +void +ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index, + nullptr); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); +} + +void +ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, + Request* req) +{ + if (!req) { + req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId, -1); + } + req->setPaddr(0); + if (kernelLaunch) { + req->setFlags(Request::KERNEL); + } + + gpuDynInst->s_type = SEG_GLOBAL; + + // for non-kernel MemFence operations, memorder flags are set depending + // on which type of request is currently being sent, so this + // should be set by the caller (e.g. if an inst has acq-rel + // semantics, it will send one acquire req an one release req) + gpuDynInst->setRequestFlags(req, kernelLaunch); + + // a mem fence must correspond to an acquire/release request + assert(req->isAcquire() || req->isRelease()); + + // create packet + PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq); + + // set packet's sender state + pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr); + + // send the packet + sendSyncRequest(gpuDynInst, 0, pkt); +} + +const char* +ComputeUnit::DataPort::MemRespEvent::description() const +{ + return "ComputeUnit memory response event"; +} + +void +ComputeUnit::DataPort::MemRespEvent::process() +{ + DataPort::SenderState *sender_state = + safe_cast<DataPort::SenderState*>(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit = dataPort->computeUnit; + + assert(gpuDynInst); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n", + compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr(), dataPort->index); + + Addr paddr = pkt->req->getPaddr(); + + if (pkt->cmd != MemCmd::MemFenceResp) { + int index = gpuDynInst->memStatusVector[paddr].back(); + + DPRINTF(GPUMem, "Response for addr %#x, index %d\n", + pkt->req->getPaddr(), index); + + gpuDynInst->memStatusVector[paddr].pop_back(); + gpuDynInst->pAddr = pkt->req->getPaddr(); + + if (pkt->isRead() || pkt->isWrite()) { + + if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) { + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } else { + assert(gpuDynInst->statusVector[index] > 0); + gpuDynInst->statusVector[index]--; + + if (!gpuDynInst->statusVector[index]) + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } + + DPRINTF(GPUMem, "bitvector is now %#x\n", + gpuDynInst->statusBitVector); + + if (gpuDynInst->statusBitVector == VectorMask(0)) { + auto iter = gpuDynInst->memStatusVector.begin(); + auto end = gpuDynInst->memStatusVector.end(); + + while 
(iter != end) { + assert(iter->second.empty()); + ++iter; + } + + gpuDynInst->memStatusVector.clear(); + + if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + gpuDynInst->statusVector.clear(); + + if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op) + || MO_ANR(gpuDynInst->m_op)) { + assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMLdRespFIFO() + .push(gpuDynInst); + } else { + assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMStRespFIFO() + .push(gpuDynInst); + } + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); + + // after clearing the status vectors, + // see if there is a continuation to perform + // the continuation may generate more work for + // this memory request + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + } + } else { + gpuDynInst->statusBitVector = VectorMask(0); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +ComputeUnit* +ComputeUnitParams::create() +{ + return new ComputeUnit(this); +} + +bool +ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) +{ + Addr line = pkt->req->getPaddr(); + + DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id, + pkt->req->getVaddr(), line); + + assert(pkt->senderState); + computeUnit->tlbCycles += curTick(); + + // pop off the TLB translation state + TheISA::GpuTLB::TranslationState *translation_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + // no PageFaults are permitted for data accesses + if (!translation_state->tlbEntry->valid) { + DTLBPort::SenderState *sender_state = + safe_cast<DTLBPort::SenderState*>(translation_state->saved); + + Wavefront *w M5_VAR_USED = + computeUnit->wfList[sender_state->_gpuDynInst->simdId] + [sender_state->_gpuDynInst->wfSlotId]; + + DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId, + pkt->req->getVaddr()); + } + + assert(translation_state->tlbEntry->valid); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + computeUnit->hitsPerTLBLevel[hit_level]++; + + delete translation_state->tlbEntry; + assert(!translation_state->ports.size()); + pkt->senderState = translation_state->saved; + + // for prefetch pkt + BaseTLB::Mode TLB_mode = translation_state->tlbMode; + + delete translation_state; + + // use the original sender state to know how to close this transaction + DTLBPort::SenderState *sender_state = + safe_cast<DTLBPort::SenderState*>(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + int mp_index = sender_state->portIndex; + Addr vaddr = pkt->req->getVaddr(); + gpuDynInst->memStatusVector[line].push_back(mp_index); + gpuDynInst->tlbHitLevel[mp_index] = hit_level; + + MemCmd requestCmd; + + if (pkt->cmd == MemCmd::ReadResp) { + requestCmd = MemCmd::ReadReq; + } else if (pkt->cmd == MemCmd::WriteResp) { + requestCmd = MemCmd::WriteReq; + } else if (pkt->cmd == MemCmd::SwapResp) { + requestCmd = MemCmd::SwapReq; + } else { + panic("unsupported response to request conversion %s\n", + pkt->cmd.toString()); + } + + if (computeUnit->prefetchDepth) { + int 
simdId = gpuDynInst->simdId; + int wfSlotId = gpuDynInst->wfSlotId; + Addr last = 0; + + switch(computeUnit->prefetchType) { + case Enums::PF_CU: + last = computeUnit->lastVaddrCU[mp_index]; + break; + case Enums::PF_PHASE: + last = computeUnit->lastVaddrPhase[simdId][mp_index]; + break; + case Enums::PF_WF: + last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; + default: + break; + } + + DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n", + computeUnit->cu_id, simdId, wfSlotId, mp_index, last); + + int stride = last ? (roundDown(vaddr, TheISA::PageBytes) - + roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift + : 0; + + DPRINTF(GPUPrefetch, "Stride is %d\n", stride); + + computeUnit->lastVaddrCU[mp_index] = vaddr; + computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr; + computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; + + stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ? + computeUnit->prefetchStride: stride; + + DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr, + computeUnit->cu_id, simdId, wfSlotId, mp_index); + + DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr); + + // Prefetch Next few pages atomically + for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) { + DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride, + vaddr+stride*pf*TheISA::PageBytes); + + if (!stride) + break; + + Request *prefetch_req = new Request(0, vaddr + stride * pf * + TheISA::PageBytes, + sizeof(uint8_t), 0, + computeUnit->masterId(), + 0, 0, 0); + + PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd); + uint8_t foo = 0; + prefetch_pkt->dataStatic(&foo); + + // Because it's atomic operation, only need TLB translation state + prefetch_pkt->senderState = + new TheISA::GpuTLB::TranslationState(TLB_mode, + computeUnit->shader->gpuTc, + true); + + // Currently prefetches are zero-latency, hence the sendFunctional + sendFunctional(prefetch_pkt); + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *tlb_state = + safe_cast<TheISA::GpuTLB::TranslationState*>( + prefetch_pkt->senderState); + + + delete tlb_state->tlbEntry; + delete tlb_state; + delete prefetch_pkt->req; + delete prefetch_pkt; + } + } + + // First we must convert the response cmd back to a request cmd so that + // the request can be sent through the cu's master port + PacketPtr new_pkt = new Packet(pkt->req, requestCmd); + new_pkt->dataStatic(pkt->getPtr<uint8_t>()); + delete pkt->senderState; + delete pkt; + + // New SenderState for the memory access + new_pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index, + nullptr); + + // translation is done. 
Schedule the mem_req_event at the appropriate
+    // cycle to send the timing memory request to ruby
+    ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+        new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index],
+                                               new_pkt);
+
+    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
+            computeUnit->cu_id, gpuDynInst->simdId,
+            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
+
+    computeUnit->schedule(mem_req_event, curTick() +
+                          computeUnit->req_tick_latency);
+
+    return true;
+}
+
+const char*
+ComputeUnit::DataPort::MemReqEvent::description() const
+{
+    return "ComputeUnit memory request event";
+}
+
+void
+ComputeUnit::DataPort::MemReqEvent::process()
+{
+    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+    ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit;
+
+    if (!(dataPort->sendTimingReq(pkt))) {
+        dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
+
+        DPRINTF(GPUPort,
+                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
+                compute_unit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, dataPort->index,
+                pkt->req->getPaddr());
+    } else {
+        DPRINTF(GPUPort,
+                "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
+                compute_unit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, dataPort->index,
+                pkt->req->getPaddr());
+    }
+}
+
+/*
+ * The initial translation request could have been rejected,
+ * if <retries> queue is not empty. Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::DTLBPort::recvReqRetry()
+{
+    int len = retries.size();
+
+    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
+            computeUnit->cu_id, len);
+
+    assert(len > 0);
+    assert(isStalled());
+    // recvReqRetry is an indication that the resource on which this
+    // port was stalling on is freed. So, remove the stall first
+    unstallPort();
+
+    for (int i = 0; i < len; ++i) {
+        PacketPtr pkt = retries.front();
+        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
+                computeUnit->cu_id, vaddr);
+
+        if (!sendTimingReq(pkt)) {
+            // Stall port
+            stallPort();
+            DPRINTF(GPUTLB, ": failed again\n");
+            break;
+        } else {
+            DPRINTF(GPUTLB, ": successful\n");
+            retries.pop_front();
+        }
+    }
+}
+
+bool
+ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
+{
+    Addr line M5_VAR_USED = pkt->req->getPaddr();
+    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
+            computeUnit->cu_id, pkt->req->getVaddr(), line);
+
+    assert(pkt->senderState);
+
+    // pop off the TLB translation state
+    TheISA::GpuTLB::TranslationState *translation_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    bool success = translation_state->tlbEntry->valid;
+    delete translation_state->tlbEntry;
+    assert(!translation_state->ports.size());
+    pkt->senderState = translation_state->saved;
+    delete translation_state;
+
+    // use the original sender state to know how to close this transaction
+    ITLBPort::SenderState *sender_state =
+        safe_cast<ITLBPort::SenderState*>(pkt->senderState);
+
+    // get the wavefront associated with this translation request
+    Wavefront *wavefront = sender_state->wavefront;
+    delete pkt->senderState;
+
+    if (success) {
+        // pkt is reused in fetch(), don't delete it here.
However, we must
+        // reset the command to be a request so that it can be sent through
+        // the cu's master port
+        assert(pkt->cmd == MemCmd::ReadResp);
+        pkt->cmd = MemCmd::ReadReq;
+
+        computeUnit->fetchStage.fetch(pkt, wavefront);
+    } else {
+        if (wavefront->dropFetch) {
+            assert(wavefront->instructionBuffer.empty());
+            wavefront->dropFetch = false;
+        }
+
+        wavefront->pendingFetch = 0;
+    }
+
+    return true;
+}
+
+/*
+ * The initial translation request could have been rejected, if
+ * <retries> queue is not empty. Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::ITLBPort::recvReqRetry()
+{
+    int len = retries.size();
+    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
+            computeUnit->cu_id, len);
+
+    assert(len > 0);
+    assert(isStalled());
+
+    // recvReqRetry is an indication that the resource on which this
+    // port was stalling on is freed. So, remove the stall first
+    unstallPort();
+
+    for (int i = 0; i < len; ++i) {
+        PacketPtr pkt = retries.front();
+        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
+                computeUnit->cu_id, vaddr);
+
+        if (!sendTimingReq(pkt)) {
+            stallPort(); // Stall port
+            DPRINTF(GPUTLB, ": failed again\n");
+            break;
+        } else {
+            DPRINTF(GPUTLB, ": successful\n");
+            retries.pop_front();
+        }
+    }
+}
+
+void
+ComputeUnit::regStats()
+{
+    tlbCycles
+        .name(name() + ".tlb_cycles")
+        .desc("total number of cycles for all uncoalesced requests")
+        ;
+
+    tlbRequests
+        .name(name() + ".tlb_requests")
+        .desc("number of uncoalesced requests")
+        ;
+
+    tlbLatency
+        .name(name() + ".avg_translation_latency")
+        .desc("Avg. translation latency for data translations")
+        ;
+
+    tlbLatency = tlbCycles / tlbRequests;
+
+    hitsPerTLBLevel
+        .init(4)
+        .name(name() + ".TLB_hits_distribution")
+        .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
+        ;
+
+    // fixed number of TLB levels
+    for (int i = 0; i < 4; ++i) {
+        if (!i)
+            hitsPerTLBLevel.subname(i, "page_table");
+        else
+            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB", i));
+    }
+
+    execRateDist
+        .init(0, 10, 2)
+        .name(name() + ".inst_exec_rate")
+        .desc("Instruction Execution Rate: Number of executed vector "
+              "instructions per cycle")
+        ;
+
+    ldsBankConflictDist
+        .init(0, VSZ, 2)
+        .name(name() + ".lds_bank_conflicts")
+        .desc("Number of bank conflicts per LDS memory packet")
+        ;
+
+    ldsBankAccesses
+        .name(name() + ".lds_bank_access_cnt")
+        .desc("Total number of LDS bank accesses")
+        ;
+
+    pageDivergenceDist
+        // A wavefront can touch 1 to VSZ pages per memory instruction.
+        // The number of pages per bin can be configured (here it's 4).
+        .init(1, VSZ, 4)
+        .name(name() + ".page_divergence_dist")
+        .desc("pages touched per wf (over all mem. instr.)")
+        ;
+
+    controlFlowDivergenceDist
+        .init(1, VSZ, 4)
+        .name(name() + ".warp_execution_dist")
+        .desc("number of lanes active per instruction (over all instructions)")
+        ;
+
+    activeLanesPerGMemInstrDist
+        .init(1, VSZ, 4)
+        .name(name() + ".gmem_lanes_execution_dist")
+        .desc("number of active lanes per global memory instruction")
+        ;
+
+    activeLanesPerLMemInstrDist
+        .init(1, VSZ, 4)
+        .name(name() + ".lmem_lanes_execution_dist")
+        .desc("number of active lanes per local memory instruction")
+        ;
+
+    numInstrExecuted
+        .name(name() + ".num_instr_executed")
+        .desc("number of instructions executed")
+        ;
+
+    numVecOpsExecuted
+        .name(name() + ".num_vec_ops_executed")
+        .desc("number of vec ops executed (e.g.
VSZ/inst)") + ; + + totalCycles + .name(name() + ".num_total_cycles") + .desc("number of cycles the CU ran for") + ; + + ipc + .name(name() + ".ipc") + .desc("Instructions per cycle (this CU only)") + ; + + vpc + .name(name() + ".vpc") + .desc("Vector Operations per cycle (this CU only)") + ; + + numALUInstsExecuted + .name(name() + ".num_alu_insts_executed") + .desc("Number of dynamic non-GM memory insts executed") + ; + + wgBlockedDueLdsAllocation + .name(name() + ".wg_blocked_due_lds_alloc") + .desc("Workgroup blocked due to LDS capacity") + ; + + ipc = numInstrExecuted / totalCycles; + vpc = numVecOpsExecuted / totalCycles; + + numTimesWgBlockedDueVgprAlloc + .name(name() + ".times_wg_blocked_due_vgpr_alloc") + .desc("Number of times WGs are blocked due to VGPR allocation per SIMD") + ; + + dynamicGMemInstrCnt + .name(name() + ".global_mem_instr_cnt") + .desc("dynamic global memory instructions count") + ; + + dynamicLMemInstrCnt + .name(name() + ".local_mem_instr_cnt") + .desc("dynamic local memory intruction count") + ; + + numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt - + dynamicLMemInstrCnt; + + completedWfs + .name(name() + ".num_completed_wfs") + .desc("number of completed wavefronts") + ; + + numCASOps + .name(name() + ".num_CAS_ops") + .desc("number of compare and swap operations") + ; + + numFailedCASOps + .name(name() + ".num_failed_CAS_ops") + .desc("number of compare and swap operations that failed") + ; + + // register stats of pipeline stages + fetchStage.regStats(); + scoreboardCheckStage.regStats(); + scheduleStage.regStats(); + execStage.regStats(); + + // register stats of memory pipeline + globalMemoryPipe.regStats(); + localMemoryPipe.regStats(); +} + +void +ComputeUnit::updatePageDivergenceDist(Addr addr) +{ + Addr virt_page_addr = roundDown(addr, TheISA::PageBytes); + + if (!pagesTouched.count(virt_page_addr)) + pagesTouched[virt_page_addr] = 1; + else + pagesTouched[virt_page_addr]++; +} + +void +ComputeUnit::CUExitCallback::process() +{ + if (computeUnit->countPages) { + std::ostream *page_stat_file = + simout.create(computeUnit->name().c_str()); + + *page_stat_file << "page, wavefront accesses, workitem accesses" << + std::endl; + + for (auto iter : computeUnit->pageAccesses) { + *page_stat_file << std::hex << iter.first << ","; + *page_stat_file << std::dec << iter.second.first << ","; + *page_stat_file << std::dec << iter.second.second << std::endl; + } + } + } + +bool +ComputeUnit::isDone() const +{ + for (int i = 0; i < numSIMDs; ++i) { + if (!isSimdDone(i)) { + return false; + } + } + + bool glbMemBusRdy = true; + for (int j = 0; j < numGlbMemUnits; ++j) { + glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy(); + } + bool locMemBusRdy = true; + for (int j = 0; j < numLocMemUnits; ++j) { + locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy(); + } + + if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() || + !globalMemoryPipe.isGMStRespFIFOWrRdy() || + !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy() + || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() || + !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) { + return false; + } + + return true; +} + +int32_t +ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const +{ + return lds.getRefCounter(dispatchId, wgId); +} + +bool +ComputeUnit::isSimdDone(uint32_t simdId) const +{ + assert(simdId < numSIMDs); + + for (int i=0; i < numGlbMemUnits; ++i) { + if (!vrfToGlobalMemPipeBus[i].rdy()) + return false; + } + for (int i=0; i < numLocMemUnits; ++i) 
{ + if (!vrfToLocalMemPipeBus[i].rdy()) + return false; + } + if (!aluPipe[simdId].rdy()) { + return false; + } + + for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){ + if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) { + return false; + } + } + + return true; +} + +/** + * send a general request to the LDS + * make sure to look at the return value here as your request might be + * NACK'd and returning false means that you have to have some backup plan + */ +bool +ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst) +{ + // this is just a request to carry the GPUDynInstPtr + // back and forth + Request *newRequest = new Request(); + newRequest->setPaddr(0x0); + + // ReadReq is not evaluted by the LDS but the Packet ctor requires this + PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq); + + // This is the SenderState needed upon return + newPacket->senderState = new LDSPort::SenderState(gpuDynInst); + + return ldsPort->sendTimingReq(newPacket); +} + +/** + * get the result of packets sent to the LDS when they return + */ +bool +ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet) +{ + const ComputeUnit::LDSPort::SenderState *senderState = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState); + + fatal_if(!senderState, "did not get the right sort of sender state"); + + GPUDynInstPtr gpuDynInst = senderState->getMemInst(); + + delete packet->senderState; + delete packet->req; + delete packet; + + computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst); + return true; +} + +/** + * attempt to send this packet, either the port is already stalled, the request + * is nack'd and must stall or the request goes through + * when a request cannot be sent, add it to the retries queue + */ +bool +ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt) +{ + ComputeUnit::LDSPort::SenderState *sender_state = + dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState); + fatal_if(!sender_state, "packet without a valid sender state"); + + GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst(); + + if (isStalled()) { + fatal_if(retries.empty(), "must have retries waiting to be stalled"); + + retries.push(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); + return false; + } else if (!MasterPort::sendTimingReq(pkt)) { + // need to stall the LDS port until a recvReqRetry() is received + // this indicates that there is more space + stallPort(); + retries.push(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, pkt->req->getPaddr()); + return false; + } else { + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, pkt->req->getPaddr()); + return true; + } +} + +/** + * the bus is telling the port that there is now space so retrying stalled + * requests should work now + * this allows the port to have a request be nack'd and then have the receiver + * say when there is space, rather than simply retrying the send every cycle + */ +void +ComputeUnit::LDSPort::recvReqRetry() +{ + auto queueSize = retries.size(); + + DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n", + computeUnit->cu_id, queueSize); + + fatal_if(queueSize < 1, + "why was there a recvReqRetry() with no pending reqs?"); + fatal_if(!isStalled(), + "recvReqRetry() happened when the port was not stalled"); + + unstallPort(); + + while 
(!retries.empty()) { + PacketPtr packet = retries.front(); + + DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id); + + if (!MasterPort::sendTimingReq(packet)) { + // Stall port + stallPort(); + DPRINTF(GPUPort, ": LDS send failed again\n"); + break; + } else { + DPRINTF(GPUTLB, ": LDS send successful\n"); + retries.pop(); + } + } +} diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh new file mode 100644 index 000000000..f47c27a0a --- /dev/null +++ b/src/gpu-compute/compute_unit.hh @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Anthony Gutierrez + */ + +#ifndef __COMPUTE_UNIT_HH__ +#define __COMPUTE_UNIT_HH__ + +#include <deque> +#include <map> +#include <unordered_map> +#include <vector> + +#include "base/callback.hh" +#include "base/statistics.hh" +#include "base/types.hh" +#include "enums/PrefetchType.hh" +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/fetch_stage.hh" +#include "gpu-compute/global_memory_pipeline.hh" +#include "gpu-compute/local_memory_pipeline.hh" +#include "gpu-compute/qstruct.hh" +#include "gpu-compute/schedule_stage.hh" +#include "gpu-compute/scoreboard_check_stage.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" + +static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; +static const int MAX_WIDTH_FOR_MEM_INST = 32; + +class NDRange; +class Shader; +class VectorRegisterFile; + +struct ComputeUnitParams; + +enum EXEC_POLICY +{ + OLDEST = 0, + RR +}; + +// List of execution units +enum EXEC_UNIT +{ + SIMD0 = 0, + SIMD1, + SIMD2, + SIMD3, + GLBMEM_PIPE, + LDSMEM_PIPE, + NUM_UNITS +}; + +enum TLB_CACHE +{ + TLB_MISS_CACHE_MISS = 0, + TLB_MISS_CACHE_HIT, + TLB_HIT_CACHE_MISS, + TLB_HIT_CACHE_HIT +}; + +class ComputeUnit : public MemObject +{ + public: + FetchStage fetchStage; + ScoreboardCheckStage scoreboardCheckStage; + ScheduleStage scheduleStage; + ExecStage execStage; + GlobalMemPipeline globalMemoryPipe; + LocalMemPipeline localMemoryPipe; + + // Buffers used to communicate between various pipeline stages + + // List of waves which are ready to be scheduled. + // Each execution resource has a ready list. readyList is + // used to communicate between scoreboardCheck stage and + // schedule stage + // TODO: make enum to index readyList + std::vector<std::vector<Wavefront*>> readyList; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList. waveStatusList is + // used to communicate between scoreboardCheck stage and + // schedule stage + // TODO: convert std::pair to a class to increase readability + std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. 
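+    // In other words, dispatchList[unitId].first is the wave (if any)
+    // picked for execution resource unitId in the current cycle, and
+    // dispatchList[unitId].second holds its DISPATCH_STATUS.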
+ // dispatchList is used to communicate between schedule + // and exec stage + // TODO: convert std::pair to a class to increase readability + std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList; + + int rrNextMemID; // used by RR WF exec policy to cycle through WF's + int rrNextALUWp; + typedef ComputeUnitParams Params; + std::vector<std::vector<Wavefront*>> wfList; + int cu_id; + + // array of vector register files, one per SIMD + std::vector<VectorRegisterFile*> vrf; + // Number of vector ALU units (SIMDs) in CU + int numSIMDs; + // number of pipe stages for bypassing data to next dependent single + // precision vector instruction inside the vector ALU pipeline + int spBypassPipeLength; + // number of pipe stages for bypassing data to next dependent double + // precision vector instruction inside the vector ALU pipeline + int dpBypassPipeLength; + // number of cycles per issue period + int issuePeriod; + + // Number of global and local memory execution resources in CU + int numGlbMemUnits; + int numLocMemUnits; + // tracks the last cycle a vector instruction was executed on a SIMD + std::vector<uint64_t> lastExecCycle; + + // true if we allow a separate TLB per lane + bool perLaneTLB; + // if 0, TLB prefetching is off. + int prefetchDepth; + // if fixed-stride prefetching, this is the stride. + int prefetchStride; + + class LastVaddrWave + { + public: + Addr vaddrs[VSZ]; + Addr& operator[](int idx) { + return vaddrs[idx]; + } + + LastVaddrWave() { + for (int i = 0; i < VSZ; ++i) + vaddrs[i] = 0; + } + }; + + LastVaddrWave lastVaddrCU; + std::vector<LastVaddrWave> lastVaddrPhase; + std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; + Enums::PrefetchType prefetchType; + EXEC_POLICY exec_policy; + + bool xact_cas_mode; + bool debugSegFault; + bool functionalTLB; + bool localMemBarrier; + + /* + * for Counting page accesses + * + * cuExitCallback inherits from Callback. When you register a callback + * function as an exit callback, it will get added to an exit callback + * queue, such that on simulation exit, all callbacks in the callback + * queue will have their process() function called. 
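+     *
+     * Registration is expected to look roughly like the following sketch
+     * (the actual call site is in the ComputeUnit constructor):
+     *
+     *     cuExitCallback = new CUExitCallback(this);
+     *     registerExitCallback(cuExitCallback);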
+ */ + bool countPages; + + Shader *shader; + uint32_t barrier_id; + // vector of Vector ALU (MACC) pipelines + std::vector<WaitClass> aluPipe; + // minimum issue period per SIMD unit (in cycles) + std::vector<WaitClass> wfWait; + + // Resource control for Vector Register File->Global Memory pipe buses + std::vector<WaitClass> vrfToGlobalMemPipeBus; + // Resource control for Vector Register File->Local Memory pipe buses + std::vector<WaitClass> vrfToLocalMemPipeBus; + int nextGlbMemBus; + int nextLocMemBus; + // Resource control for global memory to VRF data/address bus + WaitClass glbMemToVrfBus; + // Resource control for local memory to VRF data/address bus + WaitClass locMemToVrfBus; + + uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes + uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes + uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store + uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load + + Tick req_tick_latency; + Tick resp_tick_latency; + + // number of vector registers being reserved for each SIMD unit + std::vector<int> vectorRegsReserved; + // number of vector registers per SIMD unit + uint32_t numVecRegsPerSimd; + // Support for scheduling VGPR status update events + std::vector<std::pair<uint32_t, uint32_t> > regIdxVec; + std::vector<uint64_t> timestampVec; + std::vector<uint8_t> statusVec; + + void + registerEvent(uint32_t simdId, + uint32_t regIdx, + uint32_t operandSize, + uint64_t when, + uint8_t newStatus) { + regIdxVec.push_back(std::make_pair(simdId, regIdx)); + timestampVec.push_back(when); + statusVec.push_back(newStatus); + if (operandSize > 4) { + regIdxVec.push_back(std::make_pair(simdId, + ((regIdx + 1) % + numVecRegsPerSimd))); + timestampVec.push_back(when); + statusVec.push_back(newStatus); + } + } + + void updateEvents(); + + // this hash map will keep track of page divergence + // per memory instruction per wavefront. The hash map + // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 
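+    // Keys are page-aligned virtual addresses; values are the number of
+    // accesses recorded for that page by updatePageDivergenceDist().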
+ std::map<Addr, int> pagesTouched; + + ComputeUnit(const Params *p); + ~ComputeUnit(); + int spBypassLength() { return spBypassPipeLength; }; + int dpBypassLength() { return dpBypassPipeLength; }; + int storeBusLength() { return numCyclesPerStoreTransfer; }; + int loadBusLength() { return numCyclesPerLoadTransfer; }; + int wfSize() const { return wavefrontSize; }; + + void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + void exec(); + void initiateFetch(Wavefront *wavefront); + void fetch(PacketPtr pkt, Wavefront *wavefront); + void FillKernelState(Wavefront *w, NDRange *ndr); + + void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], + int trueWgSizeTotal); + + void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, + int trueWgSize[], int trueWgSizeTotal, + LdsChunk *ldsChunk, uint64_t origSpillMemStart); + + void StartWorkgroup(NDRange *ndr); + int ReadyWorkgroup(NDRange *ndr); + + bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } + bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } + bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } + int GlbMemUnitId() { return GLBMEM_PIPE; } + int ShrMemUnitId() { return LDSMEM_PIPE; } + int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } + int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } + /* This function cycles through all the wavefronts in all the phases to see + * if all of the wavefronts which should be associated with one barrier + * (denoted with _barrier_id), are all at the same barrier in the program + * (denoted by bcnt). When the number at the barrier matches bslots, then + * return true. + */ + int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); + bool cedeSIMD(int simdId, int wfSlotId); + + template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst); + virtual void init(); + void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, + bool kernelLaunch=true, + RequestPtr req=nullptr); + void handleMemPacket(PacketPtr pkt, int memport_index); + bool processTimingPacket(PacketPtr pkt); + void processFetchReturn(PacketPtr pkt); + void updatePageDivergenceDist(Addr addr); + + MasterID masterId() { return _masterId; } + + bool isDone() const; + bool isSimdDone(uint32_t) const; + + protected: + MasterID _masterId; + + LdsState &lds; + + public: + // the following stats compute the avg. TLB accesslatency per + // uncoalesced request (only for data) + Stats::Scalar tlbRequests; + Stats::Scalar tlbCycles; + Stats::Formula tlbLatency; + // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. + Stats::Vector hitsPerTLBLevel; + + Stats::Scalar ldsBankAccesses; + Stats::Distribution ldsBankConflictDist; + + // over all memory instructions executed over all wavefronts + // how many touched 0-4 pages, 4-8, ..., 60-64 pages + Stats::Distribution pageDivergenceDist; + Stats::Scalar dynamicGMemInstrCnt; + Stats::Scalar dynamicLMemInstrCnt; + + Stats::Scalar wgBlockedDueLdsAllocation; + // Number of instructions executed, i.e. 
if 64 (or 32 or 7) lanes are active + // when the instruction is committed, this number is still incremented by 1 + Stats::Scalar numInstrExecuted; + // Number of cycles among successive instruction executions across all + // wavefronts of the same CU + Stats::Distribution execRateDist; + // number of individual vector operations executed + Stats::Scalar numVecOpsExecuted; + // Total cycles that something is running on the GPU + Stats::Scalar totalCycles; + Stats::Formula vpc; // vector ops per cycle + Stats::Formula ipc; // vector instructions per cycle + Stats::Distribution controlFlowDivergenceDist; + Stats::Distribution activeLanesPerGMemInstrDist; + Stats::Distribution activeLanesPerLMemInstrDist; + // number of vector ALU instructions received + Stats::Formula numALUInstsExecuted; + // number of times a WG can not start due to lack of free VGPRs in SIMDs + Stats::Scalar numTimesWgBlockedDueVgprAlloc; + Stats::Scalar numCASOps; + Stats::Scalar numFailedCASOps; + Stats::Scalar completedWfs; + // flag per vector SIMD unit that is set when there is at least one + // WV that has a vector ALU instruction as the oldest in its + // Instruction Buffer: Defined in the Scoreboard stage, consumed + // by the Execute stage. + std::vector<bool> vectorAluInstAvail; + // number of available (oldest) LDS instructions that could have + // been issued to the LDS at a specific issue slot + int shrMemInstAvail; + // number of available Global memory instructions that could have + // been issued to TCP at a specific issue slot + int glbMemInstAvail; + + void + regStats(); + + LdsState & + getLds() const + { + return lds; + } + + int32_t + getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; + + bool + sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); + + typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct; + pageDataStruct pageAccesses; + + class CUExitCallback : public Callback + { + private: + ComputeUnit *computeUnit; + + public: + virtual ~CUExitCallback() { } + + CUExitCallback(ComputeUnit *_cu) + { + computeUnit = _cu; + } + + virtual void + process(); + }; + + CUExitCallback *cuExitCallback; + + /** Data access Port **/ + class DataPort : public MasterPort + { + public: + DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index) { } + + bool snoopRangeSent; + + struct SenderState : public Packet::SenderState + { + GPUDynInstPtr _gpuDynInst; + int port_index; + Packet::SenderState *saved; + + SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, + Packet::SenderState *sender_state=nullptr) + : _gpuDynInst(gpuDynInst), + port_index(_port_index), + saved(sender_state) { } + }; + + class MemReqEvent : public Event + { + private: + DataPort *dataPort; + PacketPtr pkt; + + public: + MemReqEvent(DataPort *_data_port, PacketPtr _pkt) + : Event(), dataPort(_data_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + class MemRespEvent : public Event + { + private: + DataPort *dataPort; + PacketPtr pkt; + + public: + MemRespEvent(DataPort *_data_port, PacketPtr _pkt) + : Event(), dataPort(_data_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries; + + protected: + ComputeUnit *computeUnit; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { 
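+            // atomic accesses are not modeled through this port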
return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) + { + resp.clear(); + snoop = true; + } + + }; + + // Instruction cache access port + class SQCPort : public MasterPort + { + public: + SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index) { } + + bool snoopRangeSent; + + struct SenderState : public Packet::SenderState + { + Wavefront *wavefront; + Packet::SenderState *saved; + + SenderState(Wavefront *_wavefront, Packet::SenderState + *sender_state=nullptr) + : wavefront(_wavefront), saved(sender_state) { } + }; + + std::deque<std::pair<PacketPtr, Wavefront*>> retries; + + protected: + ComputeUnit *computeUnit; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) + { + resp.clear(); + snoop = true; + } + }; + + /** Data TLB port **/ + class DTLBPort : public MasterPort + { + public: + DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index), stalled(false) + { } + + bool isStalled() { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the translation requests that were + * not successfully sent. + */ + std::deque<PacketPtr> retries; + + /** SenderState is information carried along with the packet + * throughout the TLB hierarchy + */ + struct SenderState: public Packet::SenderState + { + // the memInst that this is associated with + GPUDynInstPtr _gpuDynInst; + + // the lane in the memInst this is associated with, so we send + // the memory request down the right port + int portIndex; + + // constructor used for packets involved in timing accesses + SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) + : _gpuDynInst(gpuDynInst), portIndex(port_index) { } + + }; + + protected: + ComputeUnit *computeUnit; + int index; + bool stalled; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + class ITLBPort : public MasterPort + { + public: + ITLBPort(const std::string &_name, ComputeUnit *_cu) + : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { } + + + bool isStalled() { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the translation requests that were + * not successfully sent. 
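+         * The port remains stalled while requests are queued here; they are
+         * re-sent from recvReqRetry() once the TLB can accept them again.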
+ */ + std::deque<PacketPtr> retries; + + /** SenderState is information carried along with the packet + * throughout the TLB hierarchy + */ + struct SenderState: public Packet::SenderState + { + // The wavefront associated with this request + Wavefront *wavefront; + + SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { } + }; + + protected: + ComputeUnit *computeUnit; + bool stalled; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + /** + * the port intended to communicate between the CU and its LDS + */ + class LDSPort : public MasterPort + { + public: + LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) + : MasterPort(_name, _cu, _id), computeUnit(_cu) + { + } + + bool isStalled() const { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the requests that were + * not successfully sent. + */ + std::queue<PacketPtr> retries; + + /** + * SenderState is information carried along with the packet, esp. the + * GPUDynInstPtr + */ + class SenderState: public Packet::SenderState + { + protected: + // The actual read/write/atomic request that goes with this command + GPUDynInstPtr _gpuDynInst = nullptr; + + public: + SenderState(GPUDynInstPtr gpuDynInst): + _gpuDynInst(gpuDynInst) + { + } + + GPUDynInstPtr + getMemInst() const + { + return _gpuDynInst; + } + }; + + virtual bool + sendTimingReq(PacketPtr pkt); + + protected: + + bool stalled = false; ///< whether or not it is stalled + + ComputeUnit *computeUnit; + + virtual bool + recvTimingResp(PacketPtr pkt); + + virtual Tick + recvAtomic(PacketPtr pkt) { return 0; } + + virtual void + recvFunctional(PacketPtr pkt) + { + } + + virtual void + recvRangeChange() + { + } + + virtual void + recvReqRetry(); + }; + + /** The port to access the Local Data Store + * Can be connected to a LDS object + */ + LDSPort *ldsPort = nullptr; + + LDSPort * + getLdsPort() const + { + return ldsPort; + } + + /** The memory port for SIMD data accesses. + * Can be connected to PhysMem for Ruby for timing simulations + */ + std::vector<DataPort*> memPort; + // port to the TLB hierarchy (i.e., the L1 TLB) + std::vector<DTLBPort*> tlbPort; + // port to the SQC (i.e. 
the I-cache) + SQCPort *sqcPort; + // port to the SQC TLB (there's a separate TLB for each I-cache) + ITLBPort *sqcTLBPort; + + virtual BaseMasterPort& + getMasterPort(const std::string &if_name, PortID idx) + { + if (if_name == "memory_port") { + memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *memPort[idx]; + } else if (if_name == "translation_port") { + tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *tlbPort[idx]; + } else if (if_name == "sqc_port") { + sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *sqcPort; + } else if (if_name == "sqc_tlb_port") { + sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); + return *sqcTLBPort; + } else if (if_name == "ldsPort") { + if (ldsPort) { + fatal("an LDS port was already allocated"); + } + ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); + return *ldsPort; + } else { + panic("incorrect port name"); + } + } + + // xact_cas_load() + class waveIdentifier + { + public: + waveIdentifier() { } + waveIdentifier(int _simdId, int _wfSlotId) + : simdId(_simdId), wfSlotId(_wfSlotId) { } + + int simdId; + int wfSlotId; + }; + + class waveQueue + { + public: + std::list<waveIdentifier> waveIDQueue; + }; + std::map<unsigned, waveQueue> xactCasLoadMap; + + uint64_t getAndIncSeqNum() { return globalSeqNum++; } + + private: + uint64_t globalSeqNum; + int wavefrontSize; +}; + +#endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/condition_register_state.cc b/src/gpu-compute/condition_register_state.cc new file mode 100644 index 000000000..f3f2d2927 --- /dev/null +++ b/src/gpu-compute/condition_register_state.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#include "gpu-compute/condition_register_state.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" + +ConditionRegisterState::ConditionRegisterState() +{ + computeUnit = nullptr; + c_reg.clear(); + busy.clear(); +} + +void +ConditionRegisterState::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + _name = computeUnit->name() + ".CondRegState"; +} + +void +ConditionRegisterState::init(uint32_t _size) +{ + c_reg.resize(_size); + busy.resize(_size, 0); +} + +void +ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w) +{ + // iterate over all operands + for (auto i = 0; i < ii->getNumOperands(); ++i) { + // is this a condition register destination operand? + if (ii->isCondRegister(i) && ii->isDstOperand(i)) { + // mark the register as busy + markReg(ii->getRegisterIndex(i), 1); + uint32_t pipeLen = w->computeUnit->spBypassLength(); + + // schedule an event for marking the register as ready + w->computeUnit-> + registerEvent(w->simdId, ii->getRegisterIndex(i), + ii->getOperandSize(i), + w->computeUnit->shader->tick_cnt + + w->computeUnit->shader->ticks(pipeLen), 0); + } + } +} diff --git a/src/gpu-compute/condition_register_state.hh b/src/gpu-compute/condition_register_state.hh new file mode 100644 index 000000000..139874a66 --- /dev/null +++ b/src/gpu-compute/condition_register_state.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#ifndef __CONDITION_REGISTER_STATE_HH__ +#define __CONDITION_REGISTER_STATE_HH__ + +#include <string> +#include <vector> + +#include "gpu-compute/misc.hh" + +class ComputeUnit; +class GPUStaticInst; +class Shader; +class Wavefront; + +// Condition Register State (used only when executing HSAIL) +class ConditionRegisterState +{ + public: + ConditionRegisterState(); + void init(uint32_t _size); + const std::string name() const { return _name; } + void setParent(ComputeUnit *_computeUnit); + void regStats() { } + + template<typename T> + T + read(int regIdx, int threadId) + { + bool tmp = c_reg[regIdx][threadId]; + T *p0 = (T*)(&tmp); + + return *p0; + } + + template<typename T> + void + write(int regIdx, int threadId, T value) + { + c_reg[regIdx][threadId] = (bool)(value & 0x01); + } + + void + markReg(int regIdx, uint8_t value) + { + busy.at(regIdx) = value; + } + + uint8_t + regBusy(int idx) + { + uint8_t status = busy.at(idx); + return status; + } + + int numRegs() { return c_reg.size(); } + void exec(GPUStaticInst *ii, Wavefront *w); + + private: + ComputeUnit* computeUnit; + std::string _name; + // Condition Register state + std::vector<VectorMask> c_reg; + // flag indicating if a register is busy + std::vector<uint8_t> busy; +}; + +#endif diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc new file mode 100644 index 000000000..55e4be72a --- /dev/null +++ b/src/gpu-compute/dispatcher.cc @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Marc Orr + */ + + +#include "gpu-compute/dispatcher.hh" + +#include "cpu/base.hh" +#include "debug/GPUDisp.hh" +#include "gpu-compute/cl_driver.hh" +#include "gpu-compute/cl_event.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/packet_access.hh" + +GpuDispatcher *GpuDispatcher::instance = nullptr; + +GpuDispatcher::GpuDispatcher(const Params *p) + : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")), + pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency), + dispatchCount(0), dispatchActive(false), cpu(p->cpu), + shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this) +{ + shader->handshake(this); + driver->handshake(this); + + ndRange.wg_disp_rem = false; + ndRange.globalWgId = 0; + + schedule(&tickEvent, 0); + + // translation port for the dispatcher + tlbPort = new TLBPort(csprintf("%s-port%d", name()), this); + + num_kernelLaunched + .name(name() + ".num_kernel_launched") + .desc("number of kernel launched") + ; +} + +GpuDispatcher *GpuDispatcherParams::create() +{ + GpuDispatcher *dispatcher = new GpuDispatcher(this); + GpuDispatcher::setInstance(dispatcher); + + return GpuDispatcher::getInstance(); +} + +void +GpuDispatcher::serialize(CheckpointOut &cp) const +{ + Tick event_tick = 0; + + if (ndRange.wg_disp_rem) + fatal("Checkpointing not supported during active workgroup execution"); + + if (tickEvent.scheduled()) + event_tick = tickEvent.when(); + + SERIALIZE_SCALAR(event_tick); + +} + +void +GpuDispatcher::unserialize(CheckpointIn &cp) +{ + Tick event_tick; + + if (tickEvent.scheduled()) + deschedule(&tickEvent); + + UNSERIALIZE_SCALAR(event_tick); + + if (event_tick) + schedule(&tickEvent, event_tick); +} + +AddrRangeList +GpuDispatcher::getAddrRanges() const +{ + AddrRangeList ranges; + + DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n", + pioAddr, pioSize); + + ranges.push_back(RangeSize(pioAddr, pioSize)); + + return ranges; +} + +Tick +GpuDispatcher::read(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr); + assert(pkt->getAddr() < pioAddr + pioSize); + + int offset = pkt->getAddr() - pioAddr; + pkt->allocate(); + + DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize()); + + if (offset < 8) { + assert(!offset); + assert(pkt->getSize() == 8); + + uint64_t retval = dispatchActive; + pkt->set(retval); + } else { + offset -= 8; + assert(offset + pkt->getSize() < sizeof(HsaQueueEntry)); + char *curTaskPtr = (char*)&curTask; + + memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize()); + } + + pkt->makeAtomicResponse(); + + return pioDelay; +} + +Tick +GpuDispatcher::write(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr); + assert(pkt->getAddr() < pioAddr + pioSize); + + int offset = pkt->getAddr() - pioAddr; + +#if TRACING_ON + uint64_t data_val = 0; + + switch (pkt->getSize()) { + case 1: + data_val = pkt->get<uint8_t>(); + break; + case 2: + data_val = pkt->get<uint16_t>(); + break; + case 4: + data_val = pkt->get<uint32_t>(); + break; + case 8: + data_val = pkt->get<uint64_t>(); + break; + default: + DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize()); + } + + DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val, + pkt->getSize()); +#endif + if (!offset) { + static int nextId = 0; + + // The depends field of the qstruct, which was previously unused, is + // used to communicate with simulated application. 
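+        // Rough sketch of that handshake, hedged and based only on the code
+        // in this file: the simulated runtime appears to place a pointer to
+        // a HostState struct in curTask.depends; the dispatcher reads that
+        // struct back through the shader, writes the kernel start timestamp
+        // into the _cl_event that HostState points at here, and writes the
+        // matching end timestamp later in notifyWgCompl().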
+ if (curTask.depends) { + HostState hs; + shader->ReadMem((uint64_t)(curTask.depends), &hs, + sizeof(HostState), 0); + + // update event start time (in nano-seconds) + uint64_t start = curTick() / 1000; + + shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start), + &start, sizeof(uint64_t), 0); + } + + // launch kernel + ++num_kernelLaunched; + + NDRange *ndr = &(ndRangeMap[nextId]); + // copy dispatch info + ndr->q = curTask; + + // update the numDispTask polled by the runtime + accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1); + + ndr->numWgTotal = 1; + + for (int i = 0; i < 3; ++i) { + ndr->wgId[i] = 0; + ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]); + ndr->numWgTotal *= ndr->numWg[i]; + } + + ndr->numWgCompleted = 0; + ndr->globalWgId = 0; + ndr->wg_disp_rem = true; + ndr->execDone = false; + ndr->addrToNotify = (volatile bool*)curTask.addrToNotify; + ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft; + ndr->dispatchId = nextId; + ndr->curTid = pkt->req->threadId(); + DPRINTF(GPUDisp, "launching kernel %d\n",nextId); + execIds.push(nextId); + ++nextId; + + dispatchActive = true; + + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->ticks(1)); + } + } else { + // populate current task struct + // first 64 bits are launch reg + offset -= 8; + assert(offset < sizeof(HsaQueueEntry)); + char *curTaskPtr = (char*)&curTask; + memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize()); + } + + pkt->makeAtomicResponse(); + + return pioDelay; +} + + +BaseMasterPort& +GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "translation_port") { + return *tlbPort; + } + + return DmaDevice::getMasterPort(if_name, idx); +} + +void +GpuDispatcher::exec() +{ + int fail_count = 0; + + // There are potentially multiple outstanding kernel launches. + // It is possible that the workgroups in a different kernel + // can fit on the GPU even if another kernel's workgroups cannot + DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); + + while (execIds.size() > fail_count) { + int execId = execIds.front(); + + while (ndRangeMap[execId].wg_disp_rem) { + //update the thread context + shader->updateThreadContext(ndRangeMap[execId].curTid); + + // attempt to dispatch_workgroup + if (!shader->dispatch_workgroups(&ndRangeMap[execId])) { + // if we failed try the next kernel, + // it may have smaller workgroups. 
+ // put it on the queue to rety latter + DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId); + execIds.push(execId); + ++fail_count; + break; + } + } + // let's try the next kernel_id + execIds.pop(); + } + + DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); + + if (doneIds.size() && cpu) { + shader->hostWakeUp(cpu); + } + + while (doneIds.size()) { + // wakeup the CPU if any Kernels completed this cycle + DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front()); + doneIds.pop(); + } +} + +void +GpuDispatcher::notifyWgCompl(Wavefront *w) +{ + int kern_id = w->kern_id; + DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id); + assert(ndRangeMap[kern_id].dispatchId == kern_id); + ndRangeMap[kern_id].numWgCompleted++; + + if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) { + ndRangeMap[kern_id].execDone = true; + doneIds.push(kern_id); + + if (ndRangeMap[kern_id].addrToNotify) { + accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1, + 0); + } + + accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1); + + // update event end time (in nano-seconds) + if (ndRangeMap[kern_id].q.depends) { + HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends; + uint64_t event; + shader->ReadMem((uint64_t)(&host_state->event), &event, + sizeof(uint64_t), 0); + + uint64_t end = curTick() / 1000; + + shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end, + sizeof(uint64_t), 0); + } + } + + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->ticks(1)); + } +} + +void +GpuDispatcher::scheduleDispatch() +{ + if (!tickEvent.scheduled()) + schedule(&tickEvent, curTick() + shader->ticks(1)); +} + +void +GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) +{ + if (cpu) { + if (off) { + shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq, + true); + val += off; + } + + shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true); + } else { + panic("Cannot find host"); + } +} + +GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher) + : Event(CPU_Tick_Pri), dispatcher(_dispatcher) +{ +} + +void +GpuDispatcher::TickEvent::process() +{ + dispatcher->exec(); +} + +const char* +GpuDispatcher::TickEvent::description() const +{ + return "GPU Dispatcher tick"; +} + +// helper functions for driver to retrieve GPU attributes +int +GpuDispatcher::getNumCUs() +{ + return shader->cuList.size(); +} + +void +GpuDispatcher::setFuncargsSize(int funcargs_size) +{ + shader->funcargs_size = funcargs_size; +} diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh new file mode 100644 index 000000000..76f932655 --- /dev/null +++ b/src/gpu-compute/dispatcher.hh @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __GPU_DISPATCHER_HH__ +#define __GPU_DISPATCHER_HH__ + +#include <queue> +#include <vector> + +#include "base/statistics.hh" +#include "dev/dma_device.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/port.hh" +#include "params/GpuDispatcher.hh" + +class BaseCPU; +class Shader; + +class GpuDispatcher : public DmaDevice +{ + public: + typedef GpuDispatcherParams Params; + + class TickEvent : public Event + { + private: + GpuDispatcher *dispatcher; + + public: + TickEvent(GpuDispatcher *); + void process(); + const char *description() const; + }; + + MasterID masterId() { return _masterId; } + + protected: + MasterID _masterId; + + // Base and length of PIO register space + Addr pioAddr; + Addr pioSize; + Tick pioDelay; + + HsaQueueEntry curTask; + + std::unordered_map<int, NDRange> ndRangeMap; + NDRange ndRange; + + // list of kernel_ids to launch + std::queue<int> execIds; + // list of kernel_ids that have finished + std::queue<int> doneIds; + + uint64_t dispatchCount; + // is there a kernel in execution? + bool dispatchActive; + + BaseCPU *cpu; + Shader *shader; + ClDriver *driver; + TickEvent tickEvent; + + static GpuDispatcher *instance; + + // sycall emulation mode can have only 1 application running(?) + // else we have to do some pid based tagging + // unused + typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer; + TranslationBuffer tlb; + + public: + /*statistics*/ + Stats::Scalar num_kernelLaunched; + GpuDispatcher(const Params *p); + + ~GpuDispatcher() { } + + void exec(); + virtual void serialize(CheckpointOut &cp) const; + virtual void unserialize(CheckpointIn &cp); + void notifyWgCompl(Wavefront *w); + void scheduleDispatch(); + void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off); + + // using singleton so that glue code can pass pointer locations + // to the dispatcher. 
when there are multiple dispatchers, we can + // call something like getInstance(index) + static void + setInstance(GpuDispatcher *_instance) + { + instance = _instance; + } + + static GpuDispatcher* getInstance() { return instance; } + + class TLBPort : public MasterPort + { + public: + + TLBPort(const std::string &_name, GpuDispatcher *_dispatcher) + : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { } + + protected: + GpuDispatcher *dispatcher; + + virtual bool recvTimingResp(PacketPtr pkt) { return true; } + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry() { } + + }; + + TLBPort *tlbPort; + + virtual BaseMasterPort& getMasterPort(const std::string &if_name, + PortID idx); + + AddrRangeList getAddrRanges() const; + Tick read(PacketPtr pkt); + Tick write(PacketPtr pkt); + + // helper functions to retrieve/set GPU attributes + int getNumCUs(); + void setFuncargsSize(int funcargs_size); +}; + +#endif // __GPU_DISPATCHER_HH__ diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc new file mode 100644 index 000000000..c2b95f85e --- /dev/null +++ b/src/gpu-compute/exec_stage.cc @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/exec_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), + vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr), + shrMemInstAvail(nullptr), lastTimeInstExecuted(false), + thisTimeInstExecuted(false), instrExecuted (false), + executionResourcesUsed(0) +{ + numTransActiveIdle = 0; + idle_dur = 0; +} + +void +ExecStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ExecStage"; + dispatchList = &computeUnit->dispatchList; + vectorAluInstAvail = &(computeUnit->vectorAluInstAvail); + glbMemInstAvail= &(computeUnit->glbMemInstAvail); + shrMemInstAvail= &(computeUnit->shrMemInstAvail); + idle_dur = 0; +} + +void +ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { + if (stage == IdleExec) { + // count cycles of no vector ALU instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) { + numCyclesWithNoInstrTypeIssued[unitId]++; + } + + // count cycles of no global memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*glbMemInstAvail)--; + } + + // count cycles of no shared memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*shrMemInstAvail)--; + } + } else if (stage == BusyExec) { + // count the number of cycles an instruction to a specific unit + // was issued + numCyclesWithInstrTypeIssued[unitId]++; + thisTimeInstExecuted = true; + instrExecuted = true; + ++executionResourcesUsed; + } else if (stage == PostExec) { + // count the number of transitions from active to idle + if (lastTimeInstExecuted && !thisTimeInstExecuted) { + ++numTransActiveIdle; + } + + if (!lastTimeInstExecuted && thisTimeInstExecuted) { + idleDur.sample(idle_dur); + idle_dur = 0; + } else if (!thisTimeInstExecuted) { + idle_dur++; + } + + lastTimeInstExecuted = thisTimeInstExecuted; + // track the number of cycles we either issued one vector instruction + // or issued no instructions at all + if (instrExecuted) { + numCyclesWithInstrIssued++; + } else { + numCyclesWithNoIssue++; + } + + spc.sample(executionResourcesUsed); + } +} + +void +ExecStage::initStatistics() +{ + instrExecuted = false; + executionResourcesUsed = 0; + thisTimeInstExecuted = false; +} + +void +ExecStage::exec() +{ + initStatistics(); + + for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) { + // if dispatch list for this execution resource is empty, + // skip this execution resource this cycle + if (dispatchList->at(unitId).second == EMPTY) { + collectStatistics(IdleExec, unitId); + continue; + } + + collectStatistics(BusyExec, unitId); + // execute an instruction for the WF + dispatchList->at(unitId).first->exec(); + // clear the dispatch list entry + dispatchList->at(unitId).second = EMPTY; + dispatchList->at(unitId).first = (Wavefront*)nullptr; + } + + collectStatistics(PostExec, 0); +} + +void +ExecStage::regStats() +{ + numTransActiveIdle + .name(name() + ".num_transitions_active_to_idle") + .desc("number of CU 
transitions from active to idle") + ; + + numCyclesWithNoIssue + .name(name() + ".num_cycles_with_no_issue") + .desc("number of cycles the CU issues nothing") + ; + + numCyclesWithInstrIssued + .name(name() + ".num_cycles_with_instr_issued") + .desc("number of cycles the CU issued at least one instruction") + ; + + spc + .init(0, numSIMDs + numMemUnits, 1) + .name(name() + ".spc") + .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") + ; + + idleDur + .init(0,75,5) + .name(name() + ".idle_duration_in_cycles") + .desc("duration of idle periods in cycles") + ; + + numCyclesWithInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instrtype_issue") + .desc("Number of cycles at least one instruction of specific type " + "issued") + ; + + numCyclesWithNoInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instr_type_no_issue") + .desc("Number of cycles no instruction of specific type issued") + ; + + for (int i = 0; i < numSIMDs; ++i) { + numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + } + + numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); +} diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh new file mode 100644 index 000000000..2de74366b --- /dev/null +++ b/src/gpu-compute/exec_stage.hh @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __EXEC_STAGE_HH__ +#define __EXEC_STAGE_HH__ + +#include <string> +#include <utility> +#include <vector> + +#include "sim/stats.hh" + +class ComputeUnit; +class Wavefront; +struct ComputeUnitParams; + +enum STAT_STATUS +{ + IdleExec, + BusyExec, + PostExec +}; + +enum DISPATCH_STATUS +{ + EMPTY = 0, + FILLED +}; + +// Execution stage. +// Each execution resource executes the +// wave which is in its dispatch list. +// The schedule stage is responsible for +// adding a wave into each execution resource's +// dispatch list. + +class ExecStage +{ + public: + ExecStage(const ComputeUnitParams* params); + ~ExecStage() { } + void init(ComputeUnit *cu); + void exec(); + + std::string name() { return _name; } + void regStats(); + // number of idle cycles + Stats::Scalar numCyclesWithNoIssue; + // number of busy cycles + Stats::Scalar numCyclesWithInstrIssued; + // number of cycles (per execution unit) during which at least one + // instruction was issued to that unit + Stats::Vector numCyclesWithInstrTypeIssued; + // number of idle cycles (per execution unit) during which the unit issued + // no instruction targeting that unit, even though there is at least one + // Wavefront with such an instruction as the oldest + Stats::Vector numCyclesWithNoInstrTypeIssued; + // SIMDs active per cycle + Stats::Distribution spc; + + private: + void collectStatistics(enum STAT_STATUS stage, int unitId); + void initStatistics(); + ComputeUnit *computeUnit; + uint32_t numSIMDs; + + // Number of memory execution resources; + // both global and local memory execution resources in CU + uint32_t numMemUnits; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. + // dispatchList is used to communicate between schedule + // and exec stage + std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList; + // flag per vector SIMD unit that is set when there is at least one + // WV that has a vector ALU instruction as the oldest in its + // Instruction Buffer + std::vector<bool> *vectorAluInstAvail; + int *glbMemInstAvail; + int *shrMemInstAvail; + bool lastTimeInstExecuted; + bool thisTimeInstExecuted; + bool instrExecuted; + Stats::Scalar numTransActiveIdle; + Stats::Distribution idleDur; + uint32_t executionResourcesUsed; + uint64_t idle_dur; + std::string _name; +}; + +#endif // __EXEC_STAGE_HH__ diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc new file mode 100644 index 000000000..1f5e6ded3 --- /dev/null +++ b/src/gpu-compute/fetch_stage.cc @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#include "gpu-compute/fetch_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs), + computeUnit(nullptr) +{ + for (int j = 0; j < numSIMDs; ++j) { + FetchUnit newFetchUnit(p); + fetchUnit.push_back(newFetchUnit); + } +} + +FetchStage::~FetchStage() +{ + fetchUnit.clear(); +} + +void +FetchStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".FetchStage"; + + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); + fetchUnit[j].init(computeUnit); + } +} + +void +FetchStage::exec() +{ + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].exec(); + } +} + +void +FetchStage::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + const unsigned num_instructions = pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); + + instFetchInstReturned.sample(num_instructions); + uint32_t simdId = wavefront->simdId; + fetchUnit[simdId].processFetchReturn(pkt); +} + +void +FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + fetchUnit[wavefront->simdId].fetch(pkt, wavefront); +} + +void +FetchStage::regStats() +{ + instFetchInstReturned + .init(1, 32, 1) + .name(name() + ".inst_fetch_instr_returned") + .desc("For each instruction fetch request recieved record how many " + "instructions you got from it") + ; +} diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh new file mode 100644 index 000000000..ce7faa8ac --- /dev/null +++ b/src/gpu-compute/fetch_stage.hh @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#ifndef __FETCH_STAGE_HH__ +#define __FETCH_STAGE_HH__ + +#include <string> +#include <vector> + +#include "gpu-compute/fetch_unit.hh" + +// Instruction fetch stage. +// All dispatched wavefronts for all SIMDS are analyzed for the +// need to fetch instructions. From the fetch eligible waves, +// one wave is selected from each SIMD and fetch is initiated +// for the selected waves. + +class ComputeUnit; +class Wavefront; + +class FetchStage +{ + public: + FetchStage(const ComputeUnitParams* params); + ~FetchStage(); + void init(ComputeUnit *cu); + void exec(); + void processFetchReturn(PacketPtr pkt); + void fetch(PacketPtr pkt, Wavefront *wave); + + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + Stats::Distribution instFetchInstReturned; + + private: + uint32_t numSIMDs; + ComputeUnit *computeUnit; + + // List of fetch units. A fetch unit is + // instantiated per SIMD + std::vector<FetchUnit> fetchUnit; + std::string _name; +}; + +#endif // __FETCH_STAGE_HH__ diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc new file mode 100644 index 000000000..1f0a7d78e --- /dev/null +++ b/src/gpu-compute/fetch_unit.cc @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Sooraj Puthoor + */ + +#include "gpu-compute/fetch_unit.hh" + +#include "debug/GPUFetch.hh" +#include "debug/GPUPort.hh" +#include "debug/GPUTLB.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/ruby/system/RubySystem.hh" + +uint32_t FetchUnit::globalFetchUnitID; + +FetchUnit::FetchUnit(const ComputeUnitParams* params) : + timingSim(true), + computeUnit(nullptr), + fetchScheduler(params), + waveList(nullptr) +{ +} + +FetchUnit::~FetchUnit() +{ + fetchQueue.clear(); + fetchStatusQueue.clear(); +} + +void +FetchUnit::init(ComputeUnit *cu) +{ + computeUnit = cu; + timingSim = computeUnit->shader->timingSim; + fetchQueue.clear(); + fetchStatusQueue.resize(computeUnit->shader->n_wf); + + for (int j = 0; j < computeUnit->shader->n_wf; ++j) { + fetchStatusQueue[j] = std::make_pair(waveList->at(j), false); + } + + fetchScheduler.bindList(&fetchQueue); +} + +void +FetchUnit::exec() +{ + // re-evaluate waves which are marked as not ready for fetch + for (int j = 0; j < computeUnit->shader->n_wf; ++j) { + // Following code assumes 64-bit opertaion and all insts are + // represented by 64-bit pointers to inst objects. + Wavefront *curWave = fetchStatusQueue[j].first; + assert (curWave); + + // The wavefront has to be active, the IB occupancy has to be + // 4 or less instructions and it can not have any branches to + // prevent speculative instruction fetches + if (!fetchStatusQueue[j].second) { + if (curWave->status == Wavefront::S_RUNNING && + curWave->instructionBuffer.size() <= 4 && + !curWave->instructionBufferHasBranch() && + !curWave->pendingFetch) { + fetchQueue.push_back(curWave); + fetchStatusQueue[j].second = true; + } + } + } + + // Fetch only if there is some wave ready to be fetched + // An empty fetchQueue will cause the schedular to panic + if (fetchQueue.size()) { + Wavefront *waveToBeFetched = fetchScheduler.chooseWave(); + waveToBeFetched->pendingFetch = true; + fetchStatusQueue[waveToBeFetched->wfSlotId].second = false; + initiateFetch(waveToBeFetched); + } +} + +void +FetchUnit::initiateFetch(Wavefront *wavefront) +{ + // calculate the virtual address to fetch from the SQC + Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size(); + vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); + + // Since this is an instruction prefetch, if you're split then just finish + // out the current line. 
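+    // Worked example for the computation below (illustrative values only):
+    // with a 64-byte block and vaddr = 0x1038, roundDown(0x1038 + 63, 64) =
+    // 0x1040 > vaddr, so the request shrinks to 0x1040 - 0x1038 = 8 bytes
+    // and stops at the line boundary; an already line-aligned vaddr such as
+    // 0x1040 yields split_addr == vaddr and keeps the full block_size.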
+ unsigned block_size = RubySystem::getBlockSizeBytes(); + // check for split accesses + Addr split_addr = roundDown(vaddr + block_size - 1, block_size); + unsigned size = block_size; + + if (split_addr > vaddr) { + // misaligned access, just grab the rest of the line + size = split_addr - vaddr; + } + + // set up virtual request + Request *req = new Request(0, vaddr, size, Request::INST_FETCH, + computeUnit->masterId(), 0, 0, 0); + + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + // This fetchBlock is kind of faux right now - because the translations so + // far don't actually return Data + uint64_t fetchBlock; + pkt->dataStatic(&fetchBlock); + + if (timingSim) { + // SenderState needed on Return + pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront); + + // Sender State needed by TLB hierarchy + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc, + false, pkt->senderState); + + if (computeUnit->sqcTLBPort->isStalled()) { + assert(computeUnit->sqcTLBPort->retries.size() > 0); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet is issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. + computeUnit->sqcTLBPort->stallPort(); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); + } + } else { + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc); + + computeUnit->sqcTLBPort->sendFunctional(pkt); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state; + // fetch the instructions from the SQC when we operate in + // functional mode only + fetch(pkt, wavefront); + } +} + +void +FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + + // this is necessary because the GPU TLB receives packets instead of + // requests. when the translation is complete, all relevent fields in the + // request will be populated, but not in the packet. here we create the + // new packet so we can set the size, addr, and proper flags. + PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + + TheGpuISA::RawMachInst *data = + new TheGpuISA::RawMachInst[pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)]; + + pkt->dataDynamic<TheGpuISA::RawMachInst>(data); + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); + + if (timingSim) { + // translation is done. Send the appropriate timing memory request. 
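+        // If the SQC port cannot accept the packet this cycle it is parked
+        // on sqcPort->retries below; presumably it is resent from the
+        // port's recvReqRetry() callback, which lives in the compute unit
+        // code rather than in this file.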
+ + if (!computeUnit->sqcPort->sendTimingReq(pkt)) { + computeUnit->sqcPort->retries.push_back(std::make_pair(pkt, + wavefront)); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } + } else { + computeUnit->sqcPort->sendFunctional(pkt); + processFetchReturn(pkt); + } +} + +void +FetchUnit::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " + "%d bytes, %d instructions!\n", computeUnit->cu_id, + wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), + pkt->req->getSize(), pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)); + + if (wavefront->dropFetch) { + assert(wavefront->instructionBuffer.empty()); + wavefront->dropFetch = false; + } else { + TheGpuISA::RawMachInst *inst_index_ptr = + (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>(); + + assert(wavefront->instructionBuffer.size() <= 4); + + for (int i = 0; i < pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); ++i) { + GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); + + assert(inst_ptr); + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", + computeUnit->cu_id, wavefront->simdId, + wavefront->wfSlotId, inst_ptr->disassemble()); + + GPUDynInstPtr gpuDynInst = + std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr, + computeUnit->getAndIncSeqNum()); + + wavefront->instructionBuffer.push_back(gpuDynInst); + } + } + + wavefront->pendingFetch = false; + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +void +FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list) +{ + waveList = wave_list; +} diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh new file mode 100644 index 000000000..c7c6afb3c --- /dev/null +++ b/src/gpu-compute/fetch_unit.hh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Sooraj Puthoor + */ + +#ifndef __FETCH_UNIT_HH__ +#define __FETCH_UNIT_HH__ + +#include <string> +#include <utility> +#include <vector> + +#include "arch/gpu_decoder.hh" +#include "base/statistics.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/scheduler.hh" +#include "mem/packet.hh" + +class ComputeUnit; +class Wavefront; + +class FetchUnit +{ + public: + FetchUnit(const ComputeUnitParams* params); + ~FetchUnit(); + void init(ComputeUnit *cu); + void exec(); + void bindWaveList(std::vector<Wavefront*> *list); + void initiateFetch(Wavefront *wavefront); + void fetch(PacketPtr pkt, Wavefront *wavefront); + void processFetchReturn(PacketPtr pkt); + static uint32_t globalFetchUnitID; + + private: + bool timingSim; + ComputeUnit *computeUnit; + TheGpuISA::Decoder decoder; + + // Fetch scheduler; Selects one wave from + // the fetch queue for instruction fetching. + // The selection is made according to + // a scheduling policy + Scheduler fetchScheduler; + + // Stores the list of waves that are + // ready to be fetched this cycle + std::vector<Wavefront*> fetchQueue; + + // Stores the fetch status of all waves dispatched to this SIMD. + // TRUE implies the wave is ready to fetch and is already + // moved to fetchQueue + std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue; + + // Pointer to list of waves dispatched on to this SIMD unit + std::vector<Wavefront*> *waveList; +}; + +#endif // __FETCH_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc new file mode 100644 index 000000000..913327412 --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/global_memory_pipeline.hh" + +#include "debug/GPUMem.hh" +#include "debug/GPUReg.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) : + computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size), + inflightStores(0), inflightLoads(0) +{ +} + +void +GlobalMemPipeline::init(ComputeUnit *cu) +{ + computeUnit = cu; + globalMemSize = computeUnit->shader->globalMemSize; + _name = computeUnit->name() + ".GlobalMemPipeline"; +} + +void +GlobalMemPipeline::exec() +{ + // apply any returned global memory operations + GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() : + !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr; + + bool accessVrf = true; + // check the VRF to see if the operands of a load (or load component + // of an atomic) are accessible + if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + accessVrf = + w->computeUnit->vrf[m->simdId]-> + vrfOperandAccessReady(m->seqNum(), w, m, + VrfAccessType::WRITE); + } + + if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) && + m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && + accessVrf && m->statusBitVector == VectorMask(0) && + (computeUnit->shader->coissue_return || + computeUnit->wfWait.at(m->pipeId).rdy())) { + + if (m->v_type == VT_32 && m->m_type == Enums::M_U8) + doGmReturn<uint32_t, uint8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) + doGmReturn<uint32_t, uint16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) + doGmReturn<uint32_t, uint32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) + doGmReturn<int32_t, int8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) + doGmReturn<int32_t, int16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) + doGmReturn<int32_t, int32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) + doGmReturn<float, Float16>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) + doGmReturn<float, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) + doGmReturn<uint64_t, uint8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) + doGmReturn<uint64_t, uint16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) + doGmReturn<uint64_t, uint32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) + doGmReturn<uint64_t, uint64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) + doGmReturn<int64_t, int8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) + doGmReturn<int64_t, int16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) + doGmReturn<int64_t, 
int32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) + doGmReturn<int64_t, int64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) + doGmReturn<double, Float16>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) + doGmReturn<double, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) + doGmReturn<double, double>(m); + } + + // If pipeline has executed a global memory instruction + // execute global memory packets and issue global + // memory packets to DTLB + if (!gmIssuedRequests.empty()) { + GPUDynInstPtr mp = gmIssuedRequests.front(); + if (mp->m_op == Enums::MO_LD || + (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) || + (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) { + + if (inflightLoads >= gmQueueSize) { + return; + } else { + ++inflightLoads; + } + } else { + if (inflightStores >= gmQueueSize) { + return; + } else { + ++inflightStores; + } + } + + mp->initiateAcc(mp); + gmIssuedRequests.pop(); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId, + Enums::MemOpTypeStrings[mp->m_op]); + } +} + +template<typename c0, typename c1> +void +GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) +{ + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + // Return data to registers + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + gmReturnedLoads.pop(); + assert(inflightLoads > 0); + --inflightLoads; + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + std::vector<uint32_t> regVec; + // iterate over number of destination register operands since + // this is a load or atomic operation + for (int k = 0; k < m->n_reg; ++k) { + assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST); + int dst = m->dst_reg + k; + + if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + dst = m->dst_reg_vec[k]; + // virtual->physical VGPR mapping + int physVgpr = w->remap(dst, sizeof(c0), 1); + // save the physical VGPR index + regVec.push_back(physVgpr); + c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (m->exec_mask[i]) { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " + "$%s%d <- %d global ld done (src = wavefront " + "ld inst)\n", w->computeUnit->cu_id, w->simdId, + w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d", + dst, *p1); + // write the value into the physical VGPR. This is a + // purely functional operation. No timing is modeled. + w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. + // This simply models the timing aspect of the VRF write operation. + // It does not modify the physical VGPR. 
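+            // Note, inferred from the statistic it feeds: vrf->exec()
+            // appears to return the number of cycles the VRF write is
+            // delayed by bank conflicts, which is accumulated into
+            // loadVrfBankConflictCycles (see regStats() below).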
+ loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), + w, regVec, sizeof(c0), + m->time); + } + } else { + gmReturnedStores.pop(); + assert(inflightStores > 0); + --inflightStores; + } + + // Decrement outstanding register count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) || + MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time, + -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time, + -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->glbMemToVrfBus.set(m->time); + if (!computeUnit->shader->coissue_return) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +GlobalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles GM data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh new file mode 100644 index 000000000..ed49f6f6b --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __GLOBAL_MEMORY_PIPELINE_HH__ +#define __GLOBAL_MEMORY_PIPELINE_HH__ + +#include <queue> +#include <string> + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file global_memory_pipeline.hh + * + * The global memory pipeline issues newly created global memory packets + * from the pipeline to DTLB. The exec() method of the memory packet issues + * the packet to the DTLB if there is space available in the return fifo. 
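+ *
+ * Backpressure note (as read from the class below): the issue and return
+ * FIFOs are all bounded by gmQueueSize (the global_mem_queue_size
+ * parameter), and the isGMReqFIFOWrRdy(), isGMLdRespFIFOWrRdy() and
+ * isGMStRespFIFOWrRdy() helpers report whether another entry can be
+ * accepted.
+ *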
+ * This stage also retires previously issued loads and stores that have + * returned from the memory sub-system. + */ + +class ComputeUnit; + +class GlobalMemPipeline +{ + public: + GlobalMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m); + + std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; } + std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; } + std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; } + + bool + isGMLdRespFIFOWrRdy() const + { + return gmReturnedLoads.size() < gmQueueSize; + } + + bool + isGMStRespFIFOWrRdy() const + { + return gmReturnedStores.size() < gmQueueSize; + } + + bool + isGMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (gmIssuedRequests.size() + pendReqs) < gmQueueSize; + } + + const std::string &name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int gmQueueSize; + + // number of cycles of delaying the update of a VGPR that is the + // target of a load instruction (or the load component of an atomic) + // The delay is due to VRF bank conflicts + Stats::Scalar loadVrfBankConflictCycles; + // Counters to track the inflight loads and stores + // so that we can provide the proper backpressure + // on the number of inflight memory operations. + int inflightStores; + int inflightLoads; + + // The size of global memory. + int globalMemSize; + + // Global Memory Request FIFO: all global memory requests + // are issued to this FIFO from the memory pipelines + std::queue<GPUDynInstPtr> gmIssuedRequests; + + // Globa Store Response FIFO: all responses of global memory + // stores are sent to this FIFO from TCP + std::queue<GPUDynInstPtr> gmReturnedStores; + + // Global Load Response FIFO: all responses of global memory + // loads are sent to this FIFO from TCP + std::queue<GPUDynInstPtr> gmReturnedLoads; +}; + +#endif // __GLOBAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc new file mode 100644 index 000000000..83e348dbe --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_dyn_inst.hh" + +#include "debug/GPUMem.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" + +GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, + GPUStaticInst *_staticInst, uint64_t instSeqNum) + : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF), + memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false), + statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) +{ + tlbHitLevel.assign(VSZ, -1); +} + +void +GPUDynInst::execute() +{ + GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst, + _seqNum); + staticInst->execute(gpuDynInst); +} + +int +GPUDynInst::numSrcRegOperands() +{ + return staticInst->numSrcRegOperands(); +} + +int +GPUDynInst::numDstRegOperands() +{ + return staticInst->numDstRegOperands(); +} + +int +GPUDynInst::getNumOperands() +{ + return staticInst->getNumOperands(); +} + +bool +GPUDynInst::isVectorRegister(int operandIdx) +{ + return staticInst->isVectorRegister(operandIdx); +} + +bool +GPUDynInst::isScalarRegister(int operandIdx) +{ + return staticInst->isVectorRegister(operandIdx); +} + +int +GPUDynInst::getRegisterIndex(int operandIdx) +{ + return staticInst->getRegisterIndex(operandIdx); +} + +int +GPUDynInst::getOperandSize(int operandIdx) +{ + return staticInst->getOperandSize(operandIdx); +} + +bool +GPUDynInst::isDstOperand(int operandIdx) +{ + return staticInst->isDstOperand(operandIdx); +} + +bool +GPUDynInst::isSrcOperand(int operandIdx) +{ + return staticInst->isSrcOperand(operandIdx); +} + +bool +GPUDynInst::isArgLoad() +{ + return staticInst->isArgLoad(); +} + +const std::string& +GPUDynInst::disassemble() const +{ + return staticInst->disassemble(); +} + +uint64_t +GPUDynInst::seqNum() const +{ + return _seqNum; +} + +Enums::OpType +GPUDynInst::opType() +{ + return staticInst->o_type; +} + +Enums::StorageClassType +GPUDynInst::executedAs() +{ + return staticInst->executed_as; +} + +// Process a memory instruction and (if necessary) submit timing request +void +GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) +{ + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", + cu->cu_id, simdId, wfSlotId, exec_mask); + + staticInst->initiateAcc(gpuDynInst); + time = 0; +} + +bool +GPUDynInst::scalarOp() const +{ + return staticInst->scalarOp(); +} + +void +GPUDynInst::updateStats() +{ + if (staticInst->isLocalMem()) { + // access to LDS (shared) memory + cu->dynamicLMemInstrCnt++; + } else { + // access to global memory + + // update PageDivergence histogram + int number_pages_touched = cu->pagesTouched.size(); + assert(number_pages_touched); + cu->pageDivergenceDist.sample(number_pages_touched); + + std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret; + + for (auto it : cu->pagesTouched) { + // see if this page has been touched before. if not, this also + // inserts the page into the table. 
+ ret = cu->pageAccesses + .insert(ComputeUnit::pageDataStruct::value_type(it.first, + std::make_pair(1, it.second))); + + // if yes, then update the stats + if (!ret.second) { + ret.first->second.first++; + ret.first->second.second += it.second; + } + } + + cu->pagesTouched.clear(); + + // total number of memory instructions (dynamic) + // Atomics are counted as a single memory instruction. + // this is # memory instructions per wavefronts, not per workitem + cu->dynamicGMemInstrCnt++; + } +} diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh new file mode 100644 index 000000000..e44d8f80d --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
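GPUDynInst::updateStats() above relies on the associative container's insert() returning an (iterator, inserted) pair to tell a first touch of a page apart from a repeat access. A small standalone illustration of that idiom, using a plain std::map as a stand-in for ComputeUnit::pageDataStruct (the key and value meanings are assumptions made only for this example):

    #include <cassert>
    #include <map>
    #include <utility>

    int main()
    {
        // page address -> (access count, accumulated per-page metric)
        std::map<unsigned long, std::pair<int, int>> pageAccesses;

        // First touch of page 0x1000: the insert succeeds.
        auto ret = pageAccesses.insert({0x1000ul, std::make_pair(1, 64)});
        assert(ret.second);

        // Repeat touch of the same page: the insert fails, so the existing
        // entry is updated in place, exactly as updateStats() does.
        ret = pageAccesses.insert({0x1000ul, std::make_pair(1, 128)});
        if (!ret.second) {
            ret.first->second.first++;        // bump the access count
            ret.first->second.second += 128;  // accumulate the metric
        }

        assert(ret.first->second.first == 2);
        assert(ret.first->second.second == 192);
        return 0;
    }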
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_DYN_INST_HH__ +#define __GPU_DYN_INST_HH__ + +#include <cstdint> +#include <string> + +#include "enums/GenericMemoryOrder.hh" +#include "enums/GenericMemoryScope.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_exec_context.hh" + +class GPUStaticInst; + +template<typename T> +class AtomicOpAnd : public TypedAtomicOpFunctor<T> +{ + public: + T a; + + AtomicOpAnd(T _a) : a(_a) { } + void execute(T *b) { *b &= a; } +}; + +template<typename T> +class AtomicOpOr : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpOr(T _a) : a(_a) { } + void execute(T *b) { *b |= a; } +}; + +template<typename T> +class AtomicOpXor : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpXor(T _a) : a(_a) {} + void execute(T *b) { *b ^= a; } +}; + +template<typename T> +class AtomicOpCAS : public TypedAtomicOpFunctor<T> +{ + public: + T c; + T s; + + ComputeUnit *computeUnit; + + AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) + : c(_c), s(_s), computeUnit(compute_unit) { } + + void + execute(T *b) + { + computeUnit->numCASOps++; + + if (*b == c) { + *b = s; + } else { + computeUnit->numFailedCASOps++; + } + + if (computeUnit->xact_cas_mode) { + computeUnit->xactCasLoadMap.clear(); + } + } +}; + +template<typename T> +class AtomicOpExch : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpExch(T _a) : a(_a) { } + void execute(T *b) { *b = a; } +}; + +template<typename T> +class AtomicOpAdd : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpAdd(T _a) : a(_a) { } + void execute(T *b) { *b += a; } +}; + +template<typename T> +class AtomicOpSub : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpSub(T _a) : a(_a) { } + void execute(T *b) { *b -= a; } +}; + +template<typename T> +class AtomicOpInc : public TypedAtomicOpFunctor<T> +{ + public: + AtomicOpInc() { } + void execute(T *b) { *b += 1; } +}; + +template<typename T> +class AtomicOpDec : public TypedAtomicOpFunctor<T> +{ + public: + AtomicOpDec() {} + void execute(T *b) { *b -= 1; } +}; + +template<typename T> +class AtomicOpMax : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpMax(T _a) : a(_a) { } + + void + execute(T *b) + { + if (a > *b) + *b = a; + } +}; + +template<typename T> +class AtomicOpMin : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpMin(T _a) : a(_a) {} + + void + execute(T *b) + { + if (a < *b) + *b = a; + } +}; + +#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) +#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) +#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) + +typedef enum +{ + VT_32, + VT_64, +} vgpr_type; + +typedef enum +{ + SEG_PRIVATE, + SEG_SPILL, + SEG_GLOBAL, + SEG_SHARED, + SEG_READONLY, + SEG_FLAT +} seg_type; + +class GPUDynInst : public GPUExecContext +{ + public: + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + uint64_t instSeqNum); + + void execute(); + int numSrcRegOperands(); + int numDstRegOperands(); + int getNumOperands(); + bool isVectorRegister(int operandIdx); + bool isScalarRegister(int operandIdx); + int getRegisterIndex(int operandIdx); + int getOperandSize(int operandIdx); + bool isDstOperand(int operandIdx); + bool isSrcOperand(int operandIdx); + bool isArgLoad(); + + const std::string &disassemble() const; + + uint64_t seqNum() const; + + Enums::OpType opType(); + 
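The TypedAtomicOpFunctor subclasses defined above each capture the atomic's source operand at construction time and apply it in place when the memory system invokes execute(); makeAtomicOpFunctor() further down selects the right subclass from the MemOpType. A tiny usage sketch with made-up values:

    #include "gpu-compute/gpu_dyn_inst.hh"

    // Sketch only: exercise two of the functors the way the memory system
    // would, but on a local word instead of simulated memory.
    void
    atomicFunctorSketch()
    {
        int memoryWord = 10;

        AtomicOpAdd<int> add(5);
        add.execute(&memoryWord);    // memoryWord is now 15

        AtomicOpMax<int> max(12);
        max.execute(&memoryWord);    // 12 < 15, so memoryWord stays 15
    }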
Enums::StorageClassType executedAs(); + + // The address of the memory operation + Addr addr[VSZ]; + Addr pAddr; + + // The data to get written + uint8_t d_data[VSZ * 16]; + // Additional data (for atomics) + uint8_t a_data[VSZ * 8]; + // Additional data (for atomics) + uint8_t x_data[VSZ * 8]; + // The execution mask + VectorMask exec_mask; + + // The memory type (M_U32, M_S32, ...) + Enums::MemType m_type; + // The memory operation (MO_LD, MO_ST, ...) + Enums::MemOpType m_op; + Enums::GenericMemoryOrder memoryOrder; + + // Scope of the request + Enums::GenericMemoryScope scope; + // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) + seg_type s_type; + // The equivalency class + int equiv; + // The return VGPR type (VT_32 or VT_64) + vgpr_type v_type; + // Number of VGPR's accessed (1, 2, or 4) + int n_reg; + // The return VGPR index + int dst_reg; + // There can be max 4 dest regs> + int dst_reg_vec[4]; + // SIMD where the WF of the memory instruction has been mapped to + int simdId; + // unique id of the WF where the memory instruction belongs to + int wfDynId; + // The kernel id of the requesting wf + int kern_id; + // The CU id of the requesting wf + int cu_id; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + // execution pipeline id where the memory instruction has been scheduled + int pipeId; + // The execution time of this operation + Tick time; + // The latency of this operation + WaitClass latency; + // A list of bank conflicts for the 4 cycles. + uint32_t bc[4]; + + // A pointer to ROM + uint8_t *rom; + // The size of the READONLY segment + int sz_rom; + + // Initiate the specified memory operation, by creating a + // memory request and sending it off to the memory system. + void initiateAcc(GPUDynInstPtr gpuDynInst); + + void updateStats(); + + GPUStaticInst* staticInstruction() { return staticInst; } + + // Is the instruction a scalar or vector op? + bool scalarOp() const; + + /* + * Loads/stores/atomics may have acquire/release semantics associated + * withthem. Some protocols want to see the acquire/release as separate + * requests from the load/store/atomic. We implement that separation + * using continuations (i.e., a function pointer with an object associated + * with it). When, for example, the front-end generates a store with + * release semantics, we will first issue a normal store and set the + * continuation in the GPUDynInst to a function that generate a + * release request. That continuation will be called when the normal + * store completes (in ComputeUnit::DataPort::recvTimingResponse). The + * continuation will be called in the context of the same GPUDynInst + * that generated the initial store. 
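 *
 * A minimal sketch of that flow, assuming a hypothetical helper named
 * issueReleaseRequest (not part of this changeset):
 *
 *     // the front-end built a store-release: issue the plain store first
 *     // and arrange for the release request to be generated when the
 *     // store completes
 *     gpuDynInst->useContinuation = true;
 *     gpuDynInst->execContinuation =
 *         [](GPUStaticInst *si, GPUDynInstPtr di) {
 *             issueReleaseRequest(si, di);    // hypothetical helper
 *         };
 *
 *     // later, once the store's response has arrived (e.g. in
 *     // ComputeUnit::DataPort::recvTimingResponse):
 *     if (gpuDynInst->useContinuation)
 *         gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
 *                                      gpuDynInst);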
+ */ + std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation; + + // when true, call execContinuation when response arrives + bool useContinuation; + + template<typename c0> AtomicOpFunctor* + makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + { + using namespace Enums; + + switch(op) { + case MO_AAND: + case MO_ANRAND: + return new AtomicOpAnd<c0>(*reg0); + case MO_AOR: + case MO_ANROR: + return new AtomicOpOr<c0>(*reg0); + case MO_AXOR: + case MO_ANRXOR: + return new AtomicOpXor<c0>(*reg0); + case MO_ACAS: + case MO_ANRCAS: + return new AtomicOpCAS<c0>(*reg0, *reg1, cu); + case MO_AEXCH: + case MO_ANREXCH: + return new AtomicOpExch<c0>(*reg0); + case MO_AADD: + case MO_ANRADD: + return new AtomicOpAdd<c0>(*reg0); + case MO_ASUB: + case MO_ANRSUB: + return new AtomicOpSub<c0>(*reg0); + case MO_AINC: + case MO_ANRINC: + return new AtomicOpInc<c0>(); + case MO_ADEC: + case MO_ANRDEC: + return new AtomicOpDec<c0>(); + case MO_AMAX: + case MO_ANRMAX: + return new AtomicOpMax<c0>(*reg0); + case MO_AMIN: + case MO_ANRMIN: + return new AtomicOpMin<c0>(*reg0); + default: + panic("Unrecognized atomic operation"); + } + } + + void + setRequestFlags(Request *req, bool setMemOrder=true) + { + // currently these are the easy scopes to deduce + switch (s_type) { + case SEG_PRIVATE: + req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); + break; + case SEG_SPILL: + req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); + break; + case SEG_GLOBAL: + req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); + break; + case SEG_READONLY: + req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); + break; + case SEG_SHARED: + req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); + break; + case SEG_FLAT: + // TODO: translate to correct scope + assert(false); + default: + panic("Bad segment type"); + break; + } + + switch (scope) { + case Enums::MEMORY_SCOPE_NONE: + case Enums::MEMORY_SCOPE_WORKITEM: + break; + case Enums::MEMORY_SCOPE_WAVEFRONT: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WAVEFRONT_SCOPE); + break; + case Enums::MEMORY_SCOPE_WORKGROUP: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WORKGROUP_SCOPE); + break; + case Enums::MEMORY_SCOPE_DEVICE: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::DEVICE_SCOPE); + break; + case Enums::MEMORY_SCOPE_SYSTEM: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::SYSTEM_SCOPE); + break; + default: + panic("Bad scope type"); + break; + } + + if (setMemOrder) { + // set acquire and release flags + switch (memoryOrder){ + case Enums::MEMORY_ORDER_SC_ACQUIRE: + req->setFlags(Request::ACQUIRE); + break; + case Enums::MEMORY_ORDER_SC_RELEASE: + req->setFlags(Request::RELEASE); + break; + case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + req->setFlags(Request::ACQUIRE | Request::RELEASE); + break; + default: + break; + } + } + + // set atomic type + // currently, the instruction genenerator only produces atomic return + // but a magic instruction can produce atomic no return + if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || + m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || + m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || + m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || + m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || + m_op == Enums::MO_ACAS) { + req->setFlags(Request::ATOMIC_RETURN_OP); + } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || + m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || + m_op == Enums::MO_ANRXOR || m_op == 
Enums::MO_ANRMAX || + m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || + m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || + m_op == Enums::MO_ANRCAS) { + req->setFlags(Request::ATOMIC_NO_RETURN_OP); + } + } + + // Map returned packets and the addresses they satisfy with which lane they + // were requested from + typedef std::unordered_map<Addr, std::vector<int>> StatusVector; + StatusVector memStatusVector; + + // Track the status of memory requests per lane, a bit per lane + VectorMask statusBitVector; + // for ld_v# or st_v# + std::vector<int> statusVector; + std::vector<int> tlbHitLevel; + + private: + GPUStaticInst *staticInst; + uint64_t _seqNum; +}; + +#endif // __GPU_DYN_INST_HH__ diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc new file mode 100644 index 000000000..4af69c41e --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_exec_context.hh" + +GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf) + : cu(_cu), wf(_wf) +{ +} + +ComputeUnit* +GPUExecContext::computeUnit() +{ + return cu; +} + +Wavefront* +GPUExecContext::wavefront() +{ + return wf; +} diff --git a/src/gpu-compute/gpu_exec_context.hh b/src/gpu-compute/gpu_exec_context.hh new file mode 100644 index 000000000..a3deb9b8f --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.hh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_EXEC_CONTEXT_HH__ +#define __GPU_EXEC_CONTEXT_HH__ + +class ComputeUnit; +class Wavefront; + +class GPUExecContext +{ + public: + GPUExecContext(ComputeUnit *_cu, Wavefront *_wf); + Wavefront* wavefront(); + ComputeUnit* computeUnit(); + + protected: + ComputeUnit *cu; + Wavefront *wf; +}; + +#endif // __GPU_EXEC_CONTEXT_HH__ diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc new file mode 100644 index 000000000..bcb8a5f3d --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_static_inst.hh" + +GPUStaticInst::GPUStaticInst(const std::string &opcode) + : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode), + _instNum(0), _scalarOp(false) +{ +} diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh new file mode 100644 index 000000000..c1de28427 --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.hh @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_STATIC_INST_HH__ +#define __GPU_STATIC_INST_HH__ + +/* + * @file gpu_static_inst.hh + * + * Defines the base class representing static instructions for the GPU. The + * instructions are "static" because they contain no dynamic instruction + * information. GPUStaticInst corresponds to the StaticInst class for the CPU + * models. 
+ */ + +#include <cstdint> +#include <string> + +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/misc.hh" + +class BaseOperand; +class BaseRegOperand; +class Wavefront; + +class GPUStaticInst +{ + public: + GPUStaticInst(const std::string &opcode); + + void instNum(int num) { _instNum = num; } + + int instNum() { return _instNum; } + + void ipdInstNum(int num) { _ipdInstNum = num; } + + int ipdInstNum() const { return _ipdInstNum; } + + virtual void execute(GPUDynInstPtr gpuDynInst) = 0; + virtual void generateDisassembly() = 0; + virtual const std::string &disassemble() = 0; + virtual int getNumOperands() = 0; + virtual bool isCondRegister(int operandIndex) = 0; + virtual bool isScalarRegister(int operandIndex) = 0; + virtual bool isVectorRegister(int operandIndex) = 0; + virtual bool isSrcOperand(int operandIndex) = 0; + virtual bool isDstOperand(int operandIndex) = 0; + virtual int getOperandSize(int operandIndex) = 0; + virtual int getRegisterIndex(int operandIndex) = 0; + virtual int numDstRegOperands() = 0; + virtual int numSrcRegOperands() = 0; + + /* + * Most instructions (including all HSAIL instructions) + * are vector ops, so _scalarOp will be false by default. + * Derived instruction objects that are scalar ops must + * set _scalarOp to true in their constructors. + */ + bool scalarOp() const { return _scalarOp; } + + virtual bool isLocalMem() const + { + fatal("calling isLocalMem() on non-memory instruction.\n"); + + return false; + } + + bool isArgLoad() { return false; } + virtual uint32_t instSize() = 0; + + // only used for memory instructions + virtual void + initiateAcc(GPUDynInstPtr gpuDynInst) + { + fatal("calling initiateAcc() on a non-memory instruction.\n"); + } + + virtual uint32_t getTargetPc() { return 0; } + + /** + * Query whether the instruction is an unconditional jump i.e., the jump + * is always executed because there is no condition to be evaluated. + * + * If the instruction is not of branch type, the result is always false. + * + * @return True if the instruction is an unconditional jump. + */ + virtual bool unconditionalJumpInstruction() { return false; } + + static uint64_t dynamic_id_count; + + Enums::OpType o_type; + // For flat memory accesses + Enums::StorageClassType executed_as; + + protected: + virtual void + execLdAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execLdAcq() on a non-load instruction.\n"); + } + + virtual void + execSt(GPUDynInstPtr gpuDynInst) + { + fatal("calling execLdAcq() on a non-load instruction.\n"); + } + + virtual void + execAtomic(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomic() on a non-atomic instruction.\n"); + } + + virtual void + execAtomicAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomicAcq() on a non-atomic instruction.\n"); + } + + const std::string opcode; + std::string disassembly; + int _instNum; + /** + * Identifier of the immediate post-dominator instruction. + */ + int _ipdInstNum; + + bool _scalarOp; +}; + +#endif // __GPU_STATIC_INST_HH__ diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc new file mode 100644 index 000000000..de005fd04 --- /dev/null +++ b/src/gpu-compute/gpu_tlb.cc @@ -0,0 +1,1801 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
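GPUStaticInst above is the abstract base that every decoded GPU opcode subclasses, and its pure-virtual operand queries are what GPUDynInst forwards to earlier in this patch. A trimmed, hypothetical subclass sketch (NopLikeInst and its trivial answers are illustrative only, not part of this changeset):

    #include "gpu-compute/gpu_static_inst.hh"

    // Hypothetical do-nothing instruction showing which hooks a concrete
    // opcode must provide; the real HSAIL opcodes elsewhere in this patch
    // are considerably more involved.
    class NopLikeInst : public GPUStaticInst
    {
      public:
        NopLikeInst() : GPUStaticInst("nop_like")
        {
            o_type = Enums::OT_ALU;
            generateDisassembly();
        }

        void execute(GPUDynInstPtr) override { }
        void generateDisassembly() override { disassembly = opcode; }
        const std::string &disassemble() override { return disassembly; }

        // No operands, so every operand query has a trivial answer.
        int getNumOperands() override { return 0; }
        bool isCondRegister(int) override { return false; }
        bool isScalarRegister(int) override { return false; }
        bool isVectorRegister(int) override { return false; }
        bool isSrcOperand(int) override { return false; }
        bool isDstOperand(int) override { return false; }
        int getOperandSize(int) override { return 0; }
        int getRegisterIndex(int) override { return 0; }
        int numDstRegOperands() override { return 0; }
        int numSrcRegOperands() override { return 0; }

        // pretend a fixed 4-byte encoding
        uint32_t instSize() override { return 4; }
    };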
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/gpu_tlb.hh" + +#include <cmath> +#include <cstring> + +#include "arch/x86/faults.hh" +#include "arch/x86/insts/microldstop.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/misc.hh" +#include "arch/x86/x86_traits.hh" +#include "base/bitfield.hh" +#include "base/output.hh" +#include "base/trace.hh" +#include "cpu/base.hh" +#include "cpu/thread_context.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUTLB.hh" +#include "mem/packet_access.hh" +#include "mem/page_table.hh" +#include "mem/request.hh" +#include "sim/process.hh" + +namespace X86ISA +{ + + GpuTLB::GpuTLB(const Params *p) + : MemObject(p), configAddress(0), size(p->size), + cleanupEvent(this, false, Event::Maximum_Pri), exitEvent(this) + { + assoc = p->assoc; + assert(assoc <= size); + numSets = size/assoc; + allocationPolicy = p->allocationPolicy; + hasMemSidePort = false; + accessDistance = p->accessDistance; + clock = p->clk_domain->clockPeriod(); + + tlb = new GpuTlbEntry[size]; + std::memset(tlb, 0, sizeof(GpuTlbEntry) * size); + + freeList.resize(numSets); + entryList.resize(numSets); + + for (int set = 0; set < numSets; ++set) { + for (int way = 0; way < assoc; ++way) { + int x = set*assoc + way; + freeList[set].push_back(&tlb[x]); + } + } + + FA = (size == assoc); + + /** + * @warning: the set-associative version assumes you have a + * fixed page size of 4KB. 
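The constructor above derives the set structure purely from size and assoc, and insert()/lookupIt() further below index it with (vaddr >> TheISA::PageShift) & setMask. A worked example with made-up parameters:

    // size = 64 entries, assoc = 4  =>  numSets = 16, setMask = 0xF,
    // and FA = false (fully associative only when size == assoc).
    //
    // With 4KB pages (PageShift == 12), vaddr 0x002a4000 maps to
    //   set = (0x002a4000 >> 12) & 0xF = 0x2a4 & 0xF = 0x4
    // so only the four entries on entryList[4] have to be searched.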
+ * If the page size is greather than 4KB (as defined in the + * TheISA::PageBytes), then there are various issues w/ the current + * implementation (you'd have the same 8KB page being replicated in + * different sets etc) + */ + setMask = numSets - 1; + + #if 0 + // GpuTLB doesn't yet support full system + walker = p->walker; + walker->setTLB(this); + #endif + + maxCoalescedReqs = p->maxOutstandingReqs; + + // Do not allow maxCoalescedReqs to be more than the TLB associativity + if (maxCoalescedReqs > assoc) { + maxCoalescedReqs = assoc; + cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc); + } + + outstandingReqs = 0; + hitLatency = p->hitLatency; + missLatency1 = p->missLatency1; + missLatency2 = p->missLatency2; + + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + } + + // fixme: this is never called? + GpuTLB::~GpuTLB() + { + // make sure all the hash-maps are empty + assert(translationReturnEvent.empty()); + + // delete the TLB + delete[] tlb; + } + + BaseSlavePort& + GpuTLB::getSlavePort(const std::string &if_name, PortID idx) + { + if (if_name == "slave") { + if (idx >= static_cast<PortID>(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } + } + + BaseMasterPort& + GpuTLB::getMasterPort(const std::string &if_name, PortID idx) + { + if (if_name == "master") { + if (idx >= static_cast<PortID>(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + hasMemSidePort = true; + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } + } + + GpuTlbEntry* + GpuTLB::insert(Addr vpn, GpuTlbEntry &entry) + { + GpuTlbEntry *newEntry = nullptr; + + /** + * vpn holds the virtual page address + * The least significant bits are simply masked + */ + int set = (vpn >> TheISA::PageShift) & setMask; + + if (!freeList[set].empty()) { + newEntry = freeList[set].front(); + freeList[set].pop_front(); + } else { + newEntry = entryList[set].back(); + entryList[set].pop_back(); + } + + *newEntry = entry; + newEntry->vaddr = vpn; + entryList[set].push_front(newEntry); + + return newEntry; + } + + GpuTLB::EntryList::iterator + GpuTLB::lookupIt(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + if (FA) { + assert(!set); + } + + auto entry = entryList[set].begin(); + for (; entry != entryList[set].end(); ++entry) { + int page_size = (*entry)->size(); + + if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) { + DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x " + "with size %#x.\n", va, (*entry)->vaddr, page_size); + + if (update_lru) { + entryList[set].push_front(*entry); + entryList[set].erase(entry); + entry = entryList[set].begin(); + } + + break; + } + } + + return entry; + } + + GpuTlbEntry* + GpuTLB::lookup(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + auto entry = lookupIt(va, update_lru); + + if (entry == entryList[set].end()) + return nullptr; + else + return *entry; + 
} + + void + GpuTLB::invalidateAll() + { + DPRINTF(GPUTLB, "Invalidating all entries.\n"); + + for (int i = 0; i < numSets; ++i) { + while (!entryList[i].empty()) { + GpuTlbEntry *entry = entryList[i].front(); + entryList[i].pop_front(); + freeList[i].push_back(entry); + } + } + } + + void + GpuTLB::setConfigAddress(uint32_t addr) + { + configAddress = addr; + } + + void + GpuTLB::invalidateNonGlobal() + { + DPRINTF(GPUTLB, "Invalidating all non global entries.\n"); + + for (int i = 0; i < numSets; ++i) { + for (auto entryIt = entryList[i].begin(); + entryIt != entryList[i].end();) { + if (!(*entryIt)->global) { + freeList[i].push_back(*entryIt); + entryList[i].erase(entryIt++); + } else { + ++entryIt; + } + } + } + } + + void + GpuTLB::demapPage(Addr va, uint64_t asn) + { + + int set = (va >> TheISA::PageShift) & setMask; + auto entry = lookupIt(va, false); + + if (entry != entryList[set].end()) { + freeList[set].push_back(*entry); + entryList[set].erase(entry); + } + } + + Fault + GpuTLB::translateInt(RequestPtr req, ThreadContext *tc) + { + DPRINTF(GPUTLB, "Addresses references internal memory.\n"); + Addr vaddr = req->getVaddr(); + Addr prefix = (vaddr >> 3) & IntAddrPrefixMask; + + if (prefix == IntAddrPrefixCPUID) { + panic("CPUID memory space not yet implemented!\n"); + } else if (prefix == IntAddrPrefixMSR) { + vaddr = vaddr >> 3; + req->setFlags(Request::MMAPPED_IPR); + Addr regNum = 0; + + switch (vaddr & ~IntAddrPrefixMask) { + case 0x10: + regNum = MISCREG_TSC; + break; + case 0x1B: + regNum = MISCREG_APIC_BASE; + break; + case 0xFE: + regNum = MISCREG_MTRRCAP; + break; + case 0x174: + regNum = MISCREG_SYSENTER_CS; + break; + case 0x175: + regNum = MISCREG_SYSENTER_ESP; + break; + case 0x176: + regNum = MISCREG_SYSENTER_EIP; + break; + case 0x179: + regNum = MISCREG_MCG_CAP; + break; + case 0x17A: + regNum = MISCREG_MCG_STATUS; + break; + case 0x17B: + regNum = MISCREG_MCG_CTL; + break; + case 0x1D9: + regNum = MISCREG_DEBUG_CTL_MSR; + break; + case 0x1DB: + regNum = MISCREG_LAST_BRANCH_FROM_IP; + break; + case 0x1DC: + regNum = MISCREG_LAST_BRANCH_TO_IP; + break; + case 0x1DD: + regNum = MISCREG_LAST_EXCEPTION_FROM_IP; + break; + case 0x1DE: + regNum = MISCREG_LAST_EXCEPTION_TO_IP; + break; + case 0x200: + regNum = MISCREG_MTRR_PHYS_BASE_0; + break; + case 0x201: + regNum = MISCREG_MTRR_PHYS_MASK_0; + break; + case 0x202: + regNum = MISCREG_MTRR_PHYS_BASE_1; + break; + case 0x203: + regNum = MISCREG_MTRR_PHYS_MASK_1; + break; + case 0x204: + regNum = MISCREG_MTRR_PHYS_BASE_2; + break; + case 0x205: + regNum = MISCREG_MTRR_PHYS_MASK_2; + break; + case 0x206: + regNum = MISCREG_MTRR_PHYS_BASE_3; + break; + case 0x207: + regNum = MISCREG_MTRR_PHYS_MASK_3; + break; + case 0x208: + regNum = MISCREG_MTRR_PHYS_BASE_4; + break; + case 0x209: + regNum = MISCREG_MTRR_PHYS_MASK_4; + break; + case 0x20A: + regNum = MISCREG_MTRR_PHYS_BASE_5; + break; + case 0x20B: + regNum = MISCREG_MTRR_PHYS_MASK_5; + break; + case 0x20C: + regNum = MISCREG_MTRR_PHYS_BASE_6; + break; + case 0x20D: + regNum = MISCREG_MTRR_PHYS_MASK_6; + break; + case 0x20E: + regNum = MISCREG_MTRR_PHYS_BASE_7; + break; + case 0x20F: + regNum = MISCREG_MTRR_PHYS_MASK_7; + break; + case 0x250: + regNum = MISCREG_MTRR_FIX_64K_00000; + break; + case 0x258: + regNum = MISCREG_MTRR_FIX_16K_80000; + break; + case 0x259: + regNum = MISCREG_MTRR_FIX_16K_A0000; + break; + case 0x268: + regNum = MISCREG_MTRR_FIX_4K_C0000; + break; + case 0x269: + regNum = MISCREG_MTRR_FIX_4K_C8000; + break; + case 0x26A: + regNum = 
MISCREG_MTRR_FIX_4K_D0000; + break; + case 0x26B: + regNum = MISCREG_MTRR_FIX_4K_D8000; + break; + case 0x26C: + regNum = MISCREG_MTRR_FIX_4K_E0000; + break; + case 0x26D: + regNum = MISCREG_MTRR_FIX_4K_E8000; + break; + case 0x26E: + regNum = MISCREG_MTRR_FIX_4K_F0000; + break; + case 0x26F: + regNum = MISCREG_MTRR_FIX_4K_F8000; + break; + case 0x277: + regNum = MISCREG_PAT; + break; + case 0x2FF: + regNum = MISCREG_DEF_TYPE; + break; + case 0x400: + regNum = MISCREG_MC0_CTL; + break; + case 0x404: + regNum = MISCREG_MC1_CTL; + break; + case 0x408: + regNum = MISCREG_MC2_CTL; + break; + case 0x40C: + regNum = MISCREG_MC3_CTL; + break; + case 0x410: + regNum = MISCREG_MC4_CTL; + break; + case 0x414: + regNum = MISCREG_MC5_CTL; + break; + case 0x418: + regNum = MISCREG_MC6_CTL; + break; + case 0x41C: + regNum = MISCREG_MC7_CTL; + break; + case 0x401: + regNum = MISCREG_MC0_STATUS; + break; + case 0x405: + regNum = MISCREG_MC1_STATUS; + break; + case 0x409: + regNum = MISCREG_MC2_STATUS; + break; + case 0x40D: + regNum = MISCREG_MC3_STATUS; + break; + case 0x411: + regNum = MISCREG_MC4_STATUS; + break; + case 0x415: + regNum = MISCREG_MC5_STATUS; + break; + case 0x419: + regNum = MISCREG_MC6_STATUS; + break; + case 0x41D: + regNum = MISCREG_MC7_STATUS; + break; + case 0x402: + regNum = MISCREG_MC0_ADDR; + break; + case 0x406: + regNum = MISCREG_MC1_ADDR; + break; + case 0x40A: + regNum = MISCREG_MC2_ADDR; + break; + case 0x40E: + regNum = MISCREG_MC3_ADDR; + break; + case 0x412: + regNum = MISCREG_MC4_ADDR; + break; + case 0x416: + regNum = MISCREG_MC5_ADDR; + break; + case 0x41A: + regNum = MISCREG_MC6_ADDR; + break; + case 0x41E: + regNum = MISCREG_MC7_ADDR; + break; + case 0x403: + regNum = MISCREG_MC0_MISC; + break; + case 0x407: + regNum = MISCREG_MC1_MISC; + break; + case 0x40B: + regNum = MISCREG_MC2_MISC; + break; + case 0x40F: + regNum = MISCREG_MC3_MISC; + break; + case 0x413: + regNum = MISCREG_MC4_MISC; + break; + case 0x417: + regNum = MISCREG_MC5_MISC; + break; + case 0x41B: + regNum = MISCREG_MC6_MISC; + break; + case 0x41F: + regNum = MISCREG_MC7_MISC; + break; + case 0xC0000080: + regNum = MISCREG_EFER; + break; + case 0xC0000081: + regNum = MISCREG_STAR; + break; + case 0xC0000082: + regNum = MISCREG_LSTAR; + break; + case 0xC0000083: + regNum = MISCREG_CSTAR; + break; + case 0xC0000084: + regNum = MISCREG_SF_MASK; + break; + case 0xC0000100: + regNum = MISCREG_FS_BASE; + break; + case 0xC0000101: + regNum = MISCREG_GS_BASE; + break; + case 0xC0000102: + regNum = MISCREG_KERNEL_GS_BASE; + break; + case 0xC0000103: + regNum = MISCREG_TSC_AUX; + break; + case 0xC0010000: + regNum = MISCREG_PERF_EVT_SEL0; + break; + case 0xC0010001: + regNum = MISCREG_PERF_EVT_SEL1; + break; + case 0xC0010002: + regNum = MISCREG_PERF_EVT_SEL2; + break; + case 0xC0010003: + regNum = MISCREG_PERF_EVT_SEL3; + break; + case 0xC0010004: + regNum = MISCREG_PERF_EVT_CTR0; + break; + case 0xC0010005: + regNum = MISCREG_PERF_EVT_CTR1; + break; + case 0xC0010006: + regNum = MISCREG_PERF_EVT_CTR2; + break; + case 0xC0010007: + regNum = MISCREG_PERF_EVT_CTR3; + break; + case 0xC0010010: + regNum = MISCREG_SYSCFG; + break; + case 0xC0010016: + regNum = MISCREG_IORR_BASE0; + break; + case 0xC0010017: + regNum = MISCREG_IORR_BASE1; + break; + case 0xC0010018: + regNum = MISCREG_IORR_MASK0; + break; + case 0xC0010019: + regNum = MISCREG_IORR_MASK1; + break; + case 0xC001001A: + regNum = MISCREG_TOP_MEM; + break; + case 0xC001001D: + regNum = MISCREG_TOP_MEM2; + break; + case 0xC0010114: + regNum = 
MISCREG_VM_CR; + break; + case 0xC0010115: + regNum = MISCREG_IGNNE; + break; + case 0xC0010116: + regNum = MISCREG_SMM_CTL; + break; + case 0xC0010117: + regNum = MISCREG_VM_HSAVE_PA; + break; + default: + return std::make_shared<GeneralProtection>(0); + } + //The index is multiplied by the size of a MiscReg so that + //any memory dependence calculations will not see these as + //overlapping. + req->setPaddr(regNum * sizeof(MiscReg)); + return NoFault; + } else if (prefix == IntAddrPrefixIO) { + // TODO If CPL > IOPL or in virtual mode, check the I/O permission + // bitmap in the TSS. + + Addr IOPort = vaddr & ~IntAddrPrefixMask; + // Make sure the address fits in the expected 16 bit IO address + // space. + assert(!(IOPort & ~0xFFFF)); + + if (IOPort == 0xCF8 && req->getSize() == 4) { + req->setFlags(Request::MMAPPED_IPR); + req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg)); + } else if ((IOPort & ~mask(2)) == 0xCFC) { + req->setFlags(Request::UNCACHEABLE); + + Addr configAddress = + tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS); + + if (bits(configAddress, 31, 31)) { + req->setPaddr(PhysAddrPrefixPciConfig | + mbits(configAddress, 30, 2) | + (IOPort & mask(2))); + } else { + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + } else { + req->setFlags(Request::UNCACHEABLE); + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + return NoFault; + } else { + panic("Access to unrecognized internal address space %#x.\n", + prefix); + } + } + + /** + * TLB_lookup will only perform a TLB lookup returning true on a TLB hit + * and false on a TLB miss. + * Many of the checks about different modes have been converted to + * assertions, since these parts of the code are not really used. + * On a hit it will update the LRU stack. + */ + bool + GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats) + { + bool tlb_hit = false; + #ifndef NDEBUG + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + #endif + + assert(seg != SEGMENT_REG_MS); + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr); + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // make sure we are in 64-bit mode + assert(m5Reg.mode == LongMode); + + // If paging is enabled, do the translation. + if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + //update LRU stack on a hit + GpuTlbEntry *entry = lookup(vaddr, true); + + if (entry) + tlb_hit = true; + + if (!update_stats) { + // functional tlb access for memory initialization + // i.e., memory seeding or instr. seeding -> don't update + // TLB and stats + return tlb_hit; + } + + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + } else { + localNumTLBHits++; + } + } + } + + return tlb_hit; + } + + Fault + GpuTLB::translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + bool &delayedResponse, bool timing, int &latency) + { + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + bool storeCheck = flags & (StoreCheck << FlagShift); + + // If this is true, we're dealing with a request + // to a non-memory address space. + if (seg == SEGMENT_REG_MS) { + return translateInt(req, tc); + } + + delayedResponse = false; + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr); + + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + // If protected mode has been enabled... 
+ if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // If we're not in 64-bit mode, do protection/limit checks + if (m5Reg.mode != LongMode) { + DPRINTF(GPUTLB, "Not in long mode. Checking segment " + "protection.\n"); + + // Check for a null segment selector. + if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR || + seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS) + && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) { + return std::make_shared<GeneralProtection>(0); + } + + bool expandDown = false; + SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg)); + + if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) { + if (!attr.writable && (mode == BaseTLB::Write || + storeCheck)) + return std::make_shared<GeneralProtection>(0); + + if (!attr.readable && mode == BaseTLB::Read) + return std::make_shared<GeneralProtection>(0); + + expandDown = attr.expandDown; + + } + + Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg)); + Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg)); + // This assumes we're not in 64 bit mode. If we were, the + // default address size is 64 bits, overridable to 32. + int size = 32; + bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift)); + SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR); + + if ((csAttr.defaultSize && sizeOverride) || + (!csAttr.defaultSize && !sizeOverride)) { + size = 16; + } + + Addr offset = bits(vaddr - base, size - 1, 0); + Addr endOffset = offset + req->getSize() - 1; + + if (expandDown) { + DPRINTF(GPUTLB, "Checking an expand down segment.\n"); + warn_once("Expand down segments are untested.\n"); + + if (offset <= limit || endOffset <= limit) + return std::make_shared<GeneralProtection>(0); + } else { + if (offset > limit || endOffset > limit) + return std::make_shared<GeneralProtection>(0); + } + } + + // If paging is enabled, do the translation. + if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + // The vaddr already has the segment base applied. + GpuTlbEntry *entry = lookup(vaddr); + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + if (timing) { + latency = missLatency1; + } + + if (FullSystem) { + fatal("GpuTLB doesn't support full-system mode\n"); + } else { + DPRINTF(GPUTLB, "Handling a TLB miss for address %#x " + "at pc %#x.\n", vaddr, tc->instAddr()); + + Process *p = tc->getProcessPtr(); + GpuTlbEntry newEntry; + bool success = p->pTable->lookup(vaddr, newEntry); + + if (!success && mode != BaseTLB::Execute) { + // penalize a "page fault" more + if (timing) { + latency += missLatency2; + } + + if (p->fixupStackFault(vaddr)) + success = p->pTable->lookup(vaddr, newEntry); + } + + if (!success) { + return std::make_shared<PageFault>(vaddr, true, + mode, true, + false); + } else { + newEntry.valid = success; + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", + alignedVaddr, newEntry.pageStart()); + + entry = insert(alignedVaddr, newEntry); + } + + DPRINTF(GPUTLB, "Miss was serviced.\n"); + } + } else { + localNumTLBHits++; + + if (timing) { + latency = hitLatency; + } + } + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && + !(flags & (CPL0FlagBit << FlagShift))); + + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + bool badWrite = (!entry->writable && (inUser || cr0.wp)); + + if ((inUser && !entry->user) || (mode == BaseTLB::Write && + badWrite)) { + // The page must have been present to get into the TLB in + // the first place. 
We'll assume the reserved bits are + // fine even though we're not checking them. + return std::make_shared<PageFault>(vaddr, true, mode, + inUser, false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. + return std::make_shared<PageFault>(vaddr, true, + BaseTLB::Write, + inUser, false); + } + + + DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection " + "checks.\n", entry->paddr); + + int page_size = entry->size(); + Addr paddr = entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + req->setPaddr(paddr); + + if (entry->uncacheable) + req->setFlags(Request::UNCACHEABLE); + } else { + //Use the address which already has segmentation applied. + DPRINTF(GPUTLB, "Paging disabled.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + } else { + // Real mode + DPRINTF(GPUTLB, "In real mode.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + + // Check for an access to the local APIC + if (FullSystem) { + LocalApicBase localApicBase = + tc->readMiscRegNoEffect(MISCREG_APIC_BASE); + + Addr baseAddr = localApicBase.base * PageBytes; + Addr paddr = req->getPaddr(); + + if (baseAddr <= paddr && baseAddr + PageBytes > paddr) { + // Force the access to be uncacheable. + req->setFlags(Request::UNCACHEABLE); + req->setPaddr(x86LocalAPICAddress(tc->contextId(), + paddr - baseAddr)); + } + } + + return NoFault; + }; + + Fault + GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency) + { + bool delayedResponse; + + return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false, + latency); + } + + void + GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, int &latency) + { + bool delayedResponse; + assert(translation); + + Fault fault = GpuTLB::translate(req, tc, translation, mode, + delayedResponse, true, latency); + + if (!delayedResponse) + translation->finish(fault, req, tc, mode); + } + + Walker* + GpuTLB::getWalker() + { + return walker; + } + + + void + GpuTLB::serialize(CheckpointOut &cp) const + { + } + + void + GpuTLB::unserialize(CheckpointIn &cp) + { + } + + void + GpuTLB::regStats() + { + localNumTLBAccesses + .name(name() + ".local_TLB_accesses") + .desc("Number of TLB accesses") + ; + + localNumTLBHits + .name(name() + ".local_TLB_hits") + .desc("Number of TLB hits") + ; + + localNumTLBMisses + .name(name() + ".local_TLB_misses") + .desc("Number of TLB misses") + ; + + localTLBMissRate + .name(name() + ".local_TLB_miss_rate") + .desc("TLB miss rate") + ; + + accessCycles + .name(name() + ".access_cycles") + .desc("Cycles spent accessing this TLB level") + ; + + pageTableCycles + .name(name() + ".page_table_cycles") + .desc("Cycles spent accessing the page table") + ; + + localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses; + + numUniquePages + .name(name() + ".unique_pages") + .desc("Number of unique pages touched") + ; + + localCycles + .name(name() + ".local_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. 
latency over incoming coalesced reqs") + ; + + localLatency = localCycles / localNumTLBAccesses; + + globalNumTLBAccesses + .name(name() + ".global_TLB_accesses") + .desc("Number of TLB accesses") + ; + + globalNumTLBHits + .name(name() + ".global_TLB_hits") + .desc("Number of TLB hits") + ; + + globalNumTLBMisses + .name(name() + ".global_TLB_misses") + .desc("Number of TLB misses") + ; + + globalTLBMissRate + .name(name() + ".global_TLB_miss_rate") + .desc("TLB miss rate") + ; + + globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses; + + avgReuseDistance + .name(name() + ".avg_reuse_distance") + .desc("avg. reuse distance over all pages (in ticks)") + ; + + } + + /** + * Do the TLB lookup for this coalesced request and schedule + * another event <TLB access latency> cycles later. + */ + + void + GpuTLB::issueTLBLookup(PacketPtr pkt) + { + assert(pkt); + assert(pkt->senderState); + + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + ThreadContext * tmp_tc = sender_state->tc; + + DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n", + virt_page_addr); + + int req_cnt = sender_state->reqCnt.back(); + + if (update_stats) { + accessCycles -= (curTick() * req_cnt); + localCycles -= curTick(); + updatePageFootprint(virt_page_addr); + globalNumTLBAccesses += req_cnt; + } + + tlbOutcome lookup_outcome = TLB_MISS; + RequestPtr tmp_req = pkt->req; + + // Access the TLB and figure out if it's a hit or a miss. + bool success = tlbLookup(tmp_req, tmp_tc, update_stats); + + if (success) { + lookup_outcome = TLB_HIT; + // Put the entry in SenderState + GpuTlbEntry *entry = lookup(tmp_req->getVaddr(), false); + assert(entry); + + sender_state->tlbEntry = + new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid); + + if (update_stats) { + // the reqCnt has an entry per level, so its size tells us + // which level we are in + sender_state->hitLevel = sender_state->reqCnt.size(); + globalNumTLBHits += req_cnt; + } + } else { + if (update_stats) + globalNumTLBMisses += req_cnt; + } + + /* + * We now know the TLB lookup outcome (if it's a hit or a miss), as well + * as the TLB access latency. + * + * We create and schedule a new TLBEvent which will help us take the + * appropriate actions (e.g., update TLB on a hit, send request to lower + * level TLB on a miss, or start a page walk if this was the last-level + * TLB) + */ + TLBEvent *tlb_event = + new TLBEvent(this, virt_page_addr, lookup_outcome, pkt); + + if (translationReturnEvent.count(virt_page_addr)) { + panic("Virtual Page Address %#x already has a return event\n", + virt_page_addr); + } + + translationReturnEvent[virt_page_addr] = tlb_event; + assert(tlb_event); + + DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n", + curTick() + this->ticks(hitLatency)); + + schedule(tlb_event, curTick() + this->ticks(hitLatency)); + } + + GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome, + PacketPtr _pkt) + : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), + outcome(tlb_outcome), pkt(_pkt) + { + } + + /** + * Do Paging protection checks. If we encounter a page fault, then + * an assertion is fired. 
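issueTLBLookup() above subtracts curTick() (scaled by the coalesced request count) from accessCycles and localCycles when a lookup is issued, and translationReturn() further below adds curTick() back once the outcome is known; the stats therefore accumulate (return tick - issue tick) per request without storing per-request timestamps. A worked example with made-up numbers:

    // One lookup issued at tick 1000 and resolved at tick 1600, covering
    // req_cnt == 4 coalesced requests:
    //
    //   at issue : accessCycles -= 4 * 1000;   // running value: -4000
    //   at return: accessCycles += 4 * 1600;   // running value:  2400
    //
    // Net contribution: 4 requests x 600 ticks of access latency each.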
+ */ + void + GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry * tlb_entry, Mode mode) + { + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & (StoreCheck << FlagShift); + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift))); + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + + bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp)); + + if ((inUser && !tlb_entry->user) || + (mode == BaseTLB::Write && badWrite)) { + // The page must have been present to get into the TLB in + // the first place. We'll assume the reserved bits are + // fine even though we're not checking them. + assert(false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. + assert(false); + } + } + + /** + * handleTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * The latter calls handelHit with TLB miss as tlbOutcome. + */ + void + GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome, + PacketPtr pkt) + { + + assert(pkt); + Addr vaddr = pkt->req->getVaddr(); + + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr); + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n", + vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. + new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + /** + * At this point the packet carries an up-to-date tlbEntry pointer + * in its senderState. + * Next step is to do the paging protection checks. + */ + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + pagingProtectionChecks(tc, pkt, local_entry, mode); + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + // Since this packet will be sent through the cpu side slave port, + // it must be converted to a response pkt if it is not one already + if (pkt->isRequest()) { + pkt->makeTimingResponse(); + } + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) { + pkt->req->setFlags(Request::UNCACHEABLE); + } + + //send packet back to coalescer + cpuSidePort[0]->sendTimingResp(pkt); + //schedule cleanup event + cleanupQueue.push(virt_page_addr); + + // schedule this only once per cycle. 
+ // The check is required because we might have multiple translations + // returning the same cycle + // this is a maximum priority event and must be on the same cycle + // as the cleanup event in TLBCoalescer to avoid a race with + // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); + } + + /** + * Here we take the appropriate actions based on the result of the + * TLB lookup. + */ + void + GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt) + { + DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr); + + assert(translationReturnEvent[virtPageAddr]); + assert(pkt); + + TranslationState *tmp_sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + int req_cnt = tmp_sender_state->reqCnt.back(); + bool update_stats = !tmp_sender_state->prefetch; + + + if (outcome == TLB_HIT) { + handleTranslationReturn(virtPageAddr, TLB_HIT, pkt); + + if (update_stats) { + accessCycles += (req_cnt * curTick()); + localCycles += curTick(); + } + + } else if (outcome == TLB_MISS) { + + DPRINTF(GPUTLB, "This is a TLB miss\n"); + if (update_stats) { + accessCycles += (req_cnt*curTick()); + localCycles += curTick(); + } + + if (hasMemSidePort) { + // the one cyle added here represent the delay from when we get + // the reply back till when we propagate it to the coalescer + // above. + if (update_stats) { + accessCycles += (req_cnt * 1); + localCycles += 1; + } + + /** + * There is a TLB below. Send the coalesced request. + * We actually send the very first packet of all the + * pending packets for this virtual page address. + */ + if (!memSidePort[0]->sendTimingReq(pkt)) { + DPRINTF(GPUTLB, "Failed sending translation request to " + "lower level TLB for addr %#x\n", virtPageAddr); + + memSidePort[0]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "Sent translation request to lower level " + "TLB for addr %#x\n", virtPageAddr); + } + } else { + //this is the last level TLB. Start a page walk + DPRINTF(GPUTLB, "Last level TLB - start a page walk for " + "addr %#x\n", virtPageAddr); + + if (update_stats) + pageTableCycles -= (req_cnt*curTick()); + + TLBEvent *tlb_event = translationReturnEvent[virtPageAddr]; + assert(tlb_event); + tlb_event->updateOutcome(PAGE_WALK); + schedule(tlb_event, curTick() + ticks(missLatency2)); + } + } else if (outcome == PAGE_WALK) { + if (update_stats) + pageTableCycles += (req_cnt*curTick()); + + // Need to access the page table and update the TLB + DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", + virtPageAddr); + + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + Process *p = sender_state->tc->getProcessPtr(); + TlbEntry newEntry; + Addr vaddr = pkt->req->getVaddr(); + #ifndef NDEBUG + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + assert(alignedVaddr == virtPageAddr); + #endif + bool success; + success = p->pTable->lookup(vaddr, newEntry); + if (!success && sender_state->tlbMode != BaseTLB::Execute) { + if (p->fixupStackFault(vaddr)) { + success = p->pTable->lookup(vaddr, newEntry); + } + } + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = + new GpuTlbEntry(0, newEntry.vaddr, newEntry.paddr, success); + + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else if (outcome == MISS_RETURN) { + /** we add an extra cycle in the return path of the translation + * requests in between the various TLB levels. 
+ */ + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else { + assert(false); + } + } + + void + GpuTLB::TLBEvent::process() + { + tlb->translationReturn(virtPageAddr, outcome, pkt); + } + + const char* + GpuTLB::TLBEvent::description() const + { + return "trigger translationDoneEvent"; + } + + void + GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome) + { + outcome = _outcome; + } + + Addr + GpuTLB::TLBEvent::getTLBEventVaddr() + { + return virtPageAddr; + } + + /* + * recvTiming receives a coalesced timing request from a TLBCoalescer + * and it calls issueTLBLookup() + * It only rejects the packet if we have exceeded the max + * outstanding number of requests for the TLB + */ + bool + GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt) + { + if (tlb->outstandingReqs < tlb->maxCoalescedReqs) { + tlb->issueTLBLookup(pkt); + // update number of outstanding translation requests + tlb->outstandingReqs++; + return true; + } else { + DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n", + tlb->outstandingReqs); + return false; + } + } + + /** + * handleFuncTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * It updates LRU, inserts the TLB entry on a miss + * depending on the allocation policy and does the required + * protection checks. It does NOT create a new packet to + * update the packet's addr; this is done in hsail-gpu code. + */ + void + GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome) + { + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + Addr vaddr = pkt->req->getVaddr(); + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr " + "%#x\n", vaddr); + + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr " + "%#x\n", vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. + new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes); + + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + // Do paging checks if it's a normal functional access. If it's for a + // prefetch, then sometimes you can try to prefetch something that won't + // pass protection. We don't actually want to fault becuase there is no + // demand access to deem this a violation. Just put it in the TLB and + // it will fault if indeed a future demand access touches it in + // violation. + if (!sender_state->prefetch && sender_state->tlbEntry->valid) + pagingProtectionChecks(tc, pkt, local_entry, mode); + + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) + pkt->req->setFlags(Request::UNCACHEABLE); + } + + // This is used for atomic translations. Need to + // make it all happen during the same cycle. 
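// A minimal standalone sketch of the two address manipulations used
// throughout this file, assuming 4 KiB x86 pages. The names below
// (SketchPageBytes, sketchPageAlign, sketchComposePaddr) are
// hypothetical helpers for illustration only; the real code uses
// roundDown(vaddr, TheISA::PageBytes) and
// entry->paddr | (vaddr & (page_size - 1)).
namespace {
    const unsigned long long SketchPageBytes = 4096;

    // round a virtual address down to its page boundary
    unsigned long long
    sketchPageAlign(unsigned long long vaddr)
    {
        return vaddr & ~(SketchPageBytes - 1);
    }

    // merge the physical frame from a TLB entry with the page offset
    unsigned long long
    sketchComposePaddr(unsigned long long frame_paddr,
                       unsigned long long vaddr)
    {
        return frame_paddr | (vaddr & (SketchPageBytes - 1));
    }
} // anonymous namespace
// e.g. sketchPageAlign(0x1a34) == 0x1000 and
// sketchComposePaddr(0x2000, 0x1a34) == 0x2a34.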
+ void + GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt) + { + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + bool update_stats = !sender_state->prefetch; + + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + if (update_stats) + tlb->updatePageFootprint(virt_page_addr); + + // do the TLB lookup without updating the stats + bool success = tlb->tlbLookup(pkt->req, tc, update_stats); + tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS; + + // functional mode means no coalescing + // global metrics are the same as the local metrics + if (update_stats) { + tlb->globalNumTLBAccesses++; + + if (success) { + sender_state->hitLevel = sender_state->reqCnt.size(); + tlb->globalNumTLBHits++; + } + } + + if (!success) { + if (update_stats) + tlb->globalNumTLBMisses++; + if (tlb->hasMemSidePort) { + // there is a TLB below -> propagate down the TLB hierarchy + tlb->memSidePort[0]->sendFunctional(pkt); + // If no valid translation from a prefetch, then just return + if (sender_state->prefetch && !pkt->req->hasPaddr()) + return; + } else { + // Need to access the page table and update the TLB + DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", + virt_page_addr); + + Process *p = tc->getProcessPtr(); + TlbEntry newEntry; + + Addr vaddr = pkt->req->getVaddr(); + #ifndef NDEBUG + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + assert(alignedVaddr == virt_page_addr); + #endif + + bool success = p->pTable->lookup(vaddr, newEntry); + if (!success && sender_state->tlbMode != BaseTLB::Execute) { + if (p->fixupStackFault(vaddr)) + success = p->pTable->lookup(vaddr, newEntry); + } + + if (!sender_state->prefetch) { + // no PageFaults are permitted after + // the second page table lookup + assert(success); + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = new GpuTlbEntry(0, newEntry.vaddr, + newEntry.paddr, + success); + } else { + // If this was a prefetch, then do the normal thing if it + // was a successful translation. Otherwise, send an empty + // TLB entry back so that it can be figured out as empty and + // handled accordingly. + if (success) { + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = new GpuTlbEntry(0, + newEntry.vaddr, + newEntry.paddr, + success); + } else { + DPRINTF(GPUPrefetch, "Prefetch failed %#x\n", + alignedVaddr); + + sender_state->tlbEntry = new GpuTlbEntry(); + + return; + } + } + } + } else { + DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n", + tlb->lookup(pkt->req->getVaddr())); + + GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(), + update_stats); + + assert(entry); + + sender_state->tlbEntry = + new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid); + } + // This is the function that would populate pkt->req with the paddr of + // the translation. But if no translation happens (i.e Prefetch fails) + // then the early returns in the above code wiill keep this function + // from executing. + tlb->handleFuncTranslationReturn(pkt, tlb_outcome); + } + + void + GpuTLB::CpuSidePort::recvReqRetry() + { + // The CPUSidePort never sends anything but replies. No retries + // expected. + assert(false); + } + + AddrRangeList + GpuTLB::CpuSidePort::getAddrRanges() const + { + // currently not checked by the master + AddrRangeList ranges; + + return ranges; + } + + /** + * MemSidePort receives the packet back. 
+ * We need to call the handleTranslationReturn + * and propagate up the hierarchy. + */ + bool + GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt) + { + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n", + virt_page_addr); + + TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr]; + assert(tlb_event); + assert(virt_page_addr == tlb_event->getTLBEventVaddr()); + + tlb_event->updateOutcome(MISS_RETURN); + tlb->schedule(tlb_event, curTick()+tlb->ticks(1)); + + return true; + } + + void + GpuTLB::MemSidePort::recvReqRetry() + { + // No retries should reach the TLB. The retries + // should only reach the TLBCoalescer. + assert(false); + } + + void + GpuTLB::cleanup() + { + while (!cleanupQueue.empty()) { + Addr cleanup_addr = cleanupQueue.front(); + cleanupQueue.pop(); + + // delete TLBEvent + TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr]; + delete old_tlb_event; + translationReturnEvent.erase(cleanup_addr); + + // update number of outstanding requests + outstandingReqs--; + } + + /** the higher level coalescer should retry if it has + * any pending requests. + */ + for (int i = 0; i < cpuSidePort.size(); ++i) { + cpuSidePort[i]->sendRetryReq(); + } + } + + void + GpuTLB::updatePageFootprint(Addr virt_page_addr) + { + + std::pair<AccessPatternTable::iterator, bool> ret; + + AccessInfo tmp_access_info; + tmp_access_info.lastTimeAccessed = 0; + tmp_access_info.accessesPerPage = 0; + tmp_access_info.totalReuseDistance = 0; + tmp_access_info.sumDistance = 0; + tmp_access_info.meanDistance = 0; + + ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr, + tmp_access_info)); + + bool first_page_access = ret.second; + + if (first_page_access) { + numUniquePages++; + } else { + int accessed_before; + accessed_before = curTick() - ret.first->second.lastTimeAccessed; + ret.first->second.totalReuseDistance += accessed_before; + } + + ret.first->second.accessesPerPage++; + ret.first->second.lastTimeAccessed = curTick(); + + if (accessDistance) { + ret.first->second.localTLBAccesses + .push_back(localNumTLBAccesses.value()); + } + } + + void + GpuTLB::exitCallback() + { + std::ostream *page_stat_file = nullptr; + + if (accessDistance) { + + // print per page statistics to a separate file (.csv format) + // simout is the gem5 output directory (default is m5out or the one + // specified with -d + page_stat_file = simout.create(name().c_str()); + + // print header + *page_stat_file << "page,max_access_distance,mean_access_distance, " + << "stddev_distance" << std::endl; + } + + // update avg. 
reuse distance footprint
+ AccessPatternTable::iterator iter, iter_begin, iter_end;
+ unsigned int sum_avg_reuse_distance_per_page = 0;
+
+ // iterate through all pages seen by this TLB
+ for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
+ sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
+ iter->second.accessesPerPage;
+
+ if (accessDistance) {
+ unsigned int tmp = iter->second.localTLBAccesses[0];
+ unsigned int prev = tmp;
+
+ for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ if (i) {
+ tmp = prev + 1;
+ }
+
+ prev = iter->second.localTLBAccesses[i];
+ // update the localTLBAccesses value
+ // with the actual difference
+ iter->second.localTLBAccesses[i] -= tmp;
+ // compute the sum of AccessDistance per page
+ // used later for mean
+ iter->second.sumDistance +=
+ iter->second.localTLBAccesses[i];
+ }
+
+ iter->second.meanDistance =
+ iter->second.sumDistance / iter->second.accessesPerPage;
+
+ // compute std_dev and max (we need a second round because we
+ // need to know the mean value)
+ unsigned int max_distance = 0;
+ unsigned int stddev_distance = 0;
+
+ for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ unsigned int tmp_access_distance =
+ iter->second.localTLBAccesses[i];
+
+ if (tmp_access_distance > max_distance) {
+ max_distance = tmp_access_distance;
+ }
+
+ unsigned int diff =
+ tmp_access_distance - iter->second.meanDistance;
+ stddev_distance += pow(diff, 2);
+
+ }
+
+ stddev_distance =
+ sqrt(stddev_distance/iter->second.accessesPerPage);
+
+ if (page_stat_file) {
+ *page_stat_file << std::hex << iter->first << ",";
+ *page_stat_file << std::dec << max_distance << ",";
+ *page_stat_file << std::dec << iter->second.meanDistance
+ << ",";
+ *page_stat_file << std::dec << stddev_distance;
+ *page_stat_file << std::endl;
+ }
+
+ // clear the localTLBAccesses array
+ iter->second.localTLBAccesses.clear();
+ }
+ }
+
+ if (!TLBFootprint.empty()) {
+ avgReuseDistance =
+ sum_avg_reuse_distance_per_page / TLBFootprint.size();
+ }
+
+ // clear the TLBFootprint map
+ TLBFootprint.clear();
+ }
+} // namespace X86ISA
+
+X86ISA::GpuTLB*
+X86GPUTLBParams::create()
+{
+ return new X86ISA::GpuTLB(this);
+}
+
diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh
new file mode 100644
index 000000000..3549c598b
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.hh
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __GPU_TLB_HH__ +#define __GPU_TLB_HH__ + +#include <fstream> +#include <list> +#include <queue> +#include <string> +#include <vector> + +#include "arch/generic/tlb.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/segment.hh" +#include "base/callback.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/compute_unit.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/X86GPUTLB.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +namespace X86ISA +{ + class GpuTlbEntry : public TlbEntry + { + public: + GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid) + : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { } + + GpuTlbEntry() : TlbEntry() { } + + bool valid; + }; + + class GpuTLB : public MemObject + { + protected: + friend class Walker; + + typedef std::list<GpuTlbEntry*> EntryList; + + uint32_t configAddress; + + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the python + // config files. + int clock; + + public: + // clock related functions ; maps to-and-from Simulation ticks and + // object clocks. + Tick frequency() const { return SimClock::Frequency / clock; } + + Tick + ticks(int numCycles) const + { + return (Tick)clock * numCycles; + } + + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + typedef X86GPUTLBParams Params; + GpuTLB(const Params *p); + ~GpuTLB(); + + typedef enum BaseTLB::Mode Mode; + + class Translation + { + public: + virtual ~Translation() { } + + /** + * Signal that the translation has been delayed due to a hw page + * table walk. + */ + virtual void markDelayed() = 0; + + /** + * The memory for this object may be dynamically allocated, and it + * may be responsible for cleaning itslef up which will happen in + * this function. Once it's called the object is no longer valid. + */ + virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc, + Mode mode) = 0; + }; + + void dumpAll(); + GpuTlbEntry *lookup(Addr va, bool update_lru=true); + void setConfigAddress(uint32_t addr); + + protected: + EntryList::iterator lookupIt(Addr va, bool update_lru=true); + Walker *walker; + + public: + Walker *getWalker(); + void invalidateAll(); + void invalidateNonGlobal(); + void demapPage(Addr va, uint64_t asn); + + protected: + int size; + int assoc; + int numSets; + + /** + * true if this is a fully-associative TLB + */ + bool FA; + Addr setMask; + + /** + * Allocation Policy: true if we always allocate on a hit, false + * otherwise. Default is true. 
+ */ + bool allocationPolicy; + + /** + * if true, then this is not the last level TLB + */ + bool hasMemSidePort; + + /** + * Print out accessDistance stats. One stat file + * per TLB. + */ + bool accessDistance; + + GpuTlbEntry *tlb; + + /* + * It's a per-set list. As long as we have not reached + * the full capacity of the given set, grab an entry from + * the freeList. + */ + std::vector<EntryList> freeList; + + /** + * An entryList per set is the equivalent of an LRU stack; + * it's used to guide replacement decisions. The head of the list + * contains the MRU TLB entry of the given set. If the freeList + * for this set is empty, the last element of the list + * is evicted (i.e., dropped on the floor). + */ + std::vector<EntryList> entryList; + + Fault translateInt(RequestPtr req, ThreadContext *tc); + + Fault translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, bool &delayedResponse, + bool timing, int &latency); + + public: + // latencies for a TLB hit, miss and page fault + int hitLatency; + int missLatency1; + int missLatency2; + + // local_stats are as seen from the TLB + // without taking into account coalescing + Stats::Scalar localNumTLBAccesses; + Stats::Scalar localNumTLBHits; + Stats::Scalar localNumTLBMisses; + Stats::Formula localTLBMissRate; + + // global_stats are as seen from the + // CU's perspective taking into account + // all coalesced requests. + Stats::Scalar globalNumTLBAccesses; + Stats::Scalar globalNumTLBHits; + Stats::Scalar globalNumTLBMisses; + Stats::Formula globalTLBMissRate; + + // from the CU perspective (global) + Stats::Scalar accessCycles; + // from the CU perspective (global) + Stats::Scalar pageTableCycles; + Stats::Scalar numUniquePages; + // from the perspective of this TLB + Stats::Scalar localCycles; + // from the perspective of this TLB + Stats::Formula localLatency; + // I take the avg. per page and then + // the avg. over all pages. 
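// A sketch of that computation (hypothetical pseudo-names; the real
// code is in GpuTLB::exitCallback() in gpu_tlb.cc):
//
//   per_page_avg     = totalReuseDistance / accessesPerPage;
//   avgReuseDistance = sum_over_pages(per_page_avg) / TLBFootprint.size();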
+ Stats::Scalar avgReuseDistance; + + void regStats(); + void updatePageFootprint(Addr virt_page_addr); + void printAccessPattern(); + + + Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency); + + void translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + int &latency); + + Tick doMmuRegRead(ThreadContext *tc, Packet *pkt); + Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt); + + GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry); + + // Checkpointing + virtual void serialize(CheckpointOut& cp) const; + virtual void unserialize(CheckpointIn& cp); + void issueTranslation(); + enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN}; + bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats); + + void handleTranslationReturn(Addr addr, tlbOutcome outcome, + PacketPtr pkt); + + void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome); + + void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry *tlb_entry, Mode mode); + + void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry, + Addr phys_page_addr); + + void issueTLBLookup(PacketPtr pkt); + + // CpuSidePort is the TLB Port closer to the CPU/CU side + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + virtual void recvRespRetry() { assert(false); } + virtual AddrRangeList getAddrRanges() const; + }; + + /** + * MemSidePort is the TLB Port closer to the memory side + * If this is a last level TLB then this port will not be connected. + * + * Future action item: if we ever do real page walks, then this port + * should be connected to a RubyPort. + */ + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + std::deque<PacketPtr> retries; + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + // TLB ports on the cpu Side + std::vector<CpuSidePort*> cpuSidePort; + // TLB ports on the memory side + std::vector<MemSidePort*> memSidePort; + + BaseMasterPort &getMasterPort(const std::string &if_name, + PortID idx=InvalidPortID); + + BaseSlavePort &getSlavePort(const std::string &if_name, + PortID idx=InvalidPortID); + + /** + * TLB TranslationState: this currently is a somewhat bastardization of + * the usage of SenderState, whereby the receiver of a packet is not + * usually supposed to need to look at the contents of the senderState, + * you're really only supposed to look at what you pushed on, pop it + * off, and send it back. + * + * However, since there is state that we want to pass to the TLBs using + * the send/recv Timing/Functional/etc. APIs, which don't allow for new + * arguments, we need a common TLB senderState to pass between TLBs, + * both "forwards" and "backwards." 
+ * + * So, basically, the rule is that any packet received by a TLB port + * (cpuside OR memside) must be safely castable to a TranslationState. + */ + + struct TranslationState : public Packet::SenderState + { + // TLB mode, read or write + Mode tlbMode; + // Thread context associated with this req + ThreadContext *tc; + + /* + * TLB entry to be populated and passed back and filled in + * previous TLBs. Equivalent to the data cache concept of + * "data return." + */ + GpuTlbEntry *tlbEntry; + // Is this a TLB prefetch request? + bool prefetch; + // When was the req for this translation issued + uint64_t issueTime; + // Remember where this came from + std::vector<SlavePort*>ports; + + // keep track of #uncoalesced reqs per packet per TLB level; + // reqCnt per level >= reqCnt higher level + std::vector<int> reqCnt; + // TLB level this packet hit in; 0 if it hit in the page table + int hitLevel; + Packet::SenderState *saved; + + TranslationState(Mode tlb_mode, ThreadContext *_tc, + bool _prefetch=false, + Packet::SenderState *_saved=nullptr) + : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr), + prefetch(_prefetch), issueTime(0), + hitLevel(0),saved(_saved) { } + }; + + // maximum number of permitted coalesced requests per cycle + int maxCoalescedReqs; + + // Current number of outstandings coalesced requests. + // Should be <= maxCoalescedReqs + int outstandingReqs; + + /** + * A TLBEvent is scheduled after the TLB lookup and helps us take the + * appropriate actions: + * (e.g., update TLB on a hit, + * send request to lower level TLB on a miss, + * or start a page walk if this was the last-level TLB). + */ + void translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt); + + class TLBEvent : public Event + { + private: + GpuTLB *tlb; + Addr virtPageAddr; + /** + * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK + */ + tlbOutcome outcome; + PacketPtr pkt; + + public: + TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome, + PacketPtr _pkt); + + void process(); + const char *description() const; + + // updateOutcome updates the tlbOutcome of a TLBEvent + void updateOutcome(tlbOutcome _outcome); + Addr getTLBEventVaddr(); + }; + + std::unordered_map<Addr, TLBEvent*> translationReturnEvent; + + // this FIFO queue keeps track of the virt. page addresses + // that are pending cleanup + std::queue<Addr> cleanupQueue; + + // the cleanupEvent is scheduled after a TLBEvent triggers in order to + // free memory and do the required clean-up + void cleanup(); + + EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent; + + /** + * This hash map will use the virtual page address as a key + * and will keep track of total number of accesses per page + */ + + struct AccessInfo + { + unsigned int lastTimeAccessed; // last access to this page + unsigned int accessesPerPage; + // need to divide it by accessesPerPage at the end + unsigned int totalReuseDistance; + + /** + * The field below will help us compute the access distance, + * that is the number of (coalesced) TLB accesses that + * happened in between each access to this page + * + * localTLBAccesses[x] is the value of localTLBNumAccesses + * when the page <Addr> was accessed for the <x>th time + */ + std::vector<unsigned int> localTLBAccesses; + unsigned int sumDistance; + unsigned int meanDistance; + }; + + typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable; + AccessPatternTable TLBFootprint; + + // Called at the end of simulation to dump page access stats. 
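// Worked example of the access-distance bookkeeping (hypothetical
// numbers): if localNumTLBAccesses was 5, 9 and 10 when a given page
// was touched, exitCallback() rewrites that page's localTLBAccesses
// vector to {0, 3, 0}; three unrelated TLB accesses separated the
// first and second touch, and none separated the second and third.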
+ void exitCallback(); + + EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent; + }; +} + +#endif // __GPU_TLB_HH__ diff --git a/src/gpu-compute/hsa_code.hh b/src/gpu-compute/hsa_code.hh new file mode 100644 index 000000000..9f358e23c --- /dev/null +++ b/src/gpu-compute/hsa_code.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_CODE_HH__ +#define __HSA_CODE_HH__ + +#include <string> +#include <vector> + +#include "arch/gpu_types.hh" +#include "config/the_gpu_isa.hh" + +class HsaKernelInfo; + +/* @class HsaCode + * base code object for the set of HSA kernels associated + * with a single application. this class provides the common + * methods for creating, accessing, and storing information + * about kernel and variable symbols, symbol name, memory + * segment sizes, and instruction count, etc. + */ + +class HsaCode +{ + public: + HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0), + _name(name) + { + } + + enum class MemorySegment { + NONE, + FLAT, + GLOBAL, + READONLY, + KERNARG, + GROUP, + PRIVATE, + SPILL, + ARG, + EXTSPACE0 + }; + + const std::string& name() const { return _name; } + int numInsts() const { return _insts.size(); } + std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; } + + void + setReadonlyData(uint8_t *_readonly_data) + { + readonly_data = _readonly_data; + } + + virtual int getSize(MemorySegment segment) const = 0; + virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0; + + uint8_t *readonly_data; + int funcarg_size; + + protected: + // An array that stores instruction indices (0 through kernel size) + // for a kernel passed to code object constructor as an argument. 
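// For example, HsailCode::init() in hsail_code.cc effectively does
//
//   TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr);
//   _insts.push_back(inst_num);
//
// per decoded instruction, so _insts.size() is the kernel's
// instruction count and numInsts() above simply returns it.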
+ std::vector<TheGpuISA::RawMachInst> _insts; + + private: + const std::string _name; +}; + +#endif // __HSA_CODE_HH__ diff --git a/src/gpu-compute/hsa_kernel_info.hh b/src/gpu-compute/hsa_kernel_info.hh new file mode 100644 index 000000000..396913dac --- /dev/null +++ b/src/gpu-compute/hsa_kernel_info.hh @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __HSA_KERNEL_INFO_HH__ +#define __HSA_KERNEL_INFO_HH__ + +// This file defines the public interface between the HSA emulated +// driver and application programs. + +#include <cstdint> + +static const int HSA_GET_SIZES = 0x4801; +static const int HSA_GET_KINFO = 0x4802; +static const int HSA_GET_STRINGS = 0x4803; +static const int HSA_GET_CODE = 0x4804; +static const int HSA_GET_READONLY_DATA = 0x4805; +static const int HSA_GET_CU_CNT = 0x4806; +static const int HSA_GET_VSZ = 0x4807; + +// Return value (via buffer ptr) for HSA_GET_SIZES +struct HsaDriverSizes +{ + uint32_t num_kernels; + uint32_t string_table_size; + uint32_t code_size; + uint32_t readonly_size; +}; + +// HSA_GET_KINFO returns an array of num_kernels of these structs +struct HsaKernelInfo +{ + // byte offset into string table + uint32_t name_offs; + // byte offset into code array + uint32_t code_offs; + uint32_t static_lds_size; + uint32_t private_mem_size; + uint32_t spill_mem_size; + // Number of s registers + uint32_t sRegCount; + // Number of d registers + uint32_t dRegCount; + // Number of c registers + uint32_t cRegCount; +}; + +#endif // __HSA_KERNEL_INFO_HH__ diff --git a/src/gpu-compute/hsa_object.cc b/src/gpu-compute/hsa_object.cc new file mode 100644 index 000000000..91dfb160e --- /dev/null +++ b/src/gpu-compute/hsa_object.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/hsa_object.hh" + +#include <fstream> + +#include "gpu-compute/brig_object.hh" + +HsaObject::HsaObject(const std::string &fname) + : readonlyData(nullptr), filename(fname) +{ +} + +HsaObject* +HsaObject::createHsaObject(const std::string &fname) +{ + HsaObject *hsaObj = nullptr; + uint8_t *file_data = nullptr; + int file_length = 0; + + std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in | + std::ifstream::binary); + + assert(code_file.is_open()); + assert(code_file.good()); + + file_length = code_file.tellg(); + code_file.seekg(0, code_file.beg); + file_data = new uint8_t[file_length]; + code_file.read((char*)file_data, file_length); + code_file.close(); + + for (const auto &tryFile : tryFileFuncs) { + if ((hsaObj = tryFile(fname, file_length, file_data))) { + return hsaObj; + } + } + + delete[] file_data; + fatal("Unknown HSA object type for file: %s.\n", fname); + + return nullptr; +} diff --git a/src/gpu-compute/hsa_object.hh b/src/gpu-compute/hsa_object.hh new file mode 100644 index 000000000..1f08f5d80 --- /dev/null +++ b/src/gpu-compute/hsa_object.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_OBJECT_HH__ +#define __HSA_OBJECT_HH__ + +#include <functional> +#include <string> +#include <vector> + +class HsaCode; + +/* @class HsaObject + * base loader object for HSA kernels. this class provides + * the base method definitions for loading, storing, and + * accessing HSA kernel objects into the simulator. + */ + +class HsaObject +{ + public: + HsaObject(const std::string &fileName); + + static HsaObject* createHsaObject(const std::string &fname); + static std::vector<std::function<HsaObject*(const std::string&, int, + uint8_t*)>> tryFileFuncs; + + virtual HsaCode* getKernel(const std::string &name) const = 0; + virtual HsaCode* getKernel(int i) const = 0; + virtual HsaCode* getFunction(const std::string &name) const = 0; + virtual int numKernels() const = 0; + + const std::string& name() const { return filename; } + + uint8_t *readonlyData; + + + protected: + const std::string filename; +}; + +#endif // __HSA_OBJECT_HH__ diff --git a/src/gpu-compute/hsail_code.cc b/src/gpu-compute/hsail_code.cc new file mode 100644 index 000000000..b0ddf0161 --- /dev/null +++ b/src/gpu-compute/hsail_code.cc @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/hsail_code.hh" + +#include "arch/gpu_types.hh" +#include "arch/hsail/Brig.h" +#include "arch/hsail/operand.hh" +#include "config/the_gpu_isa.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "gpu-compute/brig_object.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/kernel_cfg.hh" + +using namespace Brig; + +int getBrigDataTypeBytes(BrigType16_t t); + +HsailCode::HsailCode(const std::string &name_str) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ +} + +void +HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj, + StorageMap *objStorageMap) +{ + storageMap = objStorageMap; + + // set pointer so that decoding process can find this kernel context when + // needed + obj->currentCode = this; + + if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION && + code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) { + fatal("unexpected directive kind %d inside kernel/function init\n", + code_dir->base.kind); + } + + DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n", + code_dir->firstCodeBlockEntry); + + // clear these static vars so we can properly track the max index + // for this kernel + SRegOperand::maxRegIdx = 0; + DRegOperand::maxRegIdx = 0; + CRegOperand::maxRegIdx = 0; + setPrivateSize(0); + + const BrigBase *entryPtr = brigNext((BrigBase*)code_dir); + const BrigBase *endPtr = + obj->getCodeSectionEntry(code_dir->nextModuleEntry); + + int inst_idx = 0; + std::vector<GPUStaticInst*> instructions; + int funcarg_size_scope = 0; + + // walk through instructions in code section and directives in + // directive section in parallel, processing directives that apply + // when we reach the relevant code point. 
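// The walk below assumes that every BRIG entry starts with a BrigBase
// header whose byteCount field gives the entry's total size, so
// advancing to the next entry is conceptually (a sketch, not the
// actual brigNext() implementation):
//
//   const BrigBase *
//   nextEntry(const BrigBase *e)
//   {
//       return (const BrigBase*)((const char*)e + e->byteCount);
//   }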
+ while (entryPtr < endPtr) { + switch (entryPtr->kind) { + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *sym = + (const BrigDirectiveVariable*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_variable, symbol is: %s\n", + obj->getString(sym->name)); + + StorageElement *se = storageMap->addSymbol(sym, obj); + + if (sym->segment == BRIG_SEGMENT_PRIVATE) { + setPrivateSize(se->size); + } else { // spill + funcarg_size_scope += se->size; + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel *lbl = + (const BrigDirectiveLabel*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_label, label is: %s \n", + obj->getString(lbl->name)); + + labelMap.addLabel(lbl, inst_idx, obj); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "Initializing code, directive " + "is kind_pragma\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_comment\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_arg_block_start\n"); + + storageMap->resetOffset(BRIG_SEGMENT_ARG); + funcarg_size_scope = 0; + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_arg_block_end\n"); + + funcarg_size = funcarg_size < funcarg_size_scope ? + funcarg_size_scope : funcarg_size; + } + break; + + case BRIG_KIND_DIRECTIVE_END: + DPRINTF(HSAILObject, "Initializing code, dircetive is " + "kind_end\n"); + + break; + + default: + if (entryPtr->kind >= BRIG_KIND_INST_BEGIN && + entryPtr->kind <= BRIG_KIND_INST_END) { + + BrigInstBase *instPtr = (BrigInstBase*)entryPtr; + TheGpuISA::MachInst machInst = { instPtr, obj }; + GPUStaticInst *iptr = decoder.decode(machInst); + + if (iptr) { + DPRINTF(HSAILObject, "Initializing code, processing inst " + "#%d idx %d: OPCODE=%d\n", + inst_idx, _insts.size(), instPtr->opcode); + + TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr); + iptr->instNum(inst_idx); + _insts.push_back(inst_num); + instructions.push_back(iptr); + } + ++inst_idx; + } else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + entryPtr->kind < BRIG_KIND_OPERAND_END) { + warn("unexpected operand entry in code segment\n"); + } else { + // there are surely some more cases we will need to handle, + // but we'll deal with them as we find them. 
+ fatal("unexpected directive kind %d inside kernel scope\n", + entryPtr->kind); + } + } + + entryPtr = brigNext(entryPtr); + } + + // compute Control Flow Graph for current kernel + ControlFlowInfo::assignImmediatePostDominators(instructions); + + max_sreg = SRegOperand::maxRegIdx; + max_dreg = DRegOperand::maxRegIdx; + max_creg = CRegOperand::maxRegIdx; + + obj->currentCode = nullptr; +} + +HsailCode::HsailCode(const std::string &name_str, + const BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ + init(code_dir, obj, objStorageMap); +} + +void +LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index, + const BrigObject *obj) +{ + std::string lbl_name = obj->getString(lblDir->name); + Label &lbl = map[lbl_name]; + + if (lbl.defined()) { + fatal("Attempt to redefine existing label %s\n", lbl_name); + } + + lbl.define(lbl_name, inst_index); + DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index); +} + +Label* +LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir, + const BrigObject *obj) +{ + std::string name = obj->getString(lblDir->name); + Label &lbl = map[name]; + lbl.checkName(name); + + return &lbl; +} + +int +getBrigDataTypeBytes(BrigType16_t t) +{ + switch (t) { + case BRIG_TYPE_S8: + case BRIG_TYPE_U8: + case BRIG_TYPE_B8: + return 1; + + case BRIG_TYPE_S16: + case BRIG_TYPE_U16: + case BRIG_TYPE_B16: + case BRIG_TYPE_F16: + return 2; + + case BRIG_TYPE_S32: + case BRIG_TYPE_U32: + case BRIG_TYPE_B32: + case BRIG_TYPE_F32: + return 4; + + case BRIG_TYPE_S64: + case BRIG_TYPE_U64: + case BRIG_TYPE_B64: + case BRIG_TYPE_F64: + return 8; + + case BRIG_TYPE_B1: + + default: + fatal("unhandled symbol data type %d", t); + return 0; + } +} + +StorageElement* +StorageSpace::addSymbol(const BrigDirectiveVariable *sym, + const BrigObject *obj) +{ + const char *sym_name = obj->getString(sym->name); + uint64_t size = 0; + uint64_t offset = 0; + + if (sym->type & BRIG_TYPE_ARRAY) { + size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY); + size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo); + + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type & + ~BRIG_TYPE_ARRAY)); + } else { + size = getBrigDataTypeBytes(sym->type); + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type)); + } + + nextOffset = offset + size; + + DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n", + segmentNames[segment], sym_name, size, offset, sym->init); + + StorageElement* se = new StorageElement(sym_name, offset, size, sym); + elements.push_back(se); + elements_by_addr.insert(AddrRange(offset, offset + size - 1), se); + elements_by_brigptr[sym] = se; + + return se; +} + +StorageElement* +StorageSpace::findSymbol(std::string name) +{ + for (auto it : elements) { + if (it->name == name) { + return it; + } + } + + return nullptr; +} + +StorageElement* +StorageSpace::findSymbol(uint64_t addr) +{ + assert(elements_by_addr.size() > 0); + + auto se = elements_by_addr.find(addr); + + if (se == elements_by_addr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageElement* +StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr) +{ + assert(elements_by_brigptr.size() > 0); + + auto se = elements_by_brigptr.find(brigptr); + + if (se == elements_by_brigptr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageMap::StorageMap(StorageMap *outerScope) + : outerScopeMap(outerScope) +{ + for (int i = 
0; i < NumSegments; ++i) + space[i] = new StorageSpace((BrigSegment)i); +} + +StorageElement* +StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj) +{ + BrigSegment8_t segment = sym->segment; + + assert(segment >= Brig::BRIG_SEGMENT_FLAT); + assert(segment < NumSegments); + + return space[segment]->addSymbol(sym, obj); +} + +int +StorageMap::getSize(Brig::BrigSegment segment) +{ + assert(segment > Brig::BRIG_SEGMENT_GLOBAL); + assert(segment < NumSegments); + + if (segment != Brig::BRIG_SEGMENT_GROUP && + segment != Brig::BRIG_SEGMENT_READONLY) { + return space[segment]->getSize(); + } else { + int ret = space[segment]->getSize(); + + if (outerScopeMap) { + ret += outerScopeMap->getSize(segment); + } + + return ret; + } +} + +void +StorageMap::resetOffset(Brig::BrigSegment segment) +{ + space[segment]->resetOffset(); +} + +StorageElement* +StorageMap::findSymbol(BrigSegment segment, std::string name) +{ + StorageElement *se = space[segment]->findSymbol(name); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, name); + + return nullptr; +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(addr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, addr); + + return nullptr; + +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, + const BrigDirectiveVariable *brigptr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(brigptr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, brigptr); + + return nullptr; + +} diff --git a/src/gpu-compute/hsail_code.hh b/src/gpu-compute/hsail_code.hh new file mode 100644 index 000000000..d9fbcc577 --- /dev/null +++ b/src/gpu-compute/hsail_code.hh @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __HSAIL_CODE_HH__ +#define __HSAIL_CODE_HH__ + +#include <cassert> +#include <list> +#include <map> +#include <string> +#include <vector> + +#include "arch/gpu_decoder.hh" +#include "arch/hsail/Brig.h" +#include "base/addr_range_map.hh" +#include "base/intmath.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/misc.hh" + +class BrigObject; +class GPUStaticInst; + +inline int +popcount(uint64_t src, int sz) +{ + int cnt = 0; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + ++cnt; + src >>= 1; + } + + return cnt; +} + +inline int +firstbit(uint64_t src, int sz) +{ + int i; + + for (i = 0; i < sz; ++i) { + if (src & 1) + break; + src >>= 1; + } + + return i; +} + +inline int +lastbit(uint64_t src, int sz) +{ + int i0 = -1; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + + return i0; +} + +inline int +signbit(uint64_t src, int sz) +{ + int i0 = -1; + + if (src & (1 << (sz - 1))) { + for (int i = 0; i < sz - 1; ++i) { + if (!(src & 1)) + i0 = i; + src >>= 1; + } + } else { + for (int i = 0; i < sz - 1; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + } + + return i0; +} + +inline uint64_t +bitrev(uint64_t src, int sz) +{ + uint64_t r = 0; + + for (int i = 0; i < sz; ++i) { + r <<= 1; + if (src & 1) + r |= 1; + src >>= 1; + } + + return r; +} + +inline uint64_t +mul_hi(uint32_t a, uint32_t b) +{ + return ((uint64_t)a * (uint64_t)b) >> 32; +} + +inline uint64_t +mul_hi(int32_t a, int32_t b) +{ + return ((int64_t)a * (int64_t)b) >> 32; +} + +inline uint64_t +mul_hi(uint64_t a, uint64_t b) +{ + return ((uint64_t)a * (uint64_t)b) >> 32; +} + +inline uint64_t +mul_hi(int64_t a, int64_t b) +{ + return ((int64_t)a * (int64_t)b) >> 32; +} + +inline uint64_t +mul_hi(double a, double b) +{ + return 0; +} + +class Label +{ + public: + std::string name; + int value; + + Label() : value(-1) + { + } + + bool defined() { return value != -1; } + + void + checkName(std::string &_name) + { + if (name.empty()) { + name = _name; + } else { + assert(name == _name); + } + } + + void + define(std::string &_name, int _value) + { + assert(!defined()); + assert(_value != -1); + value = _value; + checkName(_name); + } + + int + get() + { + assert(defined()); + return value; + } +}; + +class LabelMap +{ + std::map<std::string, Label> map; + + public: + LabelMap() { } + + void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index, + const BrigObject *obj); + + Label *refLabel(const Brig::BrigDirectiveLabel *lbl, + const BrigObject *obj); +}; + +const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN; + +extern const char *segmentNames[]; + +class StorageElement +{ + public: + std::string name; + uint64_t offset; + + uint64_t size; + const Brig::BrigDirectiveVariable *brigSymbol; + StorageElement(const char *_name, uint64_t _offset, int _size, + const Brig::BrigDirectiveVariable *sym) + : name(_name), offset(_offset), 
size(_size), brigSymbol(sym) + { + } +}; + +class StorageSpace +{ + typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*> + DirVarToSE_map; + + std::list<StorageElement*> elements; + AddrRangeMap<StorageElement*> elements_by_addr; + DirVarToSE_map elements_by_brigptr; + + uint64_t nextOffset; + Brig::BrigSegment segment; + + public: + StorageSpace(Brig::BrigSegment _class) + : nextOffset(0), segment(_class) + { + } + + StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym, + const BrigObject *obj); + + StorageElement* findSymbol(std::string name); + StorageElement* findSymbol(uint64_t addr); + StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr); + + int getSize() { return nextOffset; } + void resetOffset() { nextOffset = 0; } +}; + +class StorageMap +{ + StorageMap *outerScopeMap; + StorageSpace *space[NumSegments]; + + public: + StorageMap(StorageMap *outerScope = nullptr); + + StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym, + const BrigObject *obj); + + StorageElement* findSymbol(Brig::BrigSegment segment, std::string name); + StorageElement* findSymbol(Brig::BrigSegment segment, uint64_t addr); + + StorageElement* findSymbol(Brig::BrigSegment segment, + const Brig::BrigDirectiveVariable *brigptr); + + // overloaded version to avoid casting + StorageElement* + findSymbol(Brig::BrigSegment8_t segment, std::string name) + { + return findSymbol((Brig::BrigSegment)segment, name); + } + + int getSize(Brig::BrigSegment segment); + void resetOffset(Brig::BrigSegment segment); +}; + +typedef enum +{ + BT_DEFAULT, + BT_B8, + BT_U8, + BT_U16, + BT_U32, + BT_U64, + BT_S8, + BT_S16, + BT_S32, + BT_S64, + BT_F16, + BT_F32, + BT_F64, + BT_NULL +} base_type_e; + +/* @class HsailCode + * the HsailCode class is used to store information + * about HSA kernels stored in the BRIG format. it holds + * all information about a kernel, function, or variable + * symbol and provides methods for accessing that + * information. + */ + +class HsailCode final : public HsaCode +{ + public: + TheGpuISA::Decoder decoder; + + StorageMap *storageMap; + LabelMap labelMap; + uint32_t kernarg_start; + uint32_t kernarg_end; + int32_t private_size; + + int32_t readonly_size; + + // We track the maximum register index used for each register + // class when we load the code so we can size the register files + // appropriately (i.e., one more than the max index). 
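+ // For example, a kernel whose highest s-register index is 7 sets + // max_sreg to 7, and generateHsaKernelInfo() below reports + // sRegCount = max_sreg + 1 = 8.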
+ uint32_t max_creg; // maximum c-register index + uint32_t max_sreg; // maximum s-register index + uint32_t max_dreg; // maximum d-register index + + HsailCode(const std::string &name_str, + const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, + StorageMap *objStorageMap); + + // this version is used to create a placeholder when + // we encounter a kernel-related directive before the + // kernel itself + HsailCode(const std::string &name_str); + + void init(const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap); + + void + generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const + { + hsaKernelInfo->sRegCount = max_sreg + 1; + hsaKernelInfo->dRegCount = max_dreg + 1; + hsaKernelInfo->cRegCount = max_creg + 1; + + hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP); + + hsaKernelInfo->private_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8); + + hsaKernelInfo->spill_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8); + } + + int + getSize(MemorySegment segment) const + { + Brig::BrigSegment brigSeg; + + switch (segment) { + case MemorySegment::NONE: + brigSeg = Brig::BRIG_SEGMENT_NONE; + break; + case MemorySegment::FLAT: + brigSeg = Brig::BRIG_SEGMENT_FLAT; + break; + case MemorySegment::GLOBAL: + brigSeg = Brig::BRIG_SEGMENT_GLOBAL; + break; + case MemorySegment::READONLY: + brigSeg = Brig::BRIG_SEGMENT_READONLY; + break; + case MemorySegment::KERNARG: + brigSeg = Brig::BRIG_SEGMENT_KERNARG; + break; + case MemorySegment::GROUP: + brigSeg = Brig::BRIG_SEGMENT_GROUP; + break; + case MemorySegment::PRIVATE: + brigSeg = Brig::BRIG_SEGMENT_PRIVATE; + break; + case MemorySegment::SPILL: + brigSeg = Brig::BRIG_SEGMENT_SPILL; + break; + case MemorySegment::ARG: + brigSeg = Brig::BRIG_SEGMENT_ARG; + break; + case MemorySegment::EXTSPACE0: + brigSeg = Brig::BRIG_SEGMENT_AMD_GCN; + break; + default: + fatal("Unknown BrigSegment type.\n"); + } + + return getSize(brigSeg); + } + + private: + int + getSize(Brig::BrigSegment segment) const + { + if (segment == Brig::BRIG_SEGMENT_PRIVATE) { + // with the code generated by new HSA compiler the assertion + // does not hold anymore.. + //assert(private_size != -1); + return private_size; + } else { + return storageMap->getSize(segment); + } + } + + public: + StorageElement* + findSymbol(Brig::BrigSegment segment, uint64_t addr) + { + return storageMap->findSymbol(segment, addr); + } + + void + setPrivateSize(int32_t _private_size) + { + private_size = _private_size; + } + + Label* + refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj) + { + return labelMap.refLabel(lbl, obj); + } +}; + +#endif // __HSAIL_CODE_HH__ diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc new file mode 100644 index 000000000..7e0e10912 --- /dev/null +++ b/src/gpu-compute/kernel_cfg.cc @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/kernel_cfg.hh" + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <cstring> +#include <iostream> +#include <iterator> +#include <map> +#include <string> + +#include "gpu-compute/gpu_static_inst.hh" + +void +ControlFlowInfo::assignImmediatePostDominators( + const std::vector<GPUStaticInst*>& instructions) +{ + ControlFlowInfo cfg(instructions); + cfg.findImmediatePostDominators(); +} + + +ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) : + instructions(insts) +{ + createBasicBlocks(); + connectBasicBlocks(); +} + +BasicBlock* +ControlFlowInfo::basicBlock(int inst_num) const { + for (auto& block: basicBlocks) { + int first_block_id = block->firstInstruction->instNum(); + if (inst_num >= first_block_id && + inst_num < first_block_id + block->size) { + return block.get(); + } + } + return nullptr; +} + + +GPUStaticInst* +ControlFlowInfo::lastInstruction(const BasicBlock* block) const +{ + if (block->isExit()) { + return nullptr; + } + + return instructions.at(block->firstInstruction->instNum() + + block->size - 1); +} + +BasicBlock* +ControlFlowInfo::postDominator(const BasicBlock* block) const +{ + if (block->isExit()) { + return nullptr; + } + return basicBlock(lastInstruction(block)->ipdInstNum()); +} + +void +ControlFlowInfo::createBasicBlocks() +{ + assert(!instructions.empty()); + std::set<int> leaders; + // first instruction is a leader + leaders.insert(0); + for (int i = 1; i < instructions.size(); i++) { + GPUStaticInst* instruction = instructions[i]; + if (instruction->o_type == Enums::OT_BRANCH) { + const int target_pc = instruction->getTargetPc(); + leaders.insert(target_pc); + leaders.insert(i + 1); + } + } + + size_t block_size = 0; + for (int i = 0; i < instructions.size(); i++) { + if (leaders.find(i) != leaders.end()) { + uint32_t id = basicBlocks.size(); + if (id > 0) { + basicBlocks.back()->size = block_size; + } + block_size = 0; + basicBlocks.emplace_back(new BasicBlock(id, instructions[i])); + } + block_size++; + } + basicBlocks.back()->size = block_size; + // exit basic block + basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr)); +} + +void +ControlFlowInfo::connectBasicBlocks() +{ + BasicBlock* exit_bb = basicBlocks.back().get(); + for (auto& bb 
: basicBlocks) { + if (bb->isExit()) { + break; + } + GPUStaticInst* last = lastInstruction(bb.get()); + if (last->o_type == Enums::OT_RET) { + bb->successorIds.insert(exit_bb->id); + break; + } + if (last->o_type == Enums::OT_BRANCH) { + const uint32_t target_pc = last->getTargetPc(); + BasicBlock* target_bb = basicBlock(target_pc); + bb->successorIds.insert(target_bb->id); + } + + // Unconditional jump instructions have a unique successor + if (!last->unconditionalJumpInstruction()) { + BasicBlock* next_bb = basicBlock(last->instNum() + 1); + bb->successorIds.insert(next_bb->id); + } + } +} + + +// In-place set intersection +static void +intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b) +{ + std::set<uint32_t>::iterator it = a.begin(); + while (it != a.end()) { + it = b.find(*it) != b.end() ? ++it : a.erase(it); + } +} + + +void +ControlFlowInfo::findPostDominators() +{ + // the only postdominator of the exit block is itself + basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id); + //copy all basic blocks to all postdominator lists except for exit block + for (auto& block : basicBlocks) { + if (!block->isExit()) { + for (uint32_t i = 0; i < basicBlocks.size(); i++) { + block->postDominatorIds.insert(i); + } + } + } + + bool change = true; + while (change) { + change = false; + for (int h = basicBlocks.size() - 2; h >= 0; --h) { + size_t num_postdominators = + basicBlocks[h]->postDominatorIds.size(); + for (int s : basicBlocks[h]->successorIds) { + intersect(basicBlocks[h]->postDominatorIds, + basicBlocks[s]->postDominatorIds); + } + basicBlocks[h]->postDominatorIds.insert(h); + change |= (num_postdominators + != basicBlocks[h]->postDominatorIds.size()); + } + } +} + + +// In-place set difference +static void +setDifference(std::set<uint32_t>&a, + const std::set<uint32_t>& b, uint32_t exception) +{ + for (uint32_t b_elem : b) { + if (b_elem != exception) { + a.erase(b_elem); + } + } +} + +void +ControlFlowInfo::findImmediatePostDominators() +{ + assert(basicBlocks.size() > 1); // Entry and exit blocks must be present + + findPostDominators(); + + for (auto& basicBlock : basicBlocks) { + if (basicBlock->isExit()) { + continue; + } + std::set<uint32_t> candidates = basicBlock->postDominatorIds; + candidates.erase(basicBlock->id); + for (uint32_t postDominatorId : basicBlock->postDominatorIds) { + if (postDominatorId != basicBlock->id) { + setDifference(candidates, + basicBlocks[postDominatorId]->postDominatorIds, + postDominatorId); + } + } + assert(candidates.size() == 1); + GPUStaticInst* last_instruction = lastInstruction(basicBlock.get()); + BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get(); + if (!ipd_block->isExit()) { + GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction; + last_instruction->ipdInstNum(ipd_first_inst->instNum()); + } else { + last_instruction->ipdInstNum(last_instruction->instNum() + 1); + } + } +} + +void +ControlFlowInfo::printPostDominators() const +{ + for (auto& block : basicBlocks) { + std::cout << "PD(" << block->id << ") = {"; + std::copy(block->postDominatorIds.begin(), + block->postDominatorIds.end(), + std::ostream_iterator<uint32_t>(std::cout, ", ")); + std::cout << "}" << std::endl; + } +} + +void +ControlFlowInfo::printImmediatePostDominators() const +{ + for (const auto& block : basicBlocks) { + if (block->isExit()) { + continue; + } + std::cout << "IPD(" << block->id << ") = "; + std::cout << postDominator(block.get())->id << ", "; + } + std::cout << std::endl; +} +void 
+ControlFlowInfo::printBasicBlocks() const +{ + for (GPUStaticInst* inst : instructions) { + int inst_num = inst->instNum(); + std::cout << inst_num << " [" << basicBlock(inst_num)->id + << "]: " << inst->disassemble(); + if (inst->o_type == Enums::OT_BRANCH) { + std::cout << ", PC = " << inst->getTargetPc(); + } + std::cout << std::endl; + } +} + +void +ControlFlowInfo::printBasicBlockDot() const +{ + printf("digraph {\n"); + for (const auto& basic_block : basicBlocks) { + printf("\t"); + for (uint32_t successorId : basic_block->successorIds) { + printf("%d -> %d; ", basic_block->id, successorId); + } + printf("\n"); + } + printf("}\n"); +} diff --git a/src/gpu-compute/kernel_cfg.hh b/src/gpu-compute/kernel_cfg.hh new file mode 100644 index 000000000..74ea861d8 --- /dev/null +++ b/src/gpu-compute/kernel_cfg.hh @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __KERNEL_CFG_HH__ +#define __KERNEL_CFG_HH__ + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <set> +#include <vector> + + +class GPUStaticInst; +class HsailCode; + +struct BasicBlock +{ + BasicBlock(uint32_t num, GPUStaticInst* begin) : + id(num), size(0), firstInstruction(begin) + { + } + + bool + isEntry() const + { + return !id; + } + + bool + isExit() const + { + return !size; + } + + /** + * Unique identifier for the block within a given kernel. + */ + const uint32_t id; + + /** + * Number of instructions contained in the block. + */ + size_t size; + + /** + * Pointer to first instruction of the block. + */ + GPUStaticInst* firstInstruction; + + /** + * Identifiers of the blocks that immediately follow (are direct + * successors of) this block. + */ + std::set<uint32_t> successorIds; + + /** + * Identifiers of the blocks that post-dominate this block, i.e., the + * blocks that every path from this block to the exit passes through.
+ */ + std::set<uint32_t> postDominatorIds; +}; + +class ControlFlowInfo +{ +public: + + /** + * Compute immediate post-dominator instruction for kernel instructions. + */ + static void assignImmediatePostDominators( + const std::vector<GPUStaticInst*>& instructions); + +private: + ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions); + + GPUStaticInst* lastInstruction(const BasicBlock* block) const; + + BasicBlock* basicBlock(int inst_num) const; + + BasicBlock* postDominator(const BasicBlock* block) const; + + void createBasicBlocks(); + + void connectBasicBlocks(); + + void findPostDominators(); + + void findImmediatePostDominators(); + + void printBasicBlocks() const; + + void printBasicBlockDot() const; + + void printPostDominators() const; + + void printImmediatePostDominators() const; + + std::vector<std::unique_ptr<BasicBlock>> basicBlocks; + std::vector<GPUStaticInst*> instructions; +}; + +#endif // __KERNEL_CFG_HH__ diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc new file mode 100644 index 000000000..91ee8009a --- /dev/null +++ b/src/gpu-compute/lds_state.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Joe Gross + */ + +#include "gpu-compute/lds_state.hh" + +#include <array> +#include <cstdio> +#include <cstdlib> + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +/** + * the default constructor that works with SWIG + */ +LdsState::LdsState(const Params *params) : + MemObject(params), + tickEvent(this), + cuPort(name() + ".port", this), + maximumSize(params->size), + range(params->range), + bankConflictPenalty(params->bankConflictPenalty), + banks(params->banks) +{ + fatal_if(params->banks <= 0, + "Number of LDS banks should be positive number"); + fatal_if((params->banks & (params->banks - 1)) != 0, + "Number of LDS banks should be a power of 2"); + fatal_if(params->size <= 0, + "cannot allocate an LDS with a size less than 1"); + fatal_if(params->size % 2, + "the LDS should be an even number"); +} + +/** + * Needed by the SWIG compiler + */ +LdsState * +LdsStateParams::create() +{ + return new LdsState(this); +} + +/** + * set the parent and name based on the parent + */ +void +LdsState::setParent(ComputeUnit *x_parent) +{ + // check that this gets assigned to the same thing each time + fatal_if(!x_parent, "x_parent should not be nullptr"); + fatal_if(x_parent == parent, + "should not be setting the parent twice"); + + parent = x_parent; + _name = x_parent->name() + ".LdsState"; +} + +/** + * derive the gpu mem packet from the packet and then count the bank conflicts + */ +unsigned +LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses) +{ + Packet::SenderState *baseSenderState = packet->senderState; + while (baseSenderState->predecessor) { + baseSenderState = baseSenderState->predecessor; + } + const ComputeUnit::LDSPort::SenderState *senderState = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState); + + fatal_if(!senderState, + "did not get the right sort of sender state"); + + GPUDynInstPtr gpuDynInst = senderState->getMemInst(); + + return countBankConflicts(gpuDynInst, bankAccesses); +} + +// Count the total number of bank conflicts for the local memory packet +unsigned +LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses) +{ + int bank_conflicts = 0; + std::vector<int> bank; + // the number of LDS banks being touched by the memory instruction + int numBanks = std::min(parent->wfSize(), banks); + // if the wavefront size is larger than the number of LDS banks, we + // need to iterate over all work items to calculate the total + // number of bank conflicts + int groups = (parent->wfSize() > numBanks) ? 
+ (parent->wfSize() / numBanks) : 1; + for (int i = 0; i < groups; i++) { + // Address Array holding all the work item addresses of an instruction + std::vector<Addr> addr_array; + addr_array.resize(numBanks, 0); + bank.clear(); + bank.resize(banks, 0); + int max_bank = 0; + + // populate the address array for all active work items + for (int j = 0; j < numBanks; j++) { + if (gpuDynInst->exec_mask[(i*numBanks)+j]) { + addr_array[j] = gpuDynInst->addr[(i*numBanks)+j]; + } else { + addr_array[j] = std::numeric_limits<Addr>::max(); + } + } + + if (gpuDynInst->m_op == Enums::MO_LD || + gpuDynInst->m_op == Enums::MO_ST) { + // mask identical addresses + for (int j = 0; j < numBanks; ++j) { + for (int j0 = 0; j0 < j; j0++) { + if (addr_array[j] != std::numeric_limits<Addr>::max() + && addr_array[j] == addr_array[j0]) { + addr_array[j] = std::numeric_limits<Addr>::max(); + } + } + } + } + // calculate bank conflicts + for (int j = 0; j < numBanks; ++j) { + if (addr_array[j] != std::numeric_limits<Addr>::max()) { + int bankId = addr_array[j] % banks; + bank[bankId]++; + max_bank = std::max(max_bank, bank[bankId]); + // Count the number of LDS banks accessed. + // Since we have masked identical addresses all remaining + // accesses will need to be serialized if they access + // the same bank (bank conflict). + (*numBankAccesses)++; + } + } + bank_conflicts += max_bank; + } + panic_if(bank_conflicts > parent->wfSize(), + "Max bank conflicts should match num of work items per instr"); + return bank_conflicts; +} + +/** + * receive the packet from the CU + */ +bool +LdsState::CuSidePort::recvTimingReq(PacketPtr packet) +{ + return ownerLds->processPacket(packet); +} + +GPUDynInstPtr +LdsState::getDynInstr(PacketPtr packet) +{ + ComputeUnit::LDSPort::SenderState *ss = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>( + packet->senderState); + return ss->getMemInst(); +} + +/** + * process an incoming packet, add it to the return queue + */ +bool +LdsState::processPacket(PacketPtr packet) +{ + unsigned bankAccesses = 0; + // the number of conflicts this packet will have when accessing the LDS + unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); + // count the total number of physical LDS bank accessed + parent->ldsBankAccesses += bankAccesses; + // count the LDS bank conflicts. A number set to 1 indicates one + // access per bank maximum so there are no bank conflicts + parent->ldsBankConflictDist.sample(bankConflicts-1); + + GPUDynInstPtr dynInst = getDynInstr(packet); + // account for the LDS bank conflict overhead + int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : + (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : + parent->loadBusLength(); + // delay for accessing the LDS + Tick processingTime = + parent->shader->ticks(bankConflicts * bankConflictPenalty) + + parent->shader->ticks(busLength); + // choose (delay + last packet in queue) or (now + delay) as the time to + // return this + Tick doneAt = earliestReturnTime() + processingTime; + // then store it for processing + return returnQueuePush(std::make_pair(doneAt, packet)); +} + +/** + * add this to the queue of packets to be returned + */ +bool +LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair) +{ + // TODO add time limits (e.g. 
one packet per cycle) and queue size limits + // and implement flow control + returnQueue.push(thePair); + + // if there is no set wakeup time, look through the queue + if (!tickEvent.scheduled()) { + process(); + } + + return true; +} + +/** + * receive a packet in functional mode + */ +void +LdsState::CuSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("not implemented"); +} + +/** + * receive a retry for a response + */ +void +LdsState::CuSidePort::recvRespRetry() +{ + // TODO verify that this is the right way to do this + assert(ownerLds->isRetryResp()); + ownerLds->setRetryResp(false); + ownerLds->process(); +} + +/** + * receive a retry + */ +void +LdsState::CuSidePort::recvRetry() +{ + fatal("not implemented"); +} + +/** + * look for packets to return at this time + */ +bool +LdsState::process() +{ + Tick now = clockEdge(); + + // send back completed packets + while (!returnQueue.empty() && returnQueue.front().first <= now) { + PacketPtr packet = returnQueue.front().second; + + ComputeUnit::LDSPort::SenderState *ss = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>( + packet->senderState); + + GPUDynInstPtr gpuDynInst = ss->getMemInst(); + + gpuDynInst->initiateAcc(gpuDynInst); + + packet->makeTimingResponse(); + + returnQueue.pop(); + + bool success = cuPort.sendTimingResp(packet); + + if (!success) { + retryResp = true; + panic("have not handled timing responses being NACK'd when sent" + "back"); + } + } + + // determine the next wakeup time + if (!returnQueue.empty()) { + + Tick next = returnQueue.front().first; + + if (tickEvent.scheduled()) { + + if (next < tickEvent.when()) { + + tickEvent.deschedule(); + tickEvent.schedule(next); + } + } else { + tickEvent.schedule(next); + } + } + + return true; +} + +/** + * wake up at this time and perform specified actions + */ +void +LdsState::TickEvent::process() +{ + ldsState->process(); +} + +/** + * + */ +void +LdsState::regStats() +{ +} diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh new file mode 100644 index 000000000..89f08a1d3 --- /dev/null +++ b/src/gpu-compute/lds_state.hh @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Joe Gross + */ + +#ifndef __LDS_STATE_HH__ +#define __LDS_STATE_HH__ + +#include <array> +#include <queue> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "gpu-compute/misc.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "params/LdsState.hh" + +class ComputeUnit; + +/** + * this represents a slice of the overall LDS, intended to be associated with an + * individual workgroup + */ +class LdsChunk +{ + public: + LdsChunk(const uint32_t x_size): + chunk(x_size) + { + } + + LdsChunk() {} + + /** + * a read operation + */ + template<class T> + T + read(const uint32_t index) + { + fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + T *p0 = (T *) (&(chunk.at(index))); + return *p0; + } + + /** + * a write operation + */ + template<class T> + void + write(const uint32_t index, const T value) + { + fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + T *p0 = (T *) (&(chunk.at(index))); + *p0 = value; + } + + /** + * get the size of this chunk + */ + std::vector<uint8_t>::size_type + size() const + { + return chunk.size(); + } + + protected: + // the actual data store for this slice of the LDS + std::vector<uint8_t> chunk; +}; + +// Local Data Share (LDS) State per Wavefront (contents of the LDS region +// allocated to the WorkGroup of this Wavefront) +class LdsState: public MemObject +{ + protected: + + /** + * an event to allow event-driven execution + */ + class TickEvent: public Event + { + protected: + + LdsState *ldsState = nullptr; + + Tick nextTick = 0; + + public: + + TickEvent(LdsState *_ldsState) : + ldsState(_ldsState) + { + } + + virtual void + process(); + + void + schedule(Tick when) + { + mainEventQueue[0]->schedule(this, when); + } + + void + deschedule() + { + mainEventQueue[0]->deschedule(this); + } + }; + + /** + * CuSidePort is the LDS Port closer to the CU side + */ + class CuSidePort: public SlavePort + { + public: + CuSidePort(const std::string &_name, LdsState *_ownerLds) : + SlavePort(_name, _ownerLds), ownerLds(_ownerLds) + { + } + + protected: + LdsState *ownerLds; + + virtual bool + recvTimingReq(PacketPtr pkt); + + virtual Tick + recvAtomic(PacketPtr pkt) + { + return 0; + } + + virtual void + recvFunctional(PacketPtr pkt); + + virtual void + recvRangeChange() + { + } + + virtual void + recvRetry(); + + virtual void + recvRespRetry(); + + virtual AddrRangeList + getAddrRanges() const + { + AddrRangeList ranges; + ranges.push_back(ownerLds->getAddrRange()); + return ranges; + } + + template<typename T> + void + loadData(PacketPtr packet); + + template<typename T> + void + storeData(PacketPtr packet); + + template<typename T> + void + atomicOperation(PacketPtr packet); + }; + + protected: + + // the 
lds reference counter + // The key is the workgroup ID and dispatch ID + // The value is the number of wavefronts that reference this LDS, as + // wavefronts are launched, the counter goes up for that workgroup and when + // they return it decreases, once it reaches 0 then this chunk of the LDS is + // returned to the available pool. However,it is deallocated on the 1->0 + // transition, not whenever the counter is 0 as it always starts with 0 when + // the workgroup asks for space + std::unordered_map<uint32_t, + std::unordered_map<uint32_t, int32_t>> refCounter; + + // the map that allows workgroups to access their own chunk of the LDS + std::unordered_map<uint32_t, + std::unordered_map<uint32_t, LdsChunk>> chunkMap; + + // an event to allow the LDS to wake up at a specified time + TickEvent tickEvent; + + // the queue of packets that are going back to the CU after a + // read/write/atomic op + // TODO need to make this have a maximum size to create flow control + std::queue<std::pair<Tick, PacketPtr>> returnQueue; + + // whether or not there are pending responses + bool retryResp = false; + + bool + process(); + + GPUDynInstPtr + getDynInstr(PacketPtr packet); + + bool + processPacket(PacketPtr packet); + + unsigned + countBankConflicts(PacketPtr packet, unsigned *bankAccesses); + + unsigned + countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses); + + public: + typedef LdsStateParams Params; + + LdsState(const Params *params); + + // prevent copy construction + LdsState(const LdsState&) = delete; + + ~LdsState() + { + parent = nullptr; + } + + const Params * + params() const + { + return dynamic_cast<const Params *>(_params); + } + + bool + isRetryResp() const + { + return retryResp; + } + + void + setRetryResp(const bool value) + { + retryResp = value; + } + + // prevent assignment + LdsState & + operator=(const LdsState &) = delete; + + /** + * use the dynamic wave id to create or just increase the reference count + */ + int + increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = getRefCounter(dispatchId, wgId); + fatal_if(refCount < 0, + "reference count should not be below zero"); + return ++refCounter[dispatchId][wgId]; + } + + /** + * decrease the reference count after making sure it is in the list + * give back this chunk if the ref counter has reached 0 + */ + int + decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = getRefCounter(dispatchId, wgId); + + fatal_if(refCount <= 0, + "reference count should not be below zero or at zero to" + "decrement"); + + refCounter[dispatchId][wgId]--; + + if (refCounter[dispatchId][wgId] == 0) { + releaseSpace(dispatchId, wgId); + return 0; + } else { + return refCounter[dispatchId][wgId]; + } + } + + /** + * return the current reference count for this workgroup id + */ + int + getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const + { + auto dispatchIter = chunkMap.find(dispatchId); + fatal_if(dispatchIter == chunkMap.end(), + "could not locate this dispatch id [%d]", dispatchId); + + auto workgroup = dispatchIter->second.find(wgId); + fatal_if(workgroup == dispatchIter->second.end(), + "could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + + auto refCountIter = refCounter.find(dispatchId); + if (refCountIter == refCounter.end()) { + fatal("could not locate this dispatch id [%d]", dispatchId); + } else { + auto workgroup = refCountIter->second.find(wgId); + if (workgroup == 
refCountIter->second.end()) { + fatal("could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + } else { + return refCounter.at(dispatchId).at(wgId); + } + } + + fatal("should not reach this point"); + return 0; + } + + /** + * assign a parent and request this amount of space be set aside + * for this wgid + */ + LdsChunk * + reserveSpace(const uint32_t dispatchId, const uint32_t wgId, + const uint32_t size) + { + if (chunkMap.find(dispatchId) != chunkMap.end()) { + fatal_if( + chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(), + "duplicate workgroup ID asking for space in the LDS " + "did[%d] wgid[%d]", dispatchId, wgId); + } + + fatal_if(bytesAllocated + size > maximumSize, + "request would ask for more space than is available"); + + bytesAllocated += size; + + chunkMap[dispatchId].emplace(wgId, LdsChunk(size)); + // make an entry for this workgroup + refCounter[dispatchId][wgId] = 0; + + return &chunkMap[dispatchId][wgId]; + } + + bool + returnQueuePush(std::pair<Tick, PacketPtr> thePair); + + Tick + earliestReturnTime() const + { + // TODO set to max(lastCommand+1, curTick()) + return returnQueue.empty() ? curTick() : returnQueue.back().first; + } + + void + setParent(ComputeUnit *x_parent); + + void + regStats(); + + // accessors + ComputeUnit * + getParent() const + { + return parent; + } + + std::string + getName() + { + return _name; + } + + int + getBanks() const + { + return banks; + } + + ComputeUnit * + getComputeUnit() const + { + return parent; + } + + int + getBankConflictPenalty() const + { + return bankConflictPenalty; + } + + /** + * get the allocated size for this workgroup + */ + std::size_t + ldsSize(const uint32_t x_wgId) + { + return chunkMap[x_wgId].size(); + } + + AddrRange + getAddrRange() const + { + return range; + } + + virtual BaseSlavePort & + getSlavePort(const std::string& if_name, PortID idx) + { + if (if_name == "cuPort") { + // TODO need to set name dynamically at this point? + return cuPort; + } else { + fatal("cannot resolve the port name " + if_name); + } + } + + /** + * can this much space be reserved for a workgroup? 
+ */ + bool + canReserve(uint32_t x_size) const + { + return bytesAllocated + x_size <= maximumSize; + } + + private: + /** + * give back the space + */ + bool + releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId) + { + auto dispatchIter = chunkMap.find(x_dispatchId); + + if (dispatchIter == chunkMap.end()) { + fatal("dispatch id not found [%d]", x_dispatchId); + } else { + auto workgroupIter = dispatchIter->second.find(x_wgId); + if (workgroupIter == dispatchIter->second.end()) { + fatal("workgroup id [%d] not found in dispatch id [%d]", + x_wgId, x_dispatchId); + } + } + + fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(), + "releasing more space than was allocated"); + + bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size(); + chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId)); + return true; + } + + // the port that connects this LDS to its owner CU + CuSidePort cuPort; + + ComputeUnit* parent = nullptr; + + std::string _name; + + // the number of bytes currently reserved by all workgroups + int bytesAllocated = 0; + + // the size of the LDS, the most bytes available + int maximumSize; + + // Address range of this memory + AddrRange range; + + // the penalty, in cycles, for each LDS bank conflict + int bankConflictPenalty = 0; + + // the number of banks in the LDS underlying data store + int banks = 0; +}; + +#endif // __LDS_STATE_HH__ diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc new file mode 100644 index 000000000..7f919c5f4 --- /dev/null +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/local_memory_pipeline.hh" + +#include "debug/GPUPort.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) : + computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size) +{ +} + +void +LocalMemPipeline::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".LocalMemPipeline"; +} + +void +LocalMemPipeline::exec() +{ + // apply any returned shared (LDS) memory operations + GPUDynInstPtr m = !lmReturnedRequests.empty() ? + lmReturnedRequests.front() : nullptr; + + bool accessVrf = true; + if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + accessVrf = + w->computeUnit->vrf[m->simdId]-> + vrfOperandAccessReady(m->seqNum(), w, m, + VrfAccessType::WRITE); + } + + if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf && + computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return + || computeUnit->wfWait.at(m->pipeId).rdy())) { + if (m->v_type == VT_32 && m->m_type == Enums::M_U8) + doSmReturn<uint32_t, uint8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) + doSmReturn<uint32_t, uint16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) + doSmReturn<uint32_t, uint32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) + doSmReturn<int32_t, int8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) + doSmReturn<int32_t, int16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) + doSmReturn<int32_t, int32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) + doSmReturn<float, Float16>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) + doSmReturn<float, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) + doSmReturn<uint64_t, uint8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) + doSmReturn<uint64_t, uint16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) + doSmReturn<uint64_t, uint32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) + doSmReturn<uint64_t, uint64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) + doSmReturn<int64_t, int8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) + doSmReturn<int64_t, int16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) + doSmReturn<int64_t, int32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) + doSmReturn<int64_t, int64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) + doSmReturn<double, Float16>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) + doSmReturn<double, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) + doSmReturn<double, double>(m); + } + + // If pipeline has executed a local memory instruction + // execute local memory packet and issue the packets + // to LDS + if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) { + + GPUDynInstPtr m = lmIssuedRequests.front(); + + bool returnVal = computeUnit->sendToLds(m); + if (!returnVal) { + DPRINTF(GPUPort, "packet was nack'd and put in retry queue"); + } + lmIssuedRequests.pop(); + } +} + +template<typename c0, typename c1> +void +LocalMemPipeline::doSmReturn(GPUDynInstPtr m) +{ + 
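// Retire the LDS response at the head of lmReturnedRequests: for loads + // and atomics, copy the returned data into the mapped physical VGPRs + // (a purely functional write), model the timing of the VRF write, + // decrement the wavefront's outstanding request counters, and mark the + // LDS-to-VRF bus busy. +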
lmReturnedRequests.pop(); + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + // Return data to registers + if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + std::vector<uint32_t> regVec; + for (int k = 0; k < m->n_reg; ++k) { + int dst = m->dst_reg+k; + + if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + dst = m->dst_reg_vec[k]; + // virtual->physical VGPR mapping + int physVgpr = w->remap(dst,sizeof(c0),1); + // save the physical VGPR index + regVec.push_back(physVgpr); + c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (m->exec_mask[i]) { + // write the value into the physical VGPR. This is a purely + // functional operation. No timing is modeled. + w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. This simply + // models the timing aspect of the VRF write operation. It does not + // modify the physical VGPR. + loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w, + regVec, sizeof(c0), m->time); + } + + // Decrement outstanding request count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) + || MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm, + m->time, -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->locMemToVrfBus.set(m->time); + if (computeUnit->shader->coissue_return == 0) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +LocalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles LDS data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh new file mode 100644 index 000000000..a63d867d0 --- /dev/null +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __LOCAL_MEMORY_PIPELINE_HH__ +#define __LOCAL_MEMORY_PIPELINE_HH__ + +#include <queue> +#include <string> + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file local_memory_pipeline.hh + * + * The local memory pipeline issues newly created local memory packets + * from pipeline to the LDS. This stage also retires previously issued + * loads and stores that have returned from the LDS. + */ + +class ComputeUnit; +class Wavefront; + +class LocalMemPipeline +{ + public: + LocalMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m); + + std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; } + std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; } + + bool + isLMRespFIFOWrRdy() const + { + return lmReturnedRequests.size() < lmQueueSize; + } + + bool + isLMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (lmIssuedRequests.size() + pendReqs) < lmQueueSize; + } + + const std::string& name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int lmQueueSize; + Stats::Scalar loadVrfBankConflictCycles; + // Local Memory Request Fifo: all shared memory requests + // are issued to this FIFO from the memory pipelines + std::queue<GPUDynInstPtr> lmIssuedRequests; + + // Local Memory Response Fifo: all responses of shared memory + // requests are sent to this FIFO from LDS + std::queue<GPUDynInstPtr> lmReturnedRequests; +}; + +#endif // __LOCAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh new file mode 100644 index 000000000..4f8032832 --- /dev/null +++ b/src/gpu-compute/misc.hh @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __MISC_HH__ +#define __MISC_HH__ + +#include <bitset> +#include <memory> + +#include "base/misc.hh" + +class GPUDynInst; + +// wavefront size of the machine +static const int VSZ = 64; + +/* + This check is necessary because std::bitset only provides conversion to + unsigned long or unsigned long long via to_ulong() or to_ullong(). there are + a few places in the code where to_ullong() is used, however if VSZ is larger + than a value the host can support then bitset will throw a runtime exception. + + we should remove all use of to_long() or to_ullong() so we can have VSZ + greater than 64b, however until that is done this assert is required. + */ +static_assert(VSZ <= sizeof(unsigned long long) * 8, + "VSZ is larger than the host can support"); + +typedef std::bitset<VSZ> VectorMask; +typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr; + +class WaitClass +{ + public: + WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { } + void init(uint64_t *_tcnt, uint32_t _numStages=0) + { + tcnt = _tcnt; + numStages = _numStages; + } + + void set(uint32_t i) + { + fatal_if(nxtAvail > *tcnt, + "Can't allocate resource because it is busy!!!"); + nxtAvail = *tcnt + i; + } + void preset(uint32_t delay) + { + lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages); + } + bool rdy() const { return *tcnt >= nxtAvail; } + bool prerdy() const { return *tcnt >= lookAheadAvail; } + + private: + // timestamp indicating when resource will be available + uint64_t nxtAvail; + // timestamp indicating when resource will be available including + // pending uses of the resource (when there is a cycle gap between + // rdy() and set() + uint64_t lookAheadAvail; + // current timestamp + uint64_t *tcnt; + // number of stages between checking if a resource is ready and + // setting the resource's utilization + uint32_t numStages; +}; + +class Float16 +{ + public: + uint16_t val; + + Float16() { val = 0; } + + Float16(const Float16 &x) : val(x.val) { } + + Float16(float x) + { + uint32_t ai = *(uint32_t *)&x; + + uint32_t s = (ai >> 31) & 0x1; + uint32_t exp = (ai >> 23) & 0xff; + uint32_t mant = (ai >> 0) & 0x7fffff; + + if (exp == 0 || exp <= 0x70) { + exp = 0; + mant = 0; + } else if (exp == 0xff) { + exp = 0x1f; + } else if (exp >= 0x8f) { + exp = 0x1f; + mant = 0; + } else { + exp = exp - 0x7f + 0x0f; + } + + mant = mant >> 13; + + val = 0; + val |= (s << 15); + val |= (exp << 10); + val |= (mant << 0); + } + + operator float() const + { + uint32_t s = (val >> 15) & 0x1; + uint32_t exp = (val >> 10) & 0x1f; + uint32_t mant = (val >> 0) & 0x3ff; + + if (!exp) { + exp = 0; + mant = 0; + } else if (exp == 0x1f) { + exp = 0xff; + } else { + exp = exp - 0x0f + 0x7f; + } + + uint32_t val1 = 0; + val1 |= (s << 31); + val1 |= (exp << 23); + val1 |= (mant << 13); + + return *(float*)&val1; + } +}; + +#endif // __MISC_HH__ diff --git a/src/gpu-compute/ndrange.hh b/src/gpu-compute/ndrange.hh new file mode 100644 index 
000000000..d1ad35d4b --- /dev/null +++ b/src/gpu-compute/ndrange.hh @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __NDRANGE_HH__ +#define __NDRANGE_HH__ + +#include "base/types.hh" +#include "gpu-compute/qstruct.hh" + +struct NDRange +{ + // copy of the queue entry provided at dispatch + HsaQueueEntry q; + + // The current workgroup id (3 dimensions) + int wgId[3]; + // The number of workgroups in each dimension + int numWg[3]; + // The total number of workgroups + int numWgTotal; + + // The number of completed work groups + int numWgCompleted; + // The global workgroup ID + uint32_t globalWgId; + + // flag indicating whether all work groups have been launched + bool wg_disp_rem; + // kernel complete + bool execDone; + bool userDoorBellSet; + volatile bool *addrToNotify; + volatile uint32_t *numDispLeft; + int dispatchId; + int curTid; // Current thread id +}; + +#endif // __NDRANGE_HH__ diff --git a/src/gpu-compute/of_scheduling_policy.cc b/src/gpu-compute/of_scheduling_policy.cc new file mode 100644 index 000000000..7f114706a --- /dev/null +++ b/src/gpu-compute/of_scheduling_policy.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/of_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +OFSchedulingPolicy::chooseWave() +{ + // Set when policy choose a wave to schedule + bool waveChosen = false; + Wavefront *selectedWave = nullptr; + int selectedWaveID = -1; + uint32_t selectedPosition = 0; + + for (int position = 0; position < scheduleList->size(); ++position) { + Wavefront *curWave = scheduleList->at(position); + uint32_t curWaveID = curWave->wfDynId; + + // Choosed wave with the lowest wave ID + if (selectedWaveID == -1 || curWaveID < selectedWaveID) { + waveChosen = true; + selectedWaveID = curWaveID; + selectedWave = curWave; + selectedPosition = position; + } + } + + // Check to make sure ready list had atleast one schedulable wave + if (waveChosen) { + scheduleList->erase(scheduleList->begin() + selectedPosition); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +OFSchedulingPolicy::bindList(std::vector<Wavefront*> *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/of_scheduling_policy.hh b/src/gpu-compute/of_scheduling_policy.hh new file mode 100644 index 000000000..684e51a3a --- /dev/null +++ b/src/gpu-compute/of_scheduling_policy.hh @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
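As a usage sketch of the oldest-first policy (illustrative only; in the model the ready list is owned by the compute unit and rebuilt every cycle by the scoreboard-check stage):

#include <vector>
#include "gpu-compute/of_scheduling_policy.hh"

void ofPolicySketch(std::vector<Wavefront*> &readyWaves)
{
    OFSchedulingPolicy oldestFirst;
    oldestFirst.bindList(&readyWaves);   // policy keeps a pointer, not a copy

    if (!readyWaves.empty()) {
        // Returns the wave with the smallest wfDynId and erases it from the
        // bound list; calling chooseWave() on an empty list panics.
        Wavefront *wave = oldestFirst.chooseWave();
        (void)wave;  // the caller would dispatch the wave here
    }
}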
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __OF_SCHEDULING_POLICY_HH__ +#define __OF_SCHEDULING_POLICY_HH__ + +#include <cstddef> +#include <vector> + +#include "base/misc.hh" + +class Wavefront; + +// Oldest First where age is marked by the wave id +class OFSchedulingPolicy +{ + public: + OFSchedulingPolicy() : scheduleList(nullptr) { } + + Wavefront* chooseWave(); + void bindList(std::vector<Wavefront*> *list); + + private: + // List of waves which are participating in scheduling. + // This scheduler selects the oldest wave from this list + std::vector<Wavefront*> *scheduleList; +}; + +#endif // __OF_SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc new file mode 100644 index 000000000..b1bc6b1f3 --- /dev/null +++ b/src/gpu-compute/pool_manager.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/pool_manager.hh" + +PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize) + : _minAllocation(minAlloc), _poolSize(poolSize) +{ + assert(poolSize > 0); +} diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh new file mode 100644 index 000000000..2cb53ce72 --- /dev/null +++ b/src/gpu-compute/pool_manager.hh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __POOL_MANAGER_HH__ +#define __POOL_MANAGER_HH__ + +#include <cassert> +#include <cstdint> +#include <string> + +// Pool Manager Logic +class PoolManager +{ + public: + PoolManager(uint32_t minAlloc, uint32_t poolSize); + uint32_t minAllocation() { return _minAllocation; } + virtual std::string printRegion() = 0; + virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> ®ion) = 0; + virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0; + + virtual uint32_t allocateRegion(const uint32_t size, + uint32_t *reserved) = 0; + + virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0; + uint32_t poolSize() { return _poolSize; } + + private: + // minimum size that can be reserved per allocation + uint32_t _minAllocation; + // pool size in number of elements + uint32_t _poolSize; +}; + +#endif // __POOL_MANAGER_HH__ diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh new file mode 100644 index 000000000..092303c00 --- /dev/null +++ b/src/gpu-compute/qstruct.hh @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
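PoolManager is purely abstract, so a concrete manager (such as the SimplePoolManager added later in this patch) has to supply the overrides below. The stub only illustrates the required signatures; the bodies are placeholders, not real region-tracking logic.

#include <string>
#include <utility>

#include "gpu-compute/pool_manager.hh"

// Skeleton only: shows the pure-virtual interface a pool manager must fill in.
class DummyPoolManager : public PoolManager
{
  public:
    DummyPoolManager(uint32_t minAlloc, uint32_t poolSize)
        : PoolManager(minAlloc, poolSize) { }

    std::string printRegion() override { return "dummy"; }
    uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) override
    { return 0; }                                   // placeholder
    bool canAllocate(uint32_t numRegions, uint32_t size) override
    { return false; }                               // placeholder
    uint32_t allocateRegion(const uint32_t size, uint32_t *reserved) override
    { *reserved = 0; return 0; }                    // placeholder
    void freeRegion(uint32_t firstIdx, uint32_t lastIdx) override { }
};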
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __Q_STRUCT_HH__ +#define __Q_STRUCT_HH__ + +#include <bitset> +#include <cstdint> + +// Maximum number of arguments +static const int KER_NUM_ARGS = 32; +// Kernel argument buffer size +static const int KER_ARGS_LENGTH = 512; + +class LdsChunk; +struct NDRange; + +// Be very careful of alignment in this structure. The structure +// must compile to the same layout in both 32-bit and 64-bit mode. +struct HsaQueueEntry +{ + // Base pointer for array of instruction pointers + uint64_t code_ptr; + // Grid Size (3 dimensions) + uint32_t gdSize[3]; + // Workgroup Size (3 dimensions) + uint32_t wgSize[3]; + uint16_t sRegCount; + uint16_t dRegCount; + uint16_t cRegCount; + uint64_t privMemStart; + uint32_t privMemPerItem; + uint32_t privMemTotal; + uint64_t spillMemStart; + uint32_t spillMemPerItem; + uint32_t spillMemTotal; + uint64_t roMemStart; + uint32_t roMemTotal; + // Size (in bytes) of LDS + uint32_t ldsSize; + // Virtual Memory Id (unused right now) + uint32_t vmId; + + // Pointer to dependency chain (unused now) + uint64_t depends; + + // pointer to bool + uint64_t addrToNotify; + // pointer to uint32_t + uint64_t numDispLeft; + + // variables to pass arguments when running in standalone mode, + // will be removed when run.py and sh.cpp have been updated to + // use args and offset arrays + uint64_t arg1; + uint64_t arg2; + uint64_t arg3; + uint64_t arg4; + + // variables to pass arguments when running in cpu+gpu mode + uint8_t args[KER_ARGS_LENGTH]; + uint16_t offsets[KER_NUM_ARGS]; + uint16_t num_args; +}; + +// State used to start (or restart) a WF +struct WFContext +{ + // 32 bit values + // barrier state + int bar_cnt[VSZ]; + + // id (which WF in the WG) + int cnt; + + // more barrier state + int max_bar_cnt; + int old_barrier_cnt; + int barrier_cnt; + + // More Program Counter Stuff + uint32_t pc; + + // Program counter of the immediate post-dominator instruction + uint32_t rpc; + + // WG wide state (I don't see how to avoid redundancy here) + int cu_id; + uint32_t wg_id; + uint32_t barrier_id; + + // 64 bit values (these values depend on the wavefront size) + // masks + uint64_t init_mask; + uint64_t exec_mask; + + // private memory; + Addr privBase; + Addr spillBase; + + LdsChunk *ldsChunk; + + /* + * Kernel wide state + * This is a hack. This state should be moved through simulated memory + * during a yield. Though not much is being used here, so it's probably + * probably not a big deal. + * + * Just to add to this comment... The ndr is derived from simulated + * memory when the cl-runtime allocates an HsaQueueEntry and populates it + * for a kernel launch. So in theory the runtime should be able to keep + * that state around. 
Then a WF can reference it upon restart to derive + * kernel wide state. The runtime can deallocate the state when the + * kernel completes. + */ + NDRange *ndr; +}; + +// State that needs to be passed between the simulation and simulated app, a +// pointer to this struct can be passed through the depends field in the +// HsaQueueEntry struct +struct HostState +{ + // cl_event* has original HsaQueueEntry for init + uint64_t event; +}; + +// Total number of HSA queues +static const int HSAQ_NQUEUES = 8; + +// These values will eventually live in memory mapped registers +// and be settable by the kernel mode driver. + +// Number of entries in each HSA queue +static const int HSAQ_SIZE = 64; +// Address of first HSA queue index +static const int HSAQ_INDX_BASE = 0x10000ll; +// Address of first HSA queue +static const int HSAQ_BASE = 0x11000ll; +// Suggested start of HSA code +static const int HSA_CODE_BASE = 0x18000ll; + +// These are shortcuts for deriving the address of a specific +// HSA queue or queue index +#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * n) +#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * n + i) * sizeof(struct fsaQueue)) +#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 0)) +#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 1)) +#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 2)) + +/* + * Example code for writing to a queue + * + * void + * ToQueue(int n,struct fsaQueue *val) + * { + * int wi = *(int*)HSAQ_WI(n); + * int ri = *(int*)HSAQ_RI(n); + * int ci = *(int*)HSAQ_CI(n); + * + * if (ci - ri < HSAQ_SIZE) { + * (*(int*)HSAQ_CI(n))++; + * *(HsaQueueEntry*)(HSAQE(n, (wi % HSAQ_SIZE))) = *val; + * (*(int*)HSAQ_WI(n))++; + * } + * } + */ + +#endif // __Q_STRUCT_HH__ diff --git a/src/gpu-compute/rr_scheduling_policy.cc b/src/gpu-compute/rr_scheduling_policy.cc new file mode 100644 index 000000000..5d3591901 --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.cc @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
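To make the HSAQ address macros in qstruct.hh above concrete, here is the arithmetic for queue 2. A 4-byte int is assumed, and sizeof(struct fsaQueue) is whatever the runtime defines, so the queue-entry address is left symbolic.

// HSAQ_RI(2) = 0x10000 + 4 * (2*3 + 0) = 0x10018   // read index of queue 2
// HSAQ_WI(2) = 0x10000 + 4 * (2*3 + 1) = 0x1001c   // write index of queue 2
// HSAQ_CI(2) = 0x10000 + 4 * (2*3 + 2) = 0x10020   // third per-queue index
// HSAQE(2,i) = 0x11000 + (64*2 + i) * sizeof(struct fsaQueue)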
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/rr_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +RRSchedulingPolicy::chooseWave() +{ + Wavefront *selectedWave = nullptr; + + // Check to make sure ready list had atleast one schedulable wave + if (scheduleList->size()) { + // For RR policy, select the wave which is at the + // front of the list. The selected wave is popped + // out from the schedule list immediately after selection + // to avoid starvation. It is the responsibility of the + // module invoking the RR scheduler to make surei scheduling + // eligible waves are added to the back of the schedule + // list + selectedWave = scheduleList->front(); + scheduleList->erase(scheduleList->begin() + 0); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +RRSchedulingPolicy::bindList(std::vector<Wavefront*> *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh new file mode 100644 index 000000000..780f294aa --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
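The round-robin variant has the same interface but a different contract: chooseWave() pops the list head, and the caller is expected to re-append waves at the back once they become schedulable again. A hedged usage sketch:

#include <vector>
#include "gpu-compute/rr_scheduling_policy.hh"

void rrPolicySketch(std::vector<Wavefront*> &readyWaves)
{
    RRSchedulingPolicy roundRobin;
    roundRobin.bindList(&readyWaves);

    if (!readyWaves.empty()) {
        Wavefront *wave = roundRobin.chooseWave();  // pops the list head
        // ... issue the wave ...
        // The invoking module adds it back at the tail when it is ready
        // again, which is what gives the rotation its fairness.
        readyWaves.push_back(wave);
    }
}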
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __RR_SCHEDULING_POLICY_HH__ +#define __RR_SCHEDULING_POLICY_HH__ + +#include <inttypes.h> + +#include <cstddef> +#include <utility> +#include <vector> + +#include "base/misc.hh" + +class Wavefront; + +// Round-Robin pick among the list of ready waves +class RRSchedulingPolicy +{ + public: + RRSchedulingPolicy() : scheduleList(nullptr) { } + + Wavefront* chooseWave(); + void bindList(std::vector<Wavefront*> *list); + + private: + // List of waves which are participating in scheduling. + // This scheduler selects one wave from this list based on + // round robin policy + std::vector<Wavefront*> *scheduleList; +}; + +#endif // __RR_SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc new file mode 100644 index 000000000..068136026 --- /dev/null +++ b/src/gpu-compute/schedule_stage.cc @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/schedule_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +ScheduleStage::ScheduleStage(const ComputeUnitParams *p) + : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes) +{ + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + Scheduler newScheduler(p); + scheduler.push_back(newScheduler); + } +} + +ScheduleStage::~ScheduleStage() +{ + scheduler.clear(); + waveStatusList.clear(); +} + +void +ScheduleStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ScheduleStage"; + + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + scheduler[j].bindList(&computeUnit->readyList[j]); + } + + for (int j = 0; j < numSIMDs; ++j) { + waveStatusList.push_back(&computeUnit->waveStatusList[j]); + } + + dispatchList = &computeUnit->dispatchList; +} + +void +ScheduleStage::arbitrate() +{ + // iterate over all Memory pipelines + for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) { + if (dispatchList->at(j).first) { + Wavefront *waveToMemPipe = dispatchList->at(j).first; + // iterate over all execution pipelines + for (int i = 0; i < numSIMDs + numMemUnits; ++i) { + if ((i != j) && (dispatchList->at(i).first)) { + Wavefront *waveToExePipe = dispatchList->at(i).first; + // if the two selected wavefronts are mapped to the same + // SIMD unit then they share the VRF + if (waveToMemPipe->simdId == waveToExePipe->simdId) { + int simdId = waveToMemPipe->simdId; + // Read VRF port arbitration: + // If there are read VRF port conflicts between the + // a memory and another instruction we drop the other + // instruction. We don't need to check for write VRF + // port conflicts because the memory instruction either + // does not need to write to the VRF (store) or will + // write to the VRF when the data comes back (load) in + // which case the arbiter of the memory pipes will + // resolve any conflicts + if (computeUnit->vrf[simdId]-> + isReadConflict(waveToMemPipe->wfSlotId, + waveToExePipe->wfSlotId)) { + // FIXME: The "second" member variable is never + // used in the model. 
I am setting it to READY + // simply to follow the protocol of setting it + // when the WF has an instruction ready to issue + waveStatusList[simdId]->at(waveToExePipe->wfSlotId) + .second = READY; + + dispatchList->at(i).first = nullptr; + dispatchList->at(i).second = EMPTY; + break; + } + } + } + } + } + } +} + +void +ScheduleStage::exec() +{ + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + uint32_t readyListSize = computeUnit->readyList[j].size(); + + // If no wave is ready to be scheduled on the execution resource + // then skip scheduling for this execution resource + if (!readyListSize) { + continue; + } + + Wavefront *waveToBeDispatched = scheduler[j].chooseWave(); + dispatchList->at(j).first = waveToBeDispatched; + waveToBeDispatched->updateResources(); + dispatchList->at(j).second = FILLED; + + waveStatusList[waveToBeDispatched->simdId]->at( + waveToBeDispatched->wfSlotId).second = BLOCKED; + + assert(computeUnit->readyList[j].size() == readyListSize - 1); + } + // arbitrate over all shared resources among instructions being issued + // simultaneously + arbitrate(); +} + +void +ScheduleStage::regStats() +{ +} diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh new file mode 100644 index 000000000..26eb9a25b --- /dev/null +++ b/src/gpu-compute/schedule_stage.hh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULE_STAGE_HH__ +#define __SCHEDULE_STAGE_HH__ + +#include <utility> +#include <vector> + +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/scheduler.hh" +#include "gpu-compute/scoreboard_check_stage.hh" + +// Schedule or execution arbitration stage. +// From the pool of ready waves in the ready list, +// one wave is selected for each execution resource. 
+// The selection is made based on a scheduling policy + +class ComputeUnit; +class Wavefront; + +struct ComputeUnitParams; + +class ScheduleStage +{ + public: + ScheduleStage(const ComputeUnitParams *params); + ~ScheduleStage(); + void init(ComputeUnit *cu); + void exec(); + void arbitrate(); + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + uint32_t numSIMDs; + uint32_t numMemUnits; + + // Each execution resource will have its own + // scheduler and a dispatch list + std::vector<Scheduler> scheduler; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList + std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*> + waveStatusList; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. + std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList; + + std::string _name; +}; + +#endif // __SCHEDULE_STAGE_HH__ diff --git a/src/gpu-compute/scheduler.cc b/src/gpu-compute/scheduler.cc new file mode 100644 index 000000000..1cd0bfe55 --- /dev/null +++ b/src/gpu-compute/scheduler.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
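Putting ScheduleStage::exec() and arbitrate() together, the per-cycle behavior is roughly as described below; the resource counts are illustrative, not taken from any configuration in this patch.

// Per-cycle picture with, say, numSIMDs = 4 and numMemUnits = 2:
//   - exec() walks the 6 ready lists; for every non-empty list j it asks
//     scheduler[j] for one wave, places it in dispatchList[j] (FILLED), and
//     marks that wave BLOCKED in waveStatusList.
//   - arbitrate() then inspects the memory-pipe slots: if the wave picked
//     for a memory pipe and the wave picked for any other pipe sit on the
//     same SIMD and their VRF reads conflict, the non-memory wave is
//     dropped again (its slot reset to nullptr/EMPTY) and re-marked READY.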
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/scheduler.hh" + +Scheduler::Scheduler(const ComputeUnitParams *p) +{ + if (p->execPolicy == "OLDEST-FIRST") { + schedPolicy = SCHED_POLICY::OF_POLICY; + } else if (p->execPolicy == "ROUND-ROBIN") { + schedPolicy = SCHED_POLICY::RR_POLICY; + } else { + fatal("Unimplemented scheduling policy"); + } +} + +Wavefront* +Scheduler::chooseWave() +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + return OFSchedPolicy.chooseWave(); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + return RRSchedPolicy.chooseWave(); + } else { + fatal("Unimplemented scheduling policy"); + } +} + +void +Scheduler::bindList(std::vector<Wavefront*> *list) +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + OFSchedPolicy.bindList(list); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + RRSchedPolicy.bindList(list); + } else { + fatal("Unimplemented scheduling policy"); + } +} diff --git a/src/gpu-compute/scheduler.hh b/src/gpu-compute/scheduler.hh new file mode 100644 index 000000000..148ec9425 --- /dev/null +++ b/src/gpu-compute/scheduler.hh @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULER_HH__ +#define __SCHEDULER_HH__ + +#include "gpu-compute/of_scheduling_policy.hh" +#include "gpu-compute/rr_scheduling_policy.hh" +#include "gpu-compute/scheduling_policy.hh" +#include "params/ComputeUnit.hh" + +enum SCHED_POLICY +{ + OF_POLICY = 0, + RR_POLICY +}; + +class Scheduler +{ + public: + Scheduler(const ComputeUnitParams *params); + Wavefront *chooseWave(); + void bindList(std::vector<Wavefront*> *list); + + private: + SCHED_POLICY schedPolicy; + SchedulingPolicy<RRSchedulingPolicy> RRSchedPolicy; + SchedulingPolicy<OFSchedulingPolicy> OFSchedPolicy; +}; + +#endif // __SCHEDULER_HH__ diff --git a/src/gpu-compute/scheduling_policy.hh b/src/gpu-compute/scheduling_policy.hh new file mode 100644 index 000000000..b5e923c62 --- /dev/null +++ b/src/gpu-compute/scheduling_policy.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULING_POLICY_HH__ +#define __SCHEDULING_POLICY_HH__ + +#include <vector> + +template<typename Impl> +class SchedulingPolicy +{ + public: + Wavefront* chooseWave() { return policyImpl.chooseWave(); } + + void + bindList(std::vector<Wavefront*> *list) + { + return policyImpl.bindList(list); + } + + private: + Impl policyImpl; +}; + +#endif // __SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc new file mode 100644 index 000000000..0d856a9b0 --- /dev/null +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/scoreboard_check_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "params/ComputeUnit.hh" + +ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p) + : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), + numGlbMemPipes(p->num_global_mem_pipes), + numShrMemPipes(p->num_shared_mem_pipes), + vectorAluInstAvail(nullptr), + lastGlbMemSimd(-1), + lastShrMemSimd(-1), glbMemInstAvail(nullptr), + shrMemInstAvail(nullptr) +{ +} + +ScoreboardCheckStage::~ScoreboardCheckStage() +{ + readyList.clear(); + waveStatusList.clear(); + shrMemInstAvail = nullptr; + glbMemInstAvail = nullptr; +} + +void +ScoreboardCheckStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ScoreboardCheckStage"; + + for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + readyList.push_back(&computeUnit->readyList[unitId]); + } + + for (int unitId = 0; unitId < numSIMDs; ++unitId) { + waveStatusList.push_back(&computeUnit->waveStatusList[unitId]); + } + + vectorAluInstAvail = &computeUnit->vectorAluInstAvail; + glbMemInstAvail= &computeUnit->glbMemInstAvail; + shrMemInstAvail= &computeUnit->shrMemInstAvail; +} + +void +ScoreboardCheckStage::initStatistics() +{ + lastGlbMemSimd = -1; + lastShrMemSimd = -1; + *glbMemInstAvail = 0; + *shrMemInstAvail = 0; + + for (int unitId = 0; unitId < numSIMDs; ++unitId) + vectorAluInstAvail->at(unitId) = false; +} + +void +ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId) +{ + if (curWave->instructionBuffer.empty()) + return; + + // track which vector SIMD unit has at least one WV with a vector + // ALU as the oldest instruction in its Instruction buffer + vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) || + curWave->isOldestInstALU(); + + // track how many vector SIMD units have at least one WV with a + // vector Global memory instruction as the oldest instruction + // in its Instruction buffer + if ((curWave->isOldestInstGMem() || 
curWave->isOldestInstPrivMem() || + curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId && + *glbMemInstAvail <= 1) { + (*glbMemInstAvail)++; + lastGlbMemSimd = unitId; + } + + // track how many vector SIMD units have at least one WV with a + // vector shared memory (LDS) instruction as the oldest instruction + // in its Instruction buffer + // TODO: parametrize the limit of the LDS units + if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) && + lastShrMemSimd != unitId) { + (*shrMemInstAvail)++; + lastShrMemSimd = unitId; + } +} + +void +ScoreboardCheckStage::exec() +{ + initStatistics(); + + // reset the ready list for all execution units; it will be + // constructed every cycle since resource availability may change + for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + readyList[unitId]->clear(); + } + + // iterate over the Wavefronts of all SIMD units + for (int unitId = 0; unitId < numSIMDs; ++unitId) { + for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) { + // reset the ready status of each wavefront + waveStatusList[unitId]->at(wvId).second = BLOCKED; + Wavefront *curWave = waveStatusList[unitId]->at(wvId).first; + collectStatistics(curWave, unitId); + + if (curWave->ready(Wavefront::I_ALU)) { + readyList[unitId]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_GLOBAL)) { + if (computeUnit->cedeSIMD(unitId, wvId)) { + continue; + } + + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_SHARED)) { + readyList[computeUnit->ShrMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_FLAT)) { + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_PRIVATE)) { + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } + } + } +} + +void +ScoreboardCheckStage::regStats() +{ +} diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh new file mode 100644 index 000000000..099597afb --- /dev/null +++ b/src/gpu-compute/scoreboard_check_stage.hh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCOREBOARD_CHECK_STAGE_HH__ +#define __SCOREBOARD_CHECK_STAGE_HH__ + +#include <cstdint> +#include <string> +#include <utility> +#include <vector> + +class ComputeUnit; +class Wavefront; + +struct ComputeUnitParams; + +enum WAVE_STATUS +{ + BLOCKED = 0, + READY +}; + +/* + * Scoreboard check stage. + * All wavefronts are analyzed to see if they are ready + * to be executed this cycle. Both structural and data + * hazards are considered while marking a wave "ready" + * for execution. After analysis, the ready waves are + * added to readyList. + */ +class ScoreboardCheckStage +{ + public: + ScoreboardCheckStage(const ComputeUnitParams* params); + ~ScoreboardCheckStage(); + void init(ComputeUnit *cu); + void exec(); + + // Stats related variables and methods + const std::string& name() const { return _name; } + void regStats(); + + private: + void collectStatistics(Wavefront *curWave, int unitId); + void initStatistics(); + ComputeUnit *computeUnit; + uint32_t numSIMDs; + uint32_t numMemUnits; + uint32_t numGlbMemPipes; + uint32_t numShrMemPipes; + + // flag per vector SIMD unit that is set when there is at least one + // WF that has a vector ALU instruction as the oldest in its + // Instruction Buffer + std::vector<bool> *vectorAluInstAvail; + int lastGlbMemSimd; + int lastShrMemSimd; + + int *glbMemInstAvail; + int *shrMemInstAvail; + // List of waves which are ready to be scheduled. + // Each execution resource has a ready list + std::vector<std::vector<Wavefront*>*> readyList; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList + std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*> + waveStatusList; + + std::string _name; +}; + +#endif // __SCOREBOARD_CHECK_STAGE_HH__ diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc new file mode 100644 index 000000000..e8d7946ff --- /dev/null +++ b/src/gpu-compute/shader.cc @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/shader.hh" + +#include <limits> + +#include "arch/x86/linux/linux.hh" +#include "base/chunk_generator.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUMem.hh" +#include "debug/HSAIL.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/qstruct.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/packet.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "sim/sim_exit.hh" + +Shader::Shader(const Params *p) : SimObject(p), + clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr), + cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing), + hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync), + separate_acquire_release(p->separate_acquire_release), coissue_return(1), + trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), + globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0), + box_tick_cnt(0), start_tick_cnt(0) +{ + + cuList.resize(n_cu); + + for (int i = 0; i < n_cu; ++i) { + cuList[i] = p->CUs[i]; + assert(i == cuList[i]->cu_id); + cuList[i]->shader = this; + } +} + +Addr +Shader::mmap(int length) +{ + + Addr start; + + // round up length to the next page + length = roundUp(length, TheISA::PageBytes); + + if (X86Linux64::mmapGrowsDown()) { + DPRINTF(HSAIL, "GROWS DOWN"); + start = gpuTc->getProcessPtr()->mmap_end -length; + gpuTc->getProcessPtr()->mmap_end = start; + } else { + DPRINTF(HSAIL, "GROWS UP"); + start = gpuTc->getProcessPtr()->mmap_end; + gpuTc->getProcessPtr()->mmap_end += length; + + // assertion to make sure we don't overwrite the stack (it grows down) + assert(gpuTc->getProcessPtr()->mmap_end < + gpuTc->getProcessPtr()->stack_base - + gpuTc->getProcessPtr()->max_stack_size); + + } + + DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length); + + gpuTc->getProcessPtr()->allocateMem(start,length); + + return start; +} + +void +Shader::init() +{ + // grab the threadContext of the thread running on the CPU + assert(cpuPointer); + gpuTc = cpuPointer->getContext(0); + assert(gpuTc); +} + +Shader::~Shader() +{ + for (int j = 0; j < n_cu; ++j) + delete cuList[j]; +} + +void +Shader::updateThreadContext(int tid) { + // thread context of the thread which dispatched work + assert(cpuPointer); + gpuTc = cpuPointer->getContext(tid); + assert(gpuTc); +} + +void +Shader::hostWakeUp(BaseCPU *cpu) { + if (cpuPointer == cpu) { + if (gpuTc->status() == ThreadContext::Suspended) + cpu->activateContext(gpuTc->threadId()); + } else { + //Make sure both dispatcher and shader are trying to + //wakeup same host. 
Hack here to enable kernel launch + //from multiple CPUs + panic("Dispatcher wants to wakeup a different host"); + } +} + +Shader* +ShaderParams::create() +{ + return new Shader(this); +} + +void +Shader::exec() +{ + tick_cnt = curTick(); + box_tick_cnt = curTick() - start_tick_cnt; + + // apply any scheduled adds + for (int i = 0; i < sa_n; ++i) { + if (sa_when[i] <= tick_cnt) { + *sa_val[i] += sa_x[i]; + sa_val.erase(sa_val.begin() + i); + sa_x.erase(sa_x.begin() + i); + sa_when.erase(sa_when.begin() + i); + --sa_n; + --i; + } + } + + // clock all of the cu's + for (int i = 0; i < n_cu; ++i) + cuList[i]->exec(); +} + +bool +Shader::dispatch_workgroups(NDRange *ndr) +{ + bool scheduledSomething = false; + int cuCount = 0; + int curCu = nextSchedCu; + + while (cuCount < n_cu) { + //Every time we try a CU, update nextSchedCu + nextSchedCu = (nextSchedCu + 1) % n_cu; + + // dispatch workgroup iff the following two conditions are met: + // (a) wg_rem is true - there are unassigned workgroups in the grid + // (b) there are enough free slots in cu cuList[i] for this wg + if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) { + scheduledSomething = true; + DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu); + + // ticks() member function translates cycles to simulation ticks. + if (!tickEvent.scheduled()) { + schedule(tickEvent, curTick() + this->ticks(1)); + } + + cuList[curCu]->StartWorkgroup(ndr); + ndr->wgId[0]++; + ndr->globalWgId++; + if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) { + ndr->wgId[0] = 0; + ndr->wgId[1]++; + + if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) { + ndr->wgId[1] = 0; + ndr->wgId[2]++; + + if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) { + ndr->wg_disp_rem = false; + break; + } + } + } + } + + ++cuCount; + curCu = nextSchedCu; + } + + return scheduledSomething; +} + +void +Shader::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; +} + +void +Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id) +{ + unsigned block_size = RubySystem::getBlockSizeBytes(); + unsigned size = req->getSize(); + + Addr tmp_addr; + BaseTLB::Mode trans_mode; + + if (cmd == MemCmd::ReadReq) { + trans_mode = BaseTLB::Read; + } else if (cmd == MemCmd::WriteReq) { + trans_mode = BaseTLB::Write; + } else { + fatal("unexcepted MemCmd\n"); + } + + tmp_addr = req->getVaddr(); + Addr split_addr = roundDown(tmp_addr + size - 1, block_size); + + assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size); + + // Misaligned access + if (split_addr > tmp_addr) { + RequestPtr req1, req2; + req->splitOnVaddr(split_addr, req1, req2); + + + PacketPtr pkt1 = new Packet(req2, cmd); + PacketPtr pkt2 = new Packet(req1, cmd); + + functionalTLBAccess(pkt1, cu_id, trans_mode); + functionalTLBAccess(pkt2, cu_id, trans_mode); + + PacketPtr new_pkt1 = new Packet(pkt1->req, cmd); + PacketPtr new_pkt2 = new Packet(pkt2->req, cmd); + + new_pkt1->dataStatic(data); + new_pkt2->dataStatic((uint8_t*)data + req1->getSize()); + + if (suppress_func_errors) { + new_pkt1->setSuppressFuncError(); + new_pkt2->setSuppressFuncError(); + } + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + cuList[0]->memPort[0]->sendFunctional(new_pkt1); + cuList[0]->memPort[0]->sendFunctional(new_pkt2); + + delete new_pkt1; + delete new_pkt2; + delete pkt1; + delete pkt2; + } else { + PacketPtr pkt = new Packet(req, cmd); + functionalTLBAccess(pkt, cu_id, 
trans_mode); + PacketPtr new_pkt = new Packet(pkt->req, cmd); + new_pkt->dataStatic(data); + + if (suppress_func_errors) { + new_pkt->setSuppressFuncError(); + }; + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + cuList[0]->memPort[0]->sendFunctional(new_pkt); + + delete new_pkt; + delete pkt; + } +} + +bool +Shader::busy() +{ + for (int i_cu = 0; i_cu < n_cu; ++i_cu) { + if (!cuList[i_cu]->isDone()) { + return true; + } + } + + return false; +} + +void +Shader::ScheduleAdd(uint32_t *val,Tick when,int x) +{ + sa_val.push_back(val); + sa_when.push_back(tick_cnt + when); + sa_x.push_back(x); + ++sa_n; +} + +Shader::TickEvent::TickEvent(Shader *_shader) + : Event(CPU_Tick_Pri), shader(_shader) +{ +} + + +void +Shader::TickEvent::process() +{ + if (shader->busy()) { + shader->exec(); + shader->schedule(this, curTick() + shader->ticks(1)); + } +} + +const char* +Shader::TickEvent::description() const +{ + return "Shader tick"; +} + +void +Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors) +{ + uint8_t *data_buf = (uint8_t*)ptr; + + for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes()); + !gen.done(); gen.next()) { + Request *req = new Request(0, gen.addr(), gen.size(), 0, + cuList[0]->masterId(), 0, 0, 0); + + doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id); + data_buf += gen.size(); + delete req; + } +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false); +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors); +} + +void +Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false); +} + +void +Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, + suppress_func_errors); +} + +/* + * Send a packet through the appropriate TLB functional port. + * If cu_id=n_cu, then this is the dispatcher's TLB. + * Otherwise it's the TLB of the cu_id compute unit. + */ +void +Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) +{ + // update senderState. Need to know the gpuTc and the TLB mode + pkt->senderState = + new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); + + if (cu_id == n_cu) { + dispatcher->tlbPort->sendFunctional(pkt); + } else { + // even when the perLaneTLB flag is turned on + // it's ok tp send all accesses through lane 0 + // since the lane # is not known here, + // This isn't important since these are functional accesses. + cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); + } + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete pkt->senderState; +} diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh new file mode 100644 index 000000000..91ea8aae0 --- /dev/null +++ b/src/gpu-compute/shader.hh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
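As a worked example of the wgId bookkeeping in Shader::dispatch_workgroups() above (grid and workgroup sizes are illustrative):

// q.gdSize = {128, 64, 1}, q.wgSize = {64, 32, 1}  ->  2 x 2 x 1 workgroups.
// Workgroups are handed out x-fastest, then y, then z:
//   (0,0,0) -> (1,0,0) -> (0,1,0) -> (1,1,0)
// After the last one, wgId[2] * wgSize[2] >= gdSize[2] holds, wg_disp_rem is
// cleared, and no further workgroups are dispatched for this NDRange.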
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __SHADER_HH__ +#define __SHADER_HH__ + +#include <functional> +#include <string> + +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "base/types.hh" +#include "cpu/simple/atomic.hh" +#include "cpu/simple/timing.hh" +#include "cpu/simple_thread.hh" +#include "cpu/thread_context.hh" +#include "cpu/thread_state.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/page_table.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/Shader.hh" +#include "sim/faults.hh" +#include "sim/process.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class GpuDispatcher; + +namespace TheISA +{ + class GpuTLB; +} + +static const int LDS_SIZE = 65536; + +// Class Shader: This describes a single shader instance. Most +// configurations will only have a single shader. + +class Shader : public SimObject +{ + protected: + // Shader's clock period in terms of number of ticks of curTime, + // aka global simulation clock + Tick clock; + + public: + typedef ShaderParams Params; + enum hsail_mode_e {SIMT,VECTOR_SCALAR}; + + // clock related functions ; maps to-and-from + // Simulation ticks and shader clocks. 
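The clock member above holds the shader's period in simulation ticks, so the helpers that follow are plain integer arithmetic: ticks(n) multiplies a cycle count by the period, tickToCycles() and curCycle() divide a tick value by it, and frequency() divides the global tick rate by the period. A minimal standalone sketch of the same conversions, using assumed example values (gem5's usual 1 THz tick resolution and a hypothetical 1 GHz shader clock, i.e. a 1000-tick period):

    #include <cassert>
    #include <cstdint>

    using Tick = uint64_t;

    // Illustrative values only: 10^12 ticks per second, 1 GHz shader clock.
    static const Tick simFrequency = 1000000000000ULL; // global ticks per second
    static const Tick clockPeriod  = 1000;             // ticks per shader cycle

    Tick ticks(int numCycles) { return clockPeriod * numCycles; }    // cycles -> ticks
    Tick tickToCycles(Tick t) { return t / clockPeriod; }            // ticks  -> cycles
    Tick frequency()          { return simFrequency / clockPeriod; } // cycles per second

    int main()
    {
        assert(ticks(4) == 4000);          // schedule 4 shader cycles ahead
        assert(tickToCycles(12345) == 12); // integer division truncates
        assert(frequency() == 1000000000); // 1 GHz
        return 0;
    }

This is why dispatch_workgroups() and TickEvent::process() above schedule the next tick at curTick() + ticks(1): one shader cycle later, expressed in global simulation ticks.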
+ Tick frequency() const { return SimClock::Frequency / clock; } + + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + + Tick getClock() const { return clock; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + + SimpleThread *cpuThread; + ThreadContext *gpuTc; + BaseCPU *cpuPointer; + + class TickEvent : public Event + { + private: + Shader *shader; + + public: + TickEvent(Shader*); + void process(); + const char* description() const; + }; + + TickEvent tickEvent; + + // is this simulation going to be timing mode in the memory? + bool timingSim; + hsail_mode_e hsail_mode; + + // If set, issue acq packet @ kernel launch + int impl_kern_boundary_sync; + // If set, generate a separate packet for acquire/release on + // ld_acquire/st_release/atomic operations + int separate_acquire_release; + // If set, fetch returns may be coissued with instructions + int coissue_return; + // If set, always dump all 64 gprs to trace + int trace_vgpr_all; + // Number of cu units in the shader + int n_cu; + // Number of wavefront slots per cu + int n_wf; + // The size of global memory + int globalMemSize; + + /* + * Bytes/work-item for call instruction + * The number of arguments for an hsail function will + * vary. We simply determine the maximum # of arguments + * required by any hsail function up front before the + * simulation (during parsing of the Brig) and record + * that number here. + */ + int funcargs_size; + + // Tracks CU that rr dispatcher should attempt scheduling + int nextSchedCu; + + // Size of scheduled add queue + uint32_t sa_n; + + // Pointer to value to be increments + std::vector<uint32_t*> sa_val; + // When to do the increment + std::vector<uint64_t> sa_when; + // Amount to increment by + std::vector<int32_t> sa_x; + + // List of Compute Units (CU's) + std::vector<ComputeUnit*> cuList; + + uint64_t tick_cnt; + uint64_t box_tick_cnt; + uint64_t start_tick_cnt; + + GpuDispatcher *dispatcher; + + Shader(const Params *p); + ~Shader(); + virtual void init(); + + // Run shader + void exec(); + + // Check to see if shader is busy + bool busy(); + + // Schedule a 32-bit value to be incremented some time in the future + void ScheduleAdd(uint32_t *val, Tick when, int x); + bool processTimingPacket(PacketPtr pkt); + + void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); + + void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id); + + void + registerCU(int cu_id, ComputeUnit *compute_unit) + { + cuList[cu_id] = compute_unit; + } + + void handshake(GpuDispatcher *dispatcher); + bool dispatch_workgroups(NDRange *ndr); + Addr mmap(int length); + void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); + void updateThreadContext(int tid); + void hostWakeUp(BaseCPU *cpu); +}; + +#endif // __SHADER_HH__ diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc new file mode 100644 index 000000000..0e35ab9cc --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.cc @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Advanced Micro 
Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/simple_pool_manager.hh" + +#include "base/misc.hh" + +// return the min number of elements that the manager can reserve given +// a request for "size" elements +uint32_t +SimplePoolManager::minAllocatedElements(uint32_t size) +{ + fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n", + size); + + return size % minAllocation() > 0 ? 
+ (minAllocation() - (size % minAllocation())) + size : size; +} + +std::string +SimplePoolManager::printRegion() +{ + std::string _cout; + if (_reservedGroups == 0) + _cout = "VRF is empty\n"; + else if (_reservedGroups > 0) { + uint32_t reservedEntries = _reservedGroups * _regionSize; + _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n"; + } + + return _cout; +} + +bool +SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size) +{ + assert(numRegions * minAllocatedElements(size) <= poolSize()); + + return _reservedGroups == 0; +} + +void +SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx) +{ + assert(_reservedGroups > 0); + --_reservedGroups; + + if (!_reservedGroups) + _nxtFreeIdx = 0; +} + +uint32_t +SimplePoolManager::allocateRegion(const uint32_t size, + uint32_t *reservedPoolSize) +{ + uint32_t actualSize = minAllocatedElements(size); + uint32_t startIdx = _nxtFreeIdx; + _nxtFreeIdx += actualSize; + _regionSize = actualSize; + assert(_nxtFreeIdx < poolSize()); + *reservedPoolSize = actualSize; + ++_reservedGroups; + + return startIdx; +} + +uint32_t +SimplePoolManager::regionSize(std::pair<uint32_t, uint32_t> ®ion) +{ + bool wrapAround = (region.first > region.second); + if (!wrapAround) { + return region.second - region.first + 1; + } else { + return region.second + poolSize() - region.first + 1; + } +} diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh new file mode 100644 index 000000000..1d4174da8 --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.hh @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __SIMPLE_POOL_MANAGER_HH__ +#define __SIMPLE_POOL_MANAGER_HH__ + +#include <cassert> +#include <cstdint> + +#include "gpu-compute/pool_manager.hh" + +// Simple Pool Manager: allows one region per pool. No region merging is +// supported. 
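The .cc above captures the manager's whole policy: minAllocatedElements() rounds a request up to the next multiple of minAllocation(), allocateRegion() hands out a single contiguous region starting at _nxtFreeIdx, and canAllocate() refuses further requests while any region is outstanding (_reservedGroups != 0). A minimal sketch of that rounding and bookkeeping, with made-up parameters (minAllocation = 4, a 256-register pool):

    #include <cassert>
    #include <cstdint>

    // Same arithmetic as SimplePoolManager::minAllocatedElements():
    // round 'size' up to a multiple of 'minAlloc'.
    uint32_t roundUp(uint32_t size, uint32_t minAlloc)
    {
        return size % minAlloc ? (minAlloc - size % minAlloc) + size : size;
    }

    int main()
    {
        const uint32_t minAlloc = 4;    // assumed allocation granularity
        const uint32_t poolSize = 256;  // assumed VGPRs per SIMD

        uint32_t request = 13;                        // wavefront asks for 13 VGPRs
        uint32_t actual  = roundUp(request, minAlloc);
        assert(actual == 16);                         // rounded up to 16

        // Single-region policy: the first allocation starts at index 0 and
        // advances the free pointer; another wavefront cannot allocate
        // until freeRegion() drops the reserved-group count back to zero.
        uint32_t nxtFreeIdx = 0;
        uint32_t startIdx = nxtFreeIdx;
        nxtFreeIdx += actual;
        assert(startIdx == 0 && nxtFreeIdx <= poolSize);
        return 0;
    }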
+class SimplePoolManager : public PoolManager +{ + public: + SimplePoolManager(uint32_t minAlloc, uint32_t poolSize) + : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0), + _reservedGroups(0) + { + } + + uint32_t minAllocatedElements(uint32_t size); + std::string printRegion(); + bool canAllocate(uint32_t numRegions, uint32_t size); + uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize); + void freeRegion(uint32_t firstIdx, uint32_t lastIdx); + uint32_t regionSize(std::pair<uint32_t,uint32_t> ®ion); + + private: + // actual size of a region (normalized to the minimum size that can + // be reserved) + uint32_t _regionSize; + // next index to allocate a region + uint8_t _nxtFreeIdx; + // number of groups that reserve a region + uint32_t _reservedGroups; +}; + +#endif // __SIMPLE_POOL_MANAGER_HH__ diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc new file mode 100644 index 000000000..835d7b740 --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.cc @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +#include "gpu-compute/tlb_coalescer.hh" + +#include <cstring> + +#include "debug/GPUTLB.hh" + +TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p), + clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle), + coalescingWindow(p->coalescingWindow), + disableCoalescing(p->disableCoalescing), probeTLBEvent(this), + cleanupEvent(this) +{ + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } +} + +BaseSlavePort& +TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx) +{ + if (if_name == "slave") { + if (idx >= static_cast<PortID>(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } +} + +BaseMasterPort& +TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "master") { + if (idx >= static_cast<PortID>(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } +} + +/* + * This method returns true if the <incoming_pkt> + * can be coalesced with <coalesced_pkt> and false otherwise. + * A given set of rules is checked. + * The rules can potentially be modified based on the TLB level. + */ +bool +TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) +{ + if (disableCoalescing) + return false; + + TheISA::GpuTLB::TranslationState *incoming_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState); + + TheISA::GpuTLB::TranslationState *coalesced_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState); + + // Rule 1: Coalesce requests only if they + // fall within the same virtual page + Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(), + TheISA::PageBytes); + + Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(), + TheISA::PageBytes); + + if (incoming_virt_page_addr != coalesced_virt_page_addr) + return false; + + //* Rule 2: Coalesce requests only if they + // share a TLB Mode, i.e. they are both read + // or write requests. + BaseTLB::Mode incoming_mode = incoming_state->tlbMode; + BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode; + + if (incoming_mode != coalesced_mode) + return false; + + // when we can coalesce a packet update the reqCnt + // that is the number of packets represented by + // this coalesced packet + if (!incoming_state->prefetch) + coalesced_state->reqCnt.back() += incoming_state->reqCnt.back(); + + return true; +} + +/* + * We need to update the physical addresses of all the translation requests + * that were coalesced into the one that just returned. + */ +void +TLBCoalescer::updatePhysAddresses(PacketPtr pkt) +{ + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + + DPRINTF(GPUTLB, "Update phys. addr. 
for %d coalesced reqs for page %#x\n", + issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry; + assert(tlb_entry); + Addr first_entry_vaddr = tlb_entry->vaddr; + Addr first_entry_paddr = tlb_entry->paddr; + int page_size = tlb_entry->size(); + bool uncacheable = tlb_entry->uncacheable; + int first_hit_level = sender_state->hitLevel; + bool valid = tlb_entry->valid; + + // Get the physical page address of the translated request + // Using the page_size specified in the TLBEntry allows us + // to support different page sizes. + Addr phys_page_paddr = pkt->req->getPaddr(); + phys_page_paddr &= ~(page_size - 1); + + for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { + PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>( + local_pkt->senderState); + + // we are sending the packet back, so pop the reqCnt associated + // with this level in the TLB hiearchy + if (!sender_state->prefetch) + sender_state->reqCnt.pop_back(); + + /* + * Only the first packet from this coalesced request has been + * translated. Grab the translated phys. page addr and update the + * physical addresses of the remaining packets with the appropriate + * page offsets. + */ + if (i) { + Addr paddr = phys_page_paddr; + paddr |= (local_pkt->req->getVaddr() & (page_size - 1)); + local_pkt->req->setPaddr(paddr); + + if (uncacheable) + local_pkt->req->setFlags(Request::UNCACHEABLE); + + // update senderState->tlbEntry, so we can insert + // the correct TLBEentry in the TLBs above. + sender_state->tlbEntry = + new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr, + valid); + + // update the hitLevel for all uncoalesced reqs + // so that each packet knows where it hit + // (used for statistics in the CUs) + sender_state->hitLevel = first_hit_level; + } + + SlavePort *return_port = sender_state->ports.back(); + sender_state->ports.pop_back(); + + // Translation is done - Convert to a response pkt if necessary and + // send the translation back + if (local_pkt->isRequest()) { + local_pkt->makeTimingResponse(); + } + + return_port->sendTimingResp(local_pkt); + } + + // schedule clean up for end of this cycle + // This is a maximum priority event and must be on + // the same cycle as GPUTLB cleanup event to prevent + // race conditions with an IssueProbeEvent caused by + // MemSidePort::recvReqRetry + cleanupQueue.push(virt_page_addr); + + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); +} + +// Receive translation requests, create a coalesced request, +// and send them to the TLB (TLBProbesPerCycle) +bool +TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) +{ + // first packet of a coalesced request + PacketPtr first_packet = nullptr; + // true if we are able to do coalescing + bool didCoalesce = false; + // number of coalesced reqs for a given window + int coalescedReq_cnt = 0; + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + // push back the port to remember the path back + sender_state->ports.push_back(this); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) { + // if reqCnt is empty then this packet does not represent + // multiple uncoalesced reqs(pkts) but just a single pkt. 
+ // If it does though then the reqCnt for each level in the + // hierarchy accumulates the total number of reqs this packet + // represents + int req_cnt = 1; + + if (!sender_state->reqCnt.empty()) + req_cnt = sender_state->reqCnt.back(); + + sender_state->reqCnt.push_back(req_cnt); + + // update statistics + coalescer->uncoalescedAccesses++; + req_cnt = sender_state->reqCnt.back(); + DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt); + coalescer->queuingCycles -= (curTick() * req_cnt); + coalescer->localqueuingCycles -= curTick(); + } + + // FIXME if you want to coalesce not based on the issueTime + // of the packets (i.e., from the compute unit's perspective) + // but based on when they reached this coalescer then + // remove the following if statement and use curTick() or + // coalescingWindow for the tick_index. + if (!sender_state->issueTime) + sender_state->issueTime = curTick(); + + // The tick index is used as a key to the coalescerFIFO hashmap. + // It is shared by all candidates that fall within the + // given coalescingWindow. + int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow; + + if (coalescer->coalescerFIFO.count(tick_index)) { + coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size(); + } + + // see if we can coalesce the incoming pkt with another + // coalesced request with the same tick_index + for (int i = 0; i < coalescedReq_cnt; ++i) { + first_packet = coalescer->coalescerFIFO[tick_index][i][0]; + + if (coalescer->canCoalesce(pkt, first_packet)) { + coalescer->coalescerFIFO[tick_index][i].push_back(pkt); + + DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n", + i, tick_index, + coalescer->coalescerFIFO[tick_index][i].size()); + + didCoalesce = true; + break; + } + } + + // if this is the first request for this tick_index + // or we did not manage to coalesce, update stats + // and make necessary allocations. + if (!coalescedReq_cnt || !didCoalesce) { + if (update_stats) + coalescer->coalescedAccesses++; + + std::vector<PacketPtr> new_array; + new_array.push_back(pkt); + coalescer->coalescerFIFO[tick_index].push_back(new_array); + + DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after " + "push\n", tick_index, + coalescer->coalescerFIFO[tick_index].size()); + } + + //schedule probeTLBEvent next cycle to send the + //coalesced requests to the TLB + if (!coalescer->probeTLBEvent.scheduled()) { + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); + } + + return true; +} + +void +TLBCoalescer::CpuSidePort::recvReqRetry() +{ + assert(false); +} + +void +TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) +{ + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) + coalescer->uncoalescedAccesses++; + + // If there is a pending timing request for this virtual address + // print a warning message. This is a temporary caveat of + // the current simulator where atomic and timing requests can + // coexist. FIXME remove this check/warning in the future. + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (map_count) { + DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing " + "req. 
pending\n", virt_page_addr); + } + + coalescer->memSidePort[0]->sendFunctional(pkt); +} + +AddrRangeList +TLBCoalescer::CpuSidePort::getAddrRanges() const +{ + // currently not checked by the master + AddrRangeList ranges; + + return ranges; +} + +bool +TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt) +{ + // a translation completed and returned + coalescer->updatePhysAddresses(pkt); + + return true; +} + +void +TLBCoalescer::MemSidePort::recvReqRetry() +{ + //we've receeived a retry. Schedule a probeTLBEvent + if (!coalescer->probeTLBEvent.scheduled()) + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); +} + +void +TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n"); +} + +TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer) + : Event(CPU_Tick_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::IssueProbeEvent::description() const +{ + return "Probe the TLB below"; +} + +/* + * Here we scan the coalescer FIFO and issue the max + * number of permitted probes to the TLB below. We + * permit bypassing of coalesced requests for the same + * tick_index. + * + * We do not access the next tick_index unless we've + * drained the previous one. The coalesced requests + * that are successfully sent are moved to the + * issuedTranslationsTable table (the table which keeps + * track of the outstanding reqs) + */ +void +TLBCoalescer::IssueProbeEvent::process() +{ + // number of TLB probes sent so far + int sent_probes = 0; + // rejected denotes a blocking event + bool rejected = false; + + // It is set to true either when the recvTiming of the TLB below + // returns false or when there is another outstanding request for the + // same virt. page. + + DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n"); + + for (auto iter = coalescer->coalescerFIFO.begin(); + iter != coalescer->coalescerFIFO.end() && !rejected; ) { + int coalescedReq_cnt = iter->second.size(); + int i = 0; + int vector_index = 0; + + DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n", + coalescedReq_cnt, iter->first); + + while (i < coalescedReq_cnt) { + ++i; + PacketPtr first_packet = iter->second[vector_index][0]; + + // compute virtual page address for this request + Addr virt_page_addr = roundDown(first_packet->req->getVaddr(), + TheISA::PageBytes); + + // is there another outstanding request for the same page addr? + int pending_reqs = + coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (pending_reqs) { + DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for " + "page %#x\n", virt_page_addr); + + ++vector_index; + rejected = true; + + continue; + } + + // send the coalesced request for virt_page_addr + if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) { + DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", + virt_page_addr); + + // No need for a retries queue since we are already buffering + // the coalesced request in coalescerFIFO. 
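                // The failed packet simply stays at the head of its
                // coalescerFIFO entry; setting 'rejected' below keeps the
                // outer loop from moving past this coalescing window, and
                // the send is retried when the TLB below calls
                // MemSidePort::recvReqRetry() (above), which reschedules
                // probeTLBEvent one cycle later.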
+ rejected = true; + ++vector_index; + } else { + TheISA::GpuTLB::TranslationState *tmp_sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*> + (first_packet->senderState); + + bool update_stats = !tmp_sender_state->prefetch; + + if (update_stats) { + // req_cnt is total number of packets represented + // by the one we just sent counting all the way from + // the top of TLB hiearchy (i.e., from the CU) + int req_cnt = tmp_sender_state->reqCnt.back(); + coalescer->queuingCycles += (curTick() * req_cnt); + + DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n", + coalescer->name(), req_cnt); + + // pkt_cnt is number of packets we coalesced into the one + // we just sent but only at this coalescer level + int pkt_cnt = iter->second[vector_index].size(); + coalescer->localqueuingCycles += (curTick() * pkt_cnt); + } + + DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x", + virt_page_addr); + + //copy coalescedReq to issuedTranslationsTable + coalescer->issuedTranslationsTable[virt_page_addr] + = iter->second[vector_index]; + + //erase the entry of this coalesced req + iter->second.erase(iter->second.begin() + vector_index); + + if (iter->second.empty()) + assert(i == coalescedReq_cnt); + + sent_probes++; + if (sent_probes == coalescer->TLBProbesPerCycle) + return; + } + } + + //if there are no more coalesced reqs for this tick_index + //erase the hash_map with the first iterator + if (iter->second.empty()) { + coalescer->coalescerFIFO.erase(iter++); + } else { + ++iter; + } + } +} + +TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer) + : Event(Maximum_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::CleanupEvent::description() const +{ + return "Cleanup issuedTranslationsTable hashmap"; +} + +void +TLBCoalescer::CleanupEvent::process() +{ + while (!coalescer->cleanupQueue.empty()) { + Addr cleanup_addr = coalescer->cleanupQueue.front(); + coalescer->cleanupQueue.pop(); + coalescer->issuedTranslationsTable.erase(cleanup_addr); + + DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n", + cleanup_addr); + } +} + +void +TLBCoalescer::regStats() +{ + uncoalescedAccesses + .name(name() + ".uncoalesced_accesses") + .desc("Number of uncoalesced TLB accesses") + ; + + coalescedAccesses + .name(name() + ".coalesced_accesses") + .desc("Number of coalesced TLB accesses") + ; + + queuingCycles + .name(name() + ".queuing_cycles") + .desc("Number of cycles spent in queue") + ; + + localqueuingCycles + .name(name() + ".local_queuing_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. latency over all incoming pkts") + ; + + localLatency = localqueuingCycles / uncoalescedAccesses; +} + + +TLBCoalescer* +TLBCoalescerParams::create() +{ + return new TLBCoalescer(this); +} + diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh new file mode 100644 index 000000000..09210148b --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.hh @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __TLB_COALESCER_HH__ +#define __TLB_COALESCER_HH__ + +#include <list> +#include <queue> +#include <string> +#include <vector> + +#include "arch/generic/tlb.hh" +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/regs/segment.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/TLBCoalescer.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +/** + * The TLBCoalescer is a MemObject sitting on the front side (CPUSide) of + * each TLB. It receives packets and issues coalesced requests to the + * TLB below it. It controls how requests are coalesced (the rules) + * and the permitted number of TLB probes per cycle (i.e., how many + * coalesced requests it feeds the TLB per cycle). + */ +class TLBCoalescer : public MemObject +{ + protected: + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the + // python config files. + int clock; + + public: + typedef TLBCoalescerParams Params; + TLBCoalescer(const Params *p); + ~TLBCoalescer() { } + + // Number of TLB probes per cycle. Parameterizable - default 2. + int TLBProbesPerCycle; + + // Consider coalescing across that many ticks. + // Paraemterizable - default 1. + int coalescingWindow; + + // Each coalesced request consists of multiple packets + // that all fall within the same virtual page + typedef std::vector<PacketPtr> coalescedReq; + + // disables coalescing when true + bool disableCoalescing; + + /* + * This is a hash map with <tick_index> as a key. + * It contains a vector of coalescedReqs per <tick_index>. + * Requests are buffered here until they can be issued to + * the TLB, at which point they are copied to the + * issuedTranslationsTable hash map. + * + * In terms of coalescing, we coalesce requests in a given + * window of x cycles by using tick_index = issueTime/x as a + * key, where x = coalescingWindow. 
issueTime is the issueTime + * of the pkt from the ComputeUnit's perspective, but another + * option is to change it to curTick(), so we coalesce based + * on the receive time. + */ + typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO; + + CoalescingFIFO coalescerFIFO; + + /* + * issuedTranslationsTabler: a hash_map indexed by virtual page + * address. Each hash_map entry has a vector of PacketPtr associated + * with it denoting the different packets that share an outstanding + * coalesced translation request for the same virtual page. + * + * The rules that determine which requests we can coalesce are + * specified in the canCoalesce() method. + */ + typedef std::unordered_map<Addr, coalescedReq> CoalescingTable; + + CoalescingTable issuedTranslationsTable; + + // number of packets the coalescer receives + Stats::Scalar uncoalescedAccesses; + // number packets the coalescer send to the TLB + Stats::Scalar coalescedAccesses; + + // Number of cycles the coalesced requests spend waiting in + // coalescerFIFO. For each packet the coalescer receives we take into + // account the number of all uncoalesced requests this pkt "represents" + Stats::Scalar queuingCycles; + + // On average how much time a request from the + // uncoalescedAccesses that reaches the TLB + // spends waiting? + Stats::Scalar localqueuingCycles; + // localqueuingCycles/uncoalescedAccesses + Stats::Formula localLatency; + + bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2); + void updatePhysAddresses(PacketPtr pkt); + void regStats(); + + // Clock related functions. Maps to-and-from + // Simulation ticks and object clocks. + Tick frequency() const { return SimClock::Frequency / clock; } + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : SlavePort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() is not implemented in the TLB coalescer.\n"); + } + + virtual AddrRangeList getAddrRanges() const; + }; + + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : MasterPort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + std::deque<PacketPtr> retries; + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() not implemented in TLB coalescer"); + } + }; + + // Coalescer slave ports on the cpu Side + std::vector<CpuSidePort*> cpuSidePort; + // Coalescer master ports on the memory side + std::vector<MemSidePort*> memSidePort; + + BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx); + BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx); + + class IssueProbeEvent : public 
Event + { + private: + TLBCoalescer *coalescer; + + public: + IssueProbeEvent(TLBCoalescer *_coalescer); + void process(); + const char *description() const; + }; + + // this event issues the TLB probes + IssueProbeEvent probeTLBEvent; + + // the cleanupEvent is scheduled after a TLBEvent triggers + // in order to free memory and do the required clean-up + class CleanupEvent : public Event + { + private: + TLBCoalescer *coalescer; + + public: + CleanupEvent(TLBCoalescer *_coalescer); + void process(); + const char* description() const; + }; + + // schedule cleanup + CleanupEvent cleanupEvent; + + // this FIFO queue keeps track of the virt. page + // addresses that are pending cleanup + std::queue<Addr> cleanupQueue; +}; + +#endif // __TLB_COALESCER_HH__ diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc new file mode 100644 index 000000000..8b7dc0691 --- /dev/null +++ b/src/gpu-compute/vector_register_file.cc @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#include "gpu-compute/vector_register_file.hh" + +#include <string> + +#include "base/misc.hh" +#include "gpu-compute/code_enums.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/wavefront.hh" +#include "params/VectorRegisterFile.hh" + +VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p) + : SimObject(p), + manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)), + simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd), + vgprState(new VecRegisterState()) +{ + fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n"); + fatal_if(simdId < 0, "Illegal SIMD id for VRF"); + + fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not " + "multiple of VRF size\n"); + + busy.clear(); + busy.resize(numRegsPerSimd, 0); + nxtBusy.clear(); + nxtBusy.resize(numRegsPerSimd, 0); + + vgprState->init(numRegsPerSimd); +} + +void +VectorRegisterFile::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + vgprState->setParent(computeUnit); +} + +uint8_t +VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const +{ + uint8_t status = nxtBusy.at(idx); + + if (operandSize > 4) { + status = status | (nxtBusy.at((idx + 1) % numRegs())); + } + + return status; +} + +uint8_t +VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const +{ + uint8_t status = busy.at(idx); + + if (operandSize > 4) { + status = status | (busy.at((idx + 1) % numRegs())); + } + + return status; +} + +void +VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + nxtBusy.at(regIdx) = value; + + if (operandSize > 4) { + nxtBusy.at((regIdx + 1) % numRegs()) = value; + } +} + +void +VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + busy.at(regIdx) = value; + + if (operandSize > 4) { + busy.at((regIdx + 1) % numRegs()) = value; + } +} + +bool +VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const +{ + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i)) { + uint32_t vgprIdx = ii->getRegisterIndex(i); + uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1); + + if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + + if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + } + } + + return true; +} + +void +VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) +{ + bool loadInstr = IS_OT_READ(ii->opType()); + bool atomicInstr = IS_OT_ATOMIC(ii->opType()); + + bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); + + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + + // mark the destination vector register as busy + markReg(physReg, ii->getOperandSize(i), 1); + // clear the in-flight status of the destination vector register + preMarkReg(physReg, ii->getOperandSize(i), 0); + + // FIXME: if we ever model correct timing behavior + // for load argument 
instructions then we should not + // set the destination register as busy now but when + // the data returns. Loads and Atomics should free + // their destination registers when the data returns, + // not now + if (!atomicInstr && !loadNoArgInstr) { + uint32_t pipeLen = ii->getOperandSize(i) <= 4 ? + computeUnit->spBypassLength() : + computeUnit->dpBypassLength(); + + // schedule an event for marking the register as ready + computeUnit->registerEvent(w->simdId, physReg, + ii->getOperandSize(i), + computeUnit->shader->tick_cnt + + computeUnit->shader->ticks(pipeLen), + 0); + } + } + } +} + +int +VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w, + std::vector<uint32_t> ®Vec, uint32_t operandSize, + uint64_t timestamp) +{ + int delay = 0; + + panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n", + regVec.size()); + + for (int i = 0; i < regVec.size(); ++i) { + // mark the destination VGPR as free when the timestamp expires + computeUnit->registerEvent(w->simdId, regVec[i], operandSize, + computeUnit->shader->tick_cnt + timestamp + + computeUnit->shader->ticks(delay), 0); + } + + return delay; +} + +void +VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii) +{ + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + // set the in-flight status of the destination vector register + preMarkReg(physReg, ii->getOperandSize(i), 1); + } + } +} + +bool +VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return ready; +} + +bool +VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return ready; +} + +VectorRegisterFile* +VectorRegisterFileParams::create() +{ + return new VectorRegisterFile(this); +} diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh new file mode 100644 index 000000000..1cb011a1e --- /dev/null +++ b/src/gpu-compute/vector_register_file.hh @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_FILE_HH__ +#define __VECTOR_REGISTER_FILE_HH__ + +#include <list> + +#include "base/statistics.hh" +#include "base/types.hh" +#include "gpu-compute/vector_register_state.hh" +#include "sim/sim_object.hh" + +class ComputeUnit; +class Shader; +class SimplePoolManager; +class Wavefront; + +struct VectorRegisterFileParams; + +enum class VrfAccessType : uint8_t +{ + READ = 0x01, + WRITE = 0x02, + RD_WR = READ | WRITE +}; + +// Vector Register File +class VectorRegisterFile : public SimObject +{ + public: + VectorRegisterFile(const VectorRegisterFileParams *p); + + void setParent(ComputeUnit *_computeUnit); + + // Read a register + template<typename T> + T + read(int regIdx, int threadId=0) + { + T p0 = vgprState->read<T>(regIdx, threadId); + + return p0; + } + + // Write a register + template<typename T> + void + write(int regIdx, T value, int threadId=0) + { + vgprState->write<T>(regIdx, value, threadId); + } + + uint8_t regBusy(int idx, uint32_t operandSize) const; + uint8_t regNxtBusy(int idx, uint32_t operandSize) const; + + int numRegs() const { return numRegsPerSimd; } + + void markReg(int regIdx, uint32_t operandSize, uint8_t value); + void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value); + + virtual void exec(GPUDynInstPtr ii, Wavefront *w); + + virtual int exec(uint64_t dynamic_id, Wavefront *w, + std::vector<uint32_t> ®Vec, uint32_t operandSize, + uint64_t timestamp); + + bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const; + virtual void updateEvents() { } + virtual void updateResources(Wavefront *w, GPUDynInstPtr ii); + + virtual bool + isReadConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool + isWriteConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType); + + virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType); + + SimplePoolManager *manager; + + protected: + ComputeUnit* computeUnit; + int simdId; + + // flag indicating if a register is busy + std::vector<uint8_t> busy; + // flag indicating if a register will be busy (by instructions + // in the SIMD pipeline) + std::vector<uint8_t> nxtBusy; + + // numer of registers (bank size) per simd unit (bank) + int numRegsPerSimd; + + // vector register state + VecRegisterState *vgprState; +}; + +#endif // __VECTOR_REGISTER_FILE_HH__ diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc new file mode 100644 index 000000000..f231b0579 --- /dev/null +++ b/src/gpu-compute/vector_register_state.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/vector_register_state.hh" + +#include "gpu-compute/compute_unit.hh" + +VecRegisterState::VecRegisterState() : computeUnit(nullptr) +{ + s_reg.clear(); + d_reg.clear(); +} + +void +VecRegisterState::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + _name = computeUnit->name() + ".VecRegState"; +} + +void +VecRegisterState::init(uint32_t _size) +{ + s_reg.resize(_size); + d_reg.resize(_size); +} diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh new file mode 100644 index 000000000..a233b9acc --- /dev/null +++ b/src/gpu-compute/vector_register_state.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_STATE_HH__ +#define __VECTOR_REGISTER_STATE_HH__ + +#include <array> +#include <cassert> +#include <string> +#include <vector> + +#include "gpu-compute/misc.hh" + +class ComputeUnit; + +// Vector Register State per SIMD unit (contents of the vector +// registers in the VRF of the SIMD) +class VecRegisterState +{ + public: + VecRegisterState(); + void init(uint32_t _size); + + const std::string& name() const { return _name; } + void setParent(ComputeUnit *_computeUnit); + void regStats() { } + + // Access methods + template<typename T> + T + read(int regIdx, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0 = (T*)(&d_reg[regIdx][threadId]); + } + + return *p0; + } + + template<typename T> + void + write(unsigned int regIdx, T value, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0 = (T*)(&d_reg[regIdx][threadId]); + } + + *p0 = value; + } + + // (Single Precision) Vector Register File size. + int regSize() { return s_reg.size(); } + + private: + ComputeUnit *computeUnit; + std::string _name; + // 32-bit Single Precision Vector Register State + std::vector<std::array<uint32_t, VSZ>> s_reg; + // 64-bit Double Precision Vector Register State + std::vector<std::array<uint64_t, VSZ>> d_reg; +}; + +#endif // __VECTOR_REGISTER_STATE_HH__ diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc new file mode 100644 index 000000000..0aa033db1 --- /dev/null +++ b/src/gpu-compute/wavefront.cc @@ -0,0 +1,925 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/wavefront.hh" + +#include "debug/GPUExec.hh" +#include "debug/WavefrontStack.hh" +#include "gpu-compute/code_enums.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" + +Wavefront* +WavefrontParams::create() +{ + return new Wavefront(this); +} + +Wavefront::Wavefront(const Params *p) + : SimObject(p), callArgMem(nullptr) +{ + last_trace = 0; + simdId = p->simdId; + wfSlotId = p->wf_slot_id; + + status = S_STOPPED; + reservedVectorRegs = 0; + startVgprIndex = 0; + outstanding_reqs = 0; + mem_reqs_in_pipe = 0; + outstanding_reqs_wr_gm = 0; + outstanding_reqs_wr_lm = 0; + outstanding_reqs_rd_gm = 0; + outstanding_reqs_rd_lm = 0; + rd_lm_reqs_in_pipe = 0; + rd_gm_reqs_in_pipe = 0; + wr_lm_reqs_in_pipe = 0; + wr_gm_reqs_in_pipe = 0; + + barrier_cnt = 0; + old_barrier_cnt = 0; + stalledAtBarrier = false; + + mem_trace_busy = 0; + old_vgpr_tcnt = 0xffffffffffffffffll; + old_dgpr_tcnt = 0xffffffffffffffffll; + + pendingFetch = false; + dropFetch = false; + condRegState = new ConditionRegisterState(); + maxSpVgprs = 0; + maxDpVgprs = 0; +} + +void +Wavefront::regStats() +{ + srcRegOpDist + .init(0, 4, 2) + .name(name() + ".src_reg_operand_dist") + .desc("number of executed instructions with N source register operands") + ; + + dstRegOpDist + .init(0, 3, 2) + .name(name() + ".dst_reg_operand_dist") + .desc("number of executed instructions with N destination register " + "operands") + ; + + // FIXME: the name of the WF needs to be unique + numTimesBlockedDueWAXDependencies + .name(name() + ".timesBlockedDueWAXDependencies") + .desc("number of times the wf's instructions are blocked due to WAW " + "or WAR dependencies") + ; + + // FIXME: the name of the WF needs to be unique + numTimesBlockedDueRAWDependencies + .name(name() + ".timesBlockedDueRAWDependencies") + .desc("number of times the wf's instructions are blocked due to RAW " + "dependencies") + ; + + // FIXME: the name of the WF needs to be unique + numTimesBlockedDueVrfPortAvail + .name(name() + ".timesBlockedDueVrfPortAvail") + .desc("number of times instructions are blocked due to VRF port " + "availability") + ; +} + +void +Wavefront::init() +{ + reservedVectorRegs = 0; + startVgprIndex = 0; +} + +void +Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) +{ + condRegState->init(num_cregs); + maxSpVgprs = num_sregs; + maxDpVgprs = num_dregs; +} + +Wavefront::~Wavefront() +{ + if (callArgMem) + delete callArgMem; +} + +void +Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr) +{ + wfDynId = _wfDynId; + base_ptr = _base_ptr; + status = S_RUNNING; +} + +bool +Wavefront::isGmInstruction(GPUDynInstPtr ii) +{ + if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || + IS_OT_ATOMIC_PM(ii->opType())) { + return true; + } + + if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || + 
IS_OT_ATOMIC_GM(ii->opType())) { + + return true; + } + + if (IS_OT_FLAT(ii->opType())) { + return true; + } + + return false; +} + +bool +Wavefront::isLmInstruction(GPUDynInstPtr ii) +{ + if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || + IS_OT_ATOMIC_LM(ii->opType())) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstALU() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP || + ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ)) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstBarrier() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstGMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || + IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstLMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstPrivMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || + IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstFlatMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { + + return true; + } + + return false; +} + +// Return true if the Wavefront's instruction +// buffer has branch instruction. +bool +Wavefront::instructionBufferHasBranch() +{ + for (auto it : instructionBuffer) { + GPUDynInstPtr ii = it; + + if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { + return true; + } + } + + return false; +} + +// Remap HSAIL register to physical VGPR. +// HSAIL register = virtual register assigned to an operand by HLC compiler +uint32_t +Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode) +{ + assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0)); + // add the offset from where the VGPRs of the wavefront have been assigned + uint32_t physicalVgprIndex = startVgprIndex + vgprIndex; + // HSAIL double precision (DP) register: calculate the physical VGPR index + // assuming that DP registers are placed after SP ones in the VRF. 
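A standalone sketch of the index arithmetic remap() computes here: a single-precision VGPR maps to startVgprIndex + vgprIndex, while in HSAIL mode a double-precision register (mode 1, size > 4) lands after the SP block at startVgprIndex + maxSpVgprs + 2 * vgprIndex, and the result wraps into the VRF. The concrete numbers in main() are invented purely to show the mapping:

#include <cstdint>
#include <iostream>

// Sketch of the virtual->physical VGPR mapping implemented by Wavefront::remap().
// SP registers occupy [startVgprIndex, startVgprIndex + maxSpVgprs); DP registers
// follow and take two SP slots each. numRegs wraps the index into the VRF.
uint32_t
remapSketch(uint32_t vgprIndex, uint32_t size, uint8_t mode,
            uint32_t startVgprIndex, uint32_t maxSpVgprs, uint32_t numRegs)
{
    uint32_t phys = startVgprIndex + vgprIndex;             // SP case
    if (mode == 1 && size > 4)                              // HSAIL DP name space
        phys = startVgprIndex + maxSpVgprs + 2 * vgprIndex; // DP regs after SP regs
    return phys % numRegs;                                  // wrap into the VRF
}

int main()
{
    // Hypothetical wavefront: 8 SP VGPRs reserved at base index 32 of a
    // 256-entry VRF (all numbers invented for illustration).
    const uint32_t base = 32, maxSp = 8, numRegs = 256;
    std::cout << remapSketch(2, 4, 0, base, maxSp, numRegs) << "\n"; // SP v2 -> 34
    std::cout << remapSketch(1, 8, 1, base, maxSp, numRegs) << "\n"; // DP d1 -> 42
    return 0;
}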
The DP + // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust + // the DP VGPR index before mapping it to the physical VRF address space + if (mode == 1 && size > 4) { + physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex); + } + + assert((startVgprIndex <= physicalVgprIndex) && + (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex); + + // calculate absolute physical VGPR index + return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs(); +} + +// Return true if this wavefront is ready +// to execute an instruction of the specified type. +int +Wavefront::ready(itype_e type) +{ + // Check to make sure wave is running + if (status == S_STOPPED || status == S_RETURNING || + instructionBuffer.empty()) { + return 0; + } + + // Is the wave waiting at a barrier + if (stalledAtBarrier) { + if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt, + computeUnit->getRefCounter(dispatchid, wg_id))) { + // Are all threads at barrier? + return 0; + } + old_barrier_cnt = barrier_cnt; + stalledAtBarrier = false; + } + + // Read instruction + GPUDynInstPtr ii = instructionBuffer.front(); + + bool ready_inst M5_VAR_USED = false; + bool glbMemBusRdy = false; + bool glbMemIssueRdy = false; + if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) { + for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { + if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) + glbMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + glbMemIssueRdy = true; + } + } + bool locMemBusRdy = false; + bool locMemIssueRdy = false; + if (type == I_SHARED) { + for (int j=0; j < computeUnit->numLocMemUnits; ++j) { + if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) + locMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + locMemIssueRdy = true; + } + } + + // The following code is very error prone and the entire process for + // checking readiness will be fixed eventually. In the meantime, let's + // make sure that we do not silently let an instruction type slip + // through this logic and always return not ready. + if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || + ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || + IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || + IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { + panic("next instruction: %s is of unknown type\n", ii->disassemble()); + } + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", + computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); + + if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { + // Here for ALU instruction (barrier) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests? + if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { + // Here for ALU instruction (nop) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? 
+ return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { + // Here for ALU instruction (return) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests? + if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG)) { + // Here for ALU instruction (all others) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is alu slot free? + return 0; + } + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || + IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + // Here Global memory instruction + if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { + // Are there in pipe or outstanding global memory write requests? + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || + IS_OT_HIST_GM(ii->opType())) { + // Are there in pipe or outstanding global memory read requests? + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) + return 0; + } + + if (!glbMemIssueRdy) { + // Is WV issue slot free? + return 0; + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + // Here for Shared memory instruction + if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { + if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_HIST_LM(ii->opType())) { + if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? + return 0; + } + if (!locMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? 
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || + IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { + // Here for Private memory instruction ------------------------ // + if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || + IS_OT_HIST_PM(ii->opType())) { + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!locMemIssueRdy) { + return 0; + } + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + // are all the operands ready? (RAW, WAW and WAR depedencies met?) + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else { + return 0; + } + + assert(ready_inst); + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, + simdId, wfSlotId, ii->disassemble()); + + return 1; +} + +void +Wavefront::updateResources() +{ + // Get current instruction + GPUDynInstPtr ii = instructionBuffer.front(); + assert(ii); + computeUnit->vrf[simdId]->updateResources(this, ii); + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as ALU operations + // since we don't send memory packets at execution. 
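Of the readiness branches above, the I_FLAT case is the strictest: because a flat access may resolve to either the global segment or the LDS, it must clear the bus, issue-slot, and request-FIFO checks for both memory paths before issuing. A condensed sketch of that conjunction, with all field names invented:

#include <iostream>

struct PathStatus {
    bool busFree;       // VRF->memory pipe bus available
    bool issueSlotFree; // wave issue slot available
    bool fifoHasRoom;   // request FIFO can take another entry
};

bool
pathReady(const PathStatus &p)
{
    return p.busFree && p.issueSlotFree && p.fifoHasRoom;
}

bool
flatReady(const PathStatus &globalPath, const PathStatus &localPath)
{
    // Flat ops gate on both paths; plain global or LDS ops gate on only one.
    return pathReady(globalPath) && pathReady(localPath);
}

int main()
{
    PathStatus glb{true, true, true};
    PathStatus lds{true, false, true}; // LDS issue slot busy
    std::cout << (flatReady(glb, lds) ? "ready" : "not ready") << "\n";
    return 0;
}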
If we fix that then + // we should map them to one of the memory pipelines + ii->opType()==Enums::OT_KERN_READ || + ii->opType()==Enums::OT_ARG || + ii->opType()==Enums::OT_RET) { + computeUnit->aluPipe[simdId].preset(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + if ( Enums::SC_SHARED == ii->executedAs() ) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + mem_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_PM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +void +Wavefront::exec() +{ + // ---- Exit if wavefront is inactive ----------------------------- // + + if (status == S_STOPPED || status == S_RETURNING || + instructionBuffer.empty()) { + return; + } + + // Get current instruction + + GPUDynInstPtr ii = instructionBuffer.front(); + + const uint32_t old_pc = pc(); + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " + "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, + ii->disassemble(), old_pc); + ii->execute(); + // access the VRF + computeUnit->vrf[simdId]->exec(ii, this); + srcRegOpDist.sample(ii->numSrcRegOperands()); + dstRegOpDist.sample(ii->numDstRegOperands()); + computeUnit->numInstrExecuted++; + computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - + computeUnit->lastExecCycle[simdId]); + computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); + if (pc() == old_pc) { + uint32_t new_pc = old_pc + 1; + // PC not modified by instruction, proceed to next or pop frame + pc(new_pc); + if (new_pc == rpc()) { + popFromReconvergenceStack(); + discardFetch(); + } else { + instructionBuffer.pop_front(); + } + } + + if (computeUnit->shader->hsail_mode==Shader::SIMT) { + const int num_active_lanes = execMask().count(); + computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); + computeUnit->numVecOpsExecuted += num_active_lanes; + if (isGmInstruction(ii)) { + computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); + } else if (isLmInstruction(ii)) { + computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes); + } + } + + // ---- Update Vector ALU pipeline and other resources ------------------ // + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as ALU operations + // since we don't send memory packets at execution. 
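The long if/else chains in updateResources() above (and mirrored just below in exec()) encode a small latency table: the VRF-to-memory bus is reserved for 4 ticks on reads and 8 ticks on writes and atomics, and FLAT operations are steered to the LDS or global bus according to executedAs(). A condensed sketch of that table; the enums and struct are invented for illustration:

#include <cstdint>
#include <iostream>

enum class MemKind { Read, Write, Atomic };
enum class Segment { Global, Shared };

struct BusReservation { Segment bus; uint32_t ticks; };

BusReservation
reserveBus(MemKind kind, Segment resolvedAs)
{
    // Reads occupy the bus for 4 ticks; writes and atomics for 8, matching
    // the shader->ticks(4) / shader->ticks(8) calls in the branches above.
    uint32_t t = (kind == MemKind::Read) ? 4 : 8;
    return {resolvedAs, t};
}

int main()
{
    BusReservation r = reserveBus(MemKind::Atomic, Segment::Shared);
    std::cout << "bus=" << (r.bus == Segment::Shared ? "LDS" : "global")
              << " ticks=" << r.ticks << "\n";
    return 0;
}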
If we fix that then + // we should map them to one of the memory pipelines + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + ii->opType() == Enums::OT_RET) { + computeUnit->aluPipe[simdId].set(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +bool +Wavefront::waitingAtBarrier(int lane) +{ + return bar_cnt[lane] < max_bar_cnt; +} + +void +Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, + const VectorMask& mask) +{ + assert(mask.count()); + reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask)); +} + +void +Wavefront::popFromReconvergenceStack() +{ + assert(!reconvergenceStack.empty()); + + DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, + execMask().to_string<char, std::string::traits_type, + std::string::allocator_type>().c_str(), pc()); + + reconvergenceStack.pop(); + + DPRINTF(WavefrontStack, "%3i %s\n", pc(), + execMask().to_string<char, std::string::traits_type, + std::string::allocator_type>().c_str()); + +} + +void +Wavefront::discardFetch() +{ + instructionBuffer.clear(); + dropFetch |=pendingFetch; +} + +uint32_t +Wavefront::pc() const +{ + return reconvergenceStack.top()->pc; +} + +uint32_t +Wavefront::rpc() const +{ + return reconvergenceStack.top()->rpc; +} + +VectorMask +Wavefront::execMask() const +{ + return reconvergenceStack.top()->execMask; +} + +bool +Wavefront::execMask(int lane) const +{ + return reconvergenceStack.top()->execMask[lane]; +} + + +void +Wavefront::pc(uint32_t new_pc) +{ + reconvergenceStack.top()->pc = new_pc; +} diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh new file mode 100644 index 000000000..0abab8e83 --- /dev/null +++ b/src/gpu-compute/wavefront.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
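A toy model of the reconvergence stack driven by the push/pop helpers above and by the pc() == rpc() check in exec(): each entry carries the PC to execute, the reconvergence PC (the immediate post-dominator), and the active-lane mask. An 8-lane mask and plain value entries keep the demo small; the real code uses a VSZ-wide VectorMask and unique_ptr-owned entries:

#include <bitset>
#include <cstdint>
#include <iostream>
#include <stack>

using Mask = std::bitset<8>;
struct StackEntry { uint32_t pc, rpc; Mask mask; };

int main()
{
    std::stack<StackEntry> stk;
    // Parent context resumes at the reconvergence point (pc 14) with all lanes.
    stk.push({14, 16, Mask(0xFF)});
    // A branch diverged with post-dominator 14: push the two sides with
    // complementary masks (the top of stack executes first).
    stk.push({12, 14, Mask(0xF0)});
    stk.push({10, 14, Mask(0x0F)});

    while (!stk.empty()) {
        StackEntry &top = stk.top();
        std::cout << "pc=" << top.pc << " mask=" << top.mask << "\n";
        uint32_t next = top.pc + 1;
        if (next == top.rpc)
            stk.pop();    // reached the post-dominator: fall back to the next entry
        else
            top.pc = next;
    }
    return 0;
}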
+ * + * Author: Lisa Hsu + */ + +#ifndef __WAVEFRONT_HH__ +#define __WAVEFRONT_HH__ + +#include <cassert> +#include <deque> +#include <memory> +#include <stack> +#include <vector> + +#include "base/misc.hh" +#include "base/types.hh" +#include "gpu-compute/condition_register_state.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/misc.hh" +#include "params/Wavefront.hh" +#include "sim/sim_object.hh" + +static const int MAX_NUM_INSTS_PER_WF = 12; + +/* + * Arguments for the hsail opcode call, are user defined and variable length. + * The hardware/finalizer can support arguments in hardware or use memory to + * pass arguments. For now, let's assume that an unlimited number of arguments + * are supported in hardware (the compiler inlines functions whenver it can + * anyways, so unless someone is interested in the implications of linking/ + * library functions, I think this is a reasonable assumption given the typical + * size of an OpenCL kernel). + * + * Note that call args are different than kernel arguments: + * * All work-items in a kernel refer the same set of kernel arguments + * * Each work-item has it's on set of call args. So a call argument at + * address 0x4 is different for work-item 0 and work-item 1. + * + * Ok, the table below shows an example of how we organize the call arguments in + * the CallArgMem class. + * + * int foo(int arg1, double arg2) + * ___________________________________________________ + * | 0: return.0 | 4: return.1 | ... | 252: return.63 | + * |---------------------------------------------------| + * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | + * |---------------------------------------------------| + * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | + * ___________________________________________________ + */ +class CallArgMem +{ + public: + // pointer to buffer for storing function arguments + uint8_t *mem; + // size of function args + int funcArgsSizePerItem; + + template<typename CType> + int + getLaneOffset(int lane, int addr) + { + return addr * VSZ + sizeof(CType) * lane; + } + + CallArgMem(int func_args_size_per_item) + : funcArgsSizePerItem(func_args_size_per_item) + { + mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ); + } + + ~CallArgMem() + { + free(mem); + } + + template<typename CType> + uint8_t* + getLaneAddr(int lane, int addr) + { + return mem + getLaneOffset<CType>(lane, addr); + } + + template<typename CType> + void + setLaneAddr(int lane, int addr, CType val) + { + *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; + } +}; + +/** + * A reconvergence stack entry conveys the necessary state to implement + * control flow divergence. + */ +class ReconvergenceStackEntry { + + public: + ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc, + VectorMask new_mask) : pc(new_pc), rpc(new_rpc), + execMask(new_mask) { + } + + /** + * PC of current instruction. + */ + uint32_t pc; + /** + * PC of the immediate post-dominator instruction, i.e., the value of + * @a pc for the first instruction that will be executed by the wavefront + * when a reconvergence point is reached. + */ + uint32_t rpc; + /** + * Execution mask. 
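Returning to the CallArgMem layout described above: getLaneOffset() scales a per-work-item argument offset by the wavefront width (VSZ) and then strides by sizeof(CType) across lanes. Assuming per-work-item offsets of 0, 4, and 8 bytes for the return value, arg1, and arg2 (the values consistent with the address table in the comment), a short sketch reproduces those addresses:

#include <cstddef>
#include <iostream>

// Worked example of CallArgMem::getLaneOffset() with the wavefront width fixed
// at 64 lanes, matching the 'int foo(int arg1, double arg2)' table above.
constexpr int kVSZ = 64;

template <typename CType>
constexpr std::size_t
laneOffset(int lane, int addr)
{
    return static_cast<std::size_t>(addr) * kVSZ + sizeof(CType) * lane;
}

int main()
{
    // Per-work-item offsets: return value at 0, arg1 at 4, arg2 at 8.
    std::cout << laneOffset<int>(0, 0)    << "\n"; // return.0  -> 0
    std::cout << laneOffset<int>(63, 0)   << "\n"; // return.63 -> 252
    std::cout << laneOffset<int>(1, 4)    << "\n"; // arg1.1    -> 260
    std::cout << laneOffset<double>(1, 8) << "\n"; // arg2.1    -> 520
    return 0;
}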
+ */ + VectorMask execMask; +}; + +class Wavefront : public SimObject +{ + public: + enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; + enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; + + // Base pointer for array of instruction pointers + uint64_t base_ptr; + + uint32_t old_barrier_cnt; + uint32_t barrier_cnt; + uint32_t barrier_id; + uint32_t barrier_slots; + status_e status; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + int kern_id; + // SIMD unit where the WV has been scheduled + int simdId; + // pointer to parent CU + ComputeUnit *computeUnit; + + std::deque<GPUDynInstPtr> instructionBuffer; + + bool pendingFetch; + bool dropFetch; + + // Condition Register State (for HSAIL simulations only) + class ConditionRegisterState *condRegState; + // number of single precision VGPRs required by WF + uint32_t maxSpVgprs; + // number of double precision VGPRs required by WF + uint32_t maxDpVgprs; + // map virtual to physical vector register + uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); + void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + bool isGmInstruction(GPUDynInstPtr ii); + bool isLmInstruction(GPUDynInstPtr ii); + bool isOldestInstGMem(); + bool isOldestInstLMem(); + bool isOldestInstPrivMem(); + bool isOldestInstFlatMem(); + bool isOldestInstALU(); + bool isOldestInstBarrier(); + // used for passing spill address to DDInstGPU + uint64_t last_addr[VSZ]; + uint32_t workitemid[3][VSZ]; + uint32_t workitemFlatId[VSZ]; + uint32_t workgroupid[3]; + uint32_t workgroupsz[3]; + uint32_t gridsz[3]; + uint32_t wg_id; + uint32_t wg_sz; + uint32_t dynwaveid; + uint32_t maxdynwaveid; + uint32_t dispatchid; + // outstanding global+local memory requests + uint32_t outstanding_reqs; + // memory requests between scoreboard + // and execute stage not yet executed + uint32_t mem_reqs_in_pipe; + // outstanding global memory write requests + uint32_t outstanding_reqs_wr_gm; + // outstanding local memory write requests + uint32_t outstanding_reqs_wr_lm; + // outstanding global memory read requests + uint32_t outstanding_reqs_rd_gm; + // outstanding local memory read requests + uint32_t outstanding_reqs_rd_lm; + uint32_t rd_lm_reqs_in_pipe; + uint32_t rd_gm_reqs_in_pipe; + uint32_t wr_lm_reqs_in_pipe; + uint32_t wr_gm_reqs_in_pipe; + + int mem_trace_busy; + uint64_t last_trace; + // number of vector registers reserved by WF + int reservedVectorRegs; + // Index into the Vector Register File's namespace where the WF's registers + // will live while the WF is executed + uint32_t startVgprIndex; + + // Old value of destination gpr (for trace) + uint32_t old_vgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_vgpr_id; + // Tick count of last old_vgpr copy + uint64_t old_vgpr_tcnt; + + // Old value of destination gpr (for trace) + uint64_t old_dgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_dgpr_id; + // Tick count of last old_vgpr copy + uint64_t old_dgpr_tcnt; + + // Execution mask at wavefront start + VectorMask init_mask; + + // number of barriers this WF has joined + int bar_cnt[VSZ]; + int max_bar_cnt; + // Flag to stall a wave on barrier + bool stalledAtBarrier; + + // a pointer to the fraction of the LDS allocated + // to this workgroup (thus this wavefront) + LdsChunk *ldsChunk; + + // A pointer to the spill area + Addr spillBase; + // The size of the spill area + uint32_t spillSizePerItem; + // The vector width of the spill area + uint32_t spillWidth; + + // A pointer to the private memory area + 
Addr privBase; + // The size of the private memory area + uint32_t privSizePerItem; + + // A pointer ot the read-only memory area + Addr roBase; + // size of the read-only memory area + uint32_t roSize; + + // pointer to buffer for storing kernel arguments + uint8_t *kernelArgs; + // unique WF id over all WFs executed across all CUs + uint64_t wfDynId; + + // number of times instruction issue for this wavefront is blocked + // due to VRF port availability + Stats::Scalar numTimesBlockedDueVrfPortAvail; + // number of times an instruction of a WF is blocked from being issued + // due to WAR and WAW dependencies + Stats::Scalar numTimesBlockedDueWAXDependencies; + // number of times an instruction of a WF is blocked from being issued + // due to WAR and WAW dependencies + Stats::Scalar numTimesBlockedDueRAWDependencies; + // distribution of executed instructions based on their register + // operands; this is used to highlight the load on the VRF + Stats::Distribution srcRegOpDist; + Stats::Distribution dstRegOpDist; + + // Functions to operate on call argument memory + // argument memory for hsail call instruction + CallArgMem *callArgMem; + void + initCallArgMem(int func_args_size_per_item) + { + callArgMem = new CallArgMem(func_args_size_per_item); + } + + template<typename CType> + CType + readCallArgMem(int lane, int addr) + { + return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr))); + } + + template<typename CType> + void + writeCallArgMem(int lane, int addr, CType val) + { + callArgMem->setLaneAddr<CType>(lane, addr, val); + } + + typedef WavefrontParams Params; + Wavefront(const Params *p); + ~Wavefront(); + virtual void init(); + + void + setParent(ComputeUnit *cu) + { + computeUnit = cu; + } + + void start(uint64_t _wfDynId, uint64_t _base_ptr); + + void exec(); + void updateResources(); + int ready(itype_e type); + bool instructionBufferHasBranch(); + void regStats(); + VectorMask get_pred() { return execMask() & init_mask; } + + bool waitingAtBarrier(int lane); + + void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, + const VectorMask& exec_mask); + + void popFromReconvergenceStack(); + + uint32_t pc() const; + + uint32_t rpc() const; + + VectorMask execMask() const; + + bool execMask(int lane) const; + + void pc(uint32_t new_pc); + + void discardFetch(); + + private: + /** + * Stack containing Control Flow Graph nodes (i.e., kernel instructions) + * to be visited by the wavefront, and the associated execution masks. The + * reconvergence stack grows every time the wavefront reaches a divergence + * point (branch instruction), and shrinks every time the wavefront + * reaches a reconvergence point (immediate post-dominator instruction). + */ + std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack; +}; + +#endif // __WAVEFRONT_HH__ diff --git a/src/mem/protocol/GPU_RfO-SQC.sm b/src/mem/protocol/GPU_RfO-SQC.sm new file mode 100644 index 000000000..1e5f8df74 --- /dev/null +++ b/src/mem/protocol/GPU_RfO-SQC.sm @@ -0,0 +1,667 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:SQC, "GPU SQC (L1 I Cache)") + : Sequencer* sequencer; + CacheMemory * L1cache; + int TCC_select_num_bits; + Cycles issue_latency := 80; // time to send data down to TCC + Cycles l2_hit_latency := 18; + + MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromSQC, network="To", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock"; + + MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response"; + + MessageBuffer * mandatoryQueue; +{ + state_declaration(State, desc="SQC Cache States", default="SQC_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + + I_S, AccessPermission:Busy, desc="Invalid, issued RdBlkS, have not seen response yet"; + S_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack"; + I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCCdir for canceled WB"; + } + + enumeration(Event, desc="SQC Events") { + // Core initiated + Fetch, desc="Fetch"; + + //TCC initiated + TCC_AckS, desc="TCC Ack to Core Request"; + TCC_AckWB, desc="TCC Ack for WB"; + TCC_NackWB, desc="TCC Nack for WB"; + + // Mem sys initiated + Repl, desc="Replacing block from cache"; + + // Probe Events + PrbInvData, desc="probe, return M data"; + PrbInv, desc="probe, no need for data"; + PrbShrData, desc="probe downgrade, return data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the 
block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return SQC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return SQC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(SQC_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if 
(request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromSQC); + out_port(responseNetwork_out, ResponseMsg, responseFromSQC); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // In Ports + + in_port(probeNetwork_in, TDProbeRequestMsg, probeToSQC) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(responseToSQC_in, ResponseMsg, responseToSQC) { + if (responseToSQC_in.isReady(clockEdge())) { + peek(responseToSQC_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + if (in_msg.State == CoherenceState:Shared) { + trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe); + } else { + error("SQC should not receive TDSysResp other than CoherenceState:Shared"); + } + } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck) { + trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) { + trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + assert(in_msg.Type == RubyRequestType:IFETCH); + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + + // Actions + + action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(vc_victim, "vc", desc="Victimize E/S Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + 
TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.InitialRequestTime := curCycle(); + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToSQC_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + sequencer.readCallback(address, cache_entry.DataBlk, + false, MachineType:L1Cache); + APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); + } + + action(xl_loadDone, "xl", desc="remote load done") { + peek(responseToSQC_in, ResponseMsg) { + assert(is_valid(cache_entry)); + sequencer.readCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); + } + } + + action(w_writeCache, "w", desc="write data to cache") { + peek(responseToSQC_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToSQC_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToSQC_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to 
probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // Transitions + + // transitions from base + transition(I, Fetch, I_S) {TagArrayRead, TagArrayWrite} { + a_allocate; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // simple hit transitions + transition(S, Fetch) {TagArrayRead, DataArrayRead} { + l_loadDone; + p_popMandatoryQueue; + } + + // recycles from transients + transition({I_S, S_I, I_C}, {Fetch, Repl}) {} { + zz_recycleMandatoryQueue; + } + + transition(S, Repl, S_I) {TagArrayRead} { + t_allocateTBE; + vc_victim; + ic_invCache; + } + + // TCC event + transition(I_S, TCC_AckS, S) {DataArrayRead, DataArrayWrite} { + w_writeCache; + xl_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S_I, TCC_NackWB, I){TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(S_I, TCC_AckWB, I) {TagArrayWrite} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_AckWB, I){TagArrayWrite} { + ss_sendStaleNotification; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_NackWB, I) {TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + // Probe transitions + transition({S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(I_C, PrbInvData, I_C) { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({S}, PrbShrData, S) {DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({I, I_C}, PrbShrData) {TagArrayRead} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C){ + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(I_S, {PrbInv, PrbInvData}) {} { + pi_sendProbeResponseInv; + ic_invCache; + a_allocate; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition(I_S, PrbShrData) {} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(S_I, PrbInvData, I_C) {TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(S_I, PrbInv, I_C) {TagArrayWrite} { + pi_sendProbeResponseInv; + 
ic_invCache; + pp_popProbeQueue; + } + + transition(S_I, PrbShrData) {DataArrayRead} { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } +} diff --git a/src/mem/protocol/GPU_RfO-TCC.sm b/src/mem/protocol/GPU_RfO-TCC.sm new file mode 100644 index 000000000..cfddb3f00 --- /dev/null +++ b/src/mem/protocol/GPU_RfO-TCC.sm @@ -0,0 +1,1199 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
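The pi_/pim_/prm_/pd_/pdm_ "send probe response" actions above differ only in how they set the Dirty, Hit, and Ntsl flags on the CPUPrbResp and in whether a data payload (Response_Data vs. Response_Control) travels with it. A minimal C++ sketch of that encoding follows; the struct and helper names are illustrative only, not gem5 identifiers.

    // Flags carried by a CPUPrbResp, as set by the probe-response actions above.
    struct ProbeRespFlags {
        bool dirty;    // data being returned is dirty (taken from the TBE or cache entry)
        bool hit;      // the responder still holds a valid copy
        bool ntsl;     // no valid copy at the responder; ack only
        bool hasData;  // Response_Data vs. Response_Control message size
    };

    // pi_sendProbeResponseInv / pim_sendProbeResponseInvMs: invalidation ack, no data.
    ProbeRespFlags invAck()            { return {false, false, true,  false}; }
    // prm_sendProbeResponseMiss: PrbShrData missed, nothing to return.
    ProbeRespFlags missAck()           { return {false, false, false, false}; }
    // pd_sendProbeResponseData / pdm_sendProbeResponseDataMs: hit, data returned,
    // dirty bit copied from the TBE if one exists, otherwise from the cache entry.
    ProbeRespFlags dataAck(bool dirty) { return {dirty, true,  false, true};  }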
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:TCC, "TCC Cache") + : CacheMemory * L2cache; + WireBuffer * w_reqToTCCDir; + WireBuffer * w_respToTCCDir; + WireBuffer * w_TCCUnblockToTCCDir; + WireBuffer * w_reqToTCC; + WireBuffer * w_probeToTCC; + WireBuffer * w_respToTCC; + int TCC_select_num_bits; + Cycles l2_request_latency := 1; + Cycles l2_response_latency := 20; + + // To the general response network + MessageBuffer * responseFromTCC, network="To", virtual_network="3", vnet_type="response"; + + // From the general response network + MessageBuffer * responseToTCC, network="From", virtual_network="3", vnet_type="response"; + +{ + // EVENTS + enumeration(Event, desc="TCC Events") { + // Requests coming from the Cores + RdBlk, desc="CPU RdBlk event"; + RdBlkM, desc="CPU RdBlkM event"; + RdBlkS, desc="CPU RdBlkS event"; + CtoD, desc="Change to Dirty request"; + WrVicBlk, desc="L1 Victim (dirty)"; + WrVicBlkShared, desc="L1 Victim (dirty)"; + ClVicBlk, desc="L1 Victim (clean)"; + ClVicBlkShared, desc="L1 Victim (clean)"; + + CPUData, desc="WB data from CPU"; + CPUDataShared, desc="WB data from CPU, NBReqShared 1"; + StaleWB, desc="Stale WB, No data"; + + L2_Repl, desc="L2 Replacement"; + + // Probes + PrbInvData, desc="Invalidating probe, return dirty data"; + PrbInv, desc="Invalidating probe, no need to return data"; + PrbShrData, desc="Downgrading probe, return data"; + + // Coming from Memory Controller + WBAck, desc="ack from memory"; + + CancelWB, desc="Cancel WB from L2"; + } + + // STATES + state_declaration(State, desc="TCC State", default="TCC_State_I") { + M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale + O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S + E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory) + S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. 
If no one in O, then == Memory + I, AccessPermission:Invalid, desc="Invalid"; + + I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_E, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + I_S, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M"; + S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E"; + S_S, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to S"; + E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + E_E, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_S, AccessPermission:Busy, desc="Shared, received WrVicBlk, sent Ack, waiting for Data"; + O_M, AccessPermission:Busy, desc="..."; + O_O, AccessPermission:Busy, desc="..."; + O_E, AccessPermission:Busy, desc="..."; + M_M, AccessPermission:Busy, desc="..."; + M_O, AccessPermission:Busy, desc="..."; + M_E, AccessPermission:Busy, desc="..."; + M_S, AccessPermission:Busy, desc="..."; + D_I, AccessPermission:Invalid, desc="drop WB data on the floor when receive"; + MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem"; + MO_I, AccessPermission:Busy, desc="M or O, received L2_Repl, waiting for WBAck from Mem"; + ES_I, AccessPermission:Busy, desc="E or S, received L2_Repl, waiting for WBAck from Mem"; + I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so nothing gets confused"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + // STRUCTURES + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff from memory?)"; + DataBlock DataBlk, desc="Data for the block"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + bool Shared, desc="Victim hit by shared probe"; + MachineID From, desc="Waiting for writeback from..."; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return 
getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCC_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + + + // OUT PORTS + out_port(w_requestNetwork_out, CPURequestMsg, w_reqToTCCDir); + out_port(w_TCCResp_out, ResponseMsg, w_respToTCCDir); + out_port(responseNetwork_out, ResponseMsg, responseFromTCC); + out_port(w_unblockNetwork_out, UnblockMsg, w_TCCUnblockToTCCDir); + + // IN PORTS + in_port(TDResponse_in, ResponseMsg, w_respToTCC) { + if (TDResponse_in.isReady(clockEdge())) { + peek(TDResponse_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:TDSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } + else { + DPRINTF(RubySlicc, "%s\n", in_msg); + 
error("Error on TDResponse Type"); + } + } + } + } + + // Response Network + in_port(responseNetwork_in, ResponseMsg, responseToTCC) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUData) { + if (in_msg.NbReqShared) { + trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPUData, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Error on TDResponse Type"); + } + } + } + } + + // probe network + in_port(probeNetwork_in, TDProbeRequestMsg, w_probeToTCC) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, TDProbeRequestMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + if (in_msg.ReturnData) { + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } else { + error("Don't think I should get any of these"); + } + } + } + } + } + + // Request Network + in_port(requestNetwork_in, CPURequestMsg, w_reqToTCC) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + assert(in_msg.Destination.isElement(machineID)); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + } + } + } + + // BEGIN ACTIONS + + action(i_invL2, "i", desc="invalidate TCC cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(rm_sendResponseM, "rm", desc="send Modified response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := 
cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Modified; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(rs_sendResponseS, "rs", desc="send Shared response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + action(r_requestToTD, "r", desc="Miss in L2, pass on") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (is_valid(cache_entry)) { + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := cache_entry.Dirty; + } + tbe.From := machineID; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(vc_vicClean, "vc", desc="Victimize Clean L2 data") { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.Requestor := machineID; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(vd_vicDirty, "vd", desc="Victimize dirty L2 data") { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.Requestor := machineID; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack, no data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pm_sendProbeResponseMiss, "pm", desc="send probe ack, no data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := cache_entry.DataBlk; + //assert(cache_entry.Dirty); Not needed in TCC where TCC can supply clean data + out_msg.Dirty := cache_entry.Dirty; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + //assert(tbe.Dirty); + out_msg.Dirty := tbe.Dirty; + out_msg.Hit := true; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.State := CoherenceState:NA; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:WrCancel; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(a_allocateBlock, "a", desc="allocate TCC block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + } + } + + action(d_writeData, "d", desc="write data to TCC") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + cache_entry.Dirty := in_msg.Dirty; + } + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(rd_copyDataFromRequest, "rd", desc="write data to TCC") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := true; + } + } + + action(f_setFrom, "f", desc="set who WB is expected to come from") { + peek(requestNetwork_in, CPURequestMsg) { + tbe.From := in_msg.Requestor; + } + } + + action(rf_resetFrom, "rf", desc="reset From") { + tbe.From := machineID; + } + + action(wb_data, "wb", desc="write back data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + } + + action(uo_sendUnblockOwner, "uo", desc="state changed to E, M, or O, unblock") { + enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := 
MessageSizeType:Unblock_Control; + out_msg.currentOwner := true; + out_msg.valid := true; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(us_sendUnblockSharer, "us", desc="state changed to S , unblock") { + enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.currentOwner := false; + out_msg.valid := true; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(un_sendUnblockNotValid, "un", desc="state changed toI, unblock") { + enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.currentOwner := false; + out_msg.valid := false; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L2cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pn_popTDResponseQueue, "pn", desc="pop TD response queue") { + TDResponse_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "\z", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + + transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}){TagArrayRead} { + // TCCdir already knows that the block is not here. This is to allocate and get the block. 
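Every message these actions send toward the TCC directory picks its destination with mapAddressToRange(address, MachineType:TCCdir, TCC_select_low_bit, TCC_select_num_bits), with TCC_select_low_bit defaulting to RubySystem::getBlockSizeBits(). Assuming this is plain power-of-two interleaving on the selected bit range, the bank index works out as in the sketch below; the helper name is hypothetical.

    #include <cstdint>

    // Hypothetical helper: which TCCdir bank services this address, assuming
    // simple power-of-two interleaving on [selectLowBit, selectLowBit + selectNumBits).
    uint64_t tccdirBank(uint64_t addr, unsigned selectLowBit, unsigned selectNumBits) {
        return (addr >> selectLowBit) & ((1u << selectNumBits) - 1);
    }

    // Example: 64-byte blocks (selectLowBit = 6) and 4 banks (selectNumBits = 2)
    // map addresses 0x000, 0x040, 0x080, 0x0C0 to banks 0, 1, 2, 3 respectively.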
+ r_requestToTD; + p_popRequestQueue; + } + +// check + transition({M, O}, RdBlk, O){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chancing + p_popRequestQueue; + } + +//check + transition({E, S}, RdBlk, S){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chancing + p_popRequestQueue; + } + +// check + transition({M, O}, RdBlkS, O){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chance sharing + p_popRequestQueue; + } + +//check + transition({E, S}, RdBlkS, S){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chance sharing + p_popRequestQueue; + } + +// check + transition(M, RdBlkM, I){TagArrayRead, TagArrayWrite} { + rm_sendResponseM; + i_invL2; + p_popRequestQueue; + } + + //check + transition(E, RdBlkM, I){TagArrayRead, TagArrayWrite} { + rm_sendResponseM; + i_invL2; + p_popRequestQueue; + } + +// check + transition({I}, WrVicBlk, I_M){TagArrayRead} { + a_allocateBlock; + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) { + zz_recycleRequestQueue; + } + +//check + transition({I}, WrVicBlkShared, I_O) {TagArrayRead}{ + a_allocateBlock; + t_allocateTBE; + f_setFrom; +// rd_copyDataFromRequest; + w_sendResponseWBAck; + p_popRequestQueue; + } + +//check + transition(S, WrVicBlkShared, S_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(S, WrVicBlk, S_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, WrVicBlk, E_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, WrVicBlkShared, E_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(O, WrVicBlk, O_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(O, WrVicBlkShared, O_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, WrVicBlk, M_M){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, WrVicBlkShared, M_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +//check + transition({I}, ClVicBlk, I_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({I}, ClVicBlkShared, I_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; + } + +//check + transition(S, ClVicBlkShared, S_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, ClVicBlk, E_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, ClVicBlkShared, E_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(O, ClVicBlk, O_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// check. 
Original L3 ahd it going from O to O_S. Something can go from O to S only on writeback. + transition(O, ClVicBlkShared, O_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, ClVicBlk, M_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, ClVicBlkShared, M_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + + transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) { + a_allocateBlock; + t_allocateTBE; + f_setFrom; + r_requestToTD; + p_popRequestQueue; + } + + transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) { + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_M, CPUData, M){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUData, E){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(S_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_E, CPUDataShared, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. 
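For a block that was Invalid when the victim request was accepted (the I_M / I_O / I_E / I_S transients in the CPUData transitions above), the arriving writeback data decides the final TCC state: plain CPUData installs the exclusive form (M for a dirty victim, E for a clean one), while CPUDataShared downgrades it to the shared-ownership form (O or S). A sketch of that rule, with illustrative names only:

    enum class State { M, O, E, S };

    // Covers the I_M and I_E transients above; I_O and I_S always settle to O and S
    // regardless of whether the data arrives as CPUData or CPUDataShared.
    State stateAfterVictimData(bool dirtyVictim, bool dataShared) {
        if (dirtyVictim)                              // WrVicBlk path (I_M)
            return dataShared ? State::O : State::M;
        return dataShared ? State::S : State::E;      // ClVicBlk path (I_E)
    }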
+ pr_popResponseQueue; + } + + transition({D_I}, {CPUData, CPUDataShared}, I){TagArrayWrite} { + un_sendUnblockNotValid; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(MOD_I, {CPUData, CPUDataShared}, MO_I) { + un_sendUnblockNotValid; + rf_resetFrom; + pr_popResponseQueue; + } + + transition({O,S,I}, CPUData) { + pr_popResponseQueue; + } + + transition({M, O}, L2_Repl, MO_I){TagArrayRead, DataArrayRead} { + t_allocateTBE; + vd_vicDirty; + i_invL2; + } + + transition({E, S,}, L2_Repl, ES_I){TagArrayRead, DataArrayRead} { + t_allocateTBE; + vc_vicClean; + i_invL2; + } + + transition({I_M, I_O, S_M, S_O, E_M, E_O}, L2_Repl) { + zz_recycleRequestQueue; + } + + transition({O_M, O_O, O_E, M_M, M_O, M_E, M_S}, L2_Repl) { + zz_recycleRequestQueue; + } + + transition({I_E, I_S, S_E, S_S, E_E, E_S}, L2_Repl) { + zz_recycleRequestQueue; + } + + transition({M, O}, PrbInvData, I){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + i_invL2; + pp_popProbeQueue; + } + + transition(I, PrbInvData){TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({E, S}, PrbInvData, I){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + i_invL2; + pp_popProbeQueue; + } + + transition({M, O, E, S, I}, PrbInv, I){TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + i_invL2; // nothing will happen in I + pp_popProbeQueue; + } + + transition({M, O}, PrbShrData, O){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({E, S}, PrbShrData, S){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(I, PrbShrData){TagArrayRead} { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) { + pdt_sendProbeResponseDataFromTBE; + pp_popProbeQueue; + } + + transition(ES_I, PrbInvData, I_C) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({ES_I,MO_I}, PrbInv, I_C) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({ES_I, MO_I}, PrbShrData) { + pdt_sendProbeResponseDataFromTBE; + pp_popProbeQueue; + } + + transition(I_C, {PrbInvData, PrbInv}) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(I_C, PrbShrData) { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(MOD_I, WBAck, D_I) { + pn_popTDResponseQueue; + } + + transition(MO_I, WBAck, I){TagArrayWrite} { + dt_deallocateTBE; + pn_popTDResponseQueue; + } + + // this can only be a spurious CPUData from a shared block. 
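The L2_Repl transitions above split TCC replacements by dirtiness: a line in M or O issues vd_vicDirty and waits in MO_I, while a line in E or S issues vc_vicClean and waits in ES_I; in both cases the later WBAck deallocates the TBE and leaves the line Invalid. A compact sketch of that decision, with hypothetical names:

    enum class VictimPath { DirtyWriteback, CleanWriteback };

    struct ReplacementStep {
        VictimPath path;        // vd_vicDirty vs. vc_vicClean
        const char *waitState;  // transient state held until the WBAck arrives
    };

    // Mirrors the {M, O} and {E, S} L2_Repl transitions above.
    ReplacementStep onL2Repl(bool lineDirty) {
        return lineDirty ? ReplacementStep{VictimPath::DirtyWriteback, "MO_I"}
                         : ReplacementStep{VictimPath::CleanWriteback, "ES_I"};
    }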
+ transition(MO_I, CPUData) { + pr_popResponseQueue; + } + + transition(ES_I, WBAck, I){TagArrayWrite} { + dt_deallocateTBE; + pn_popTDResponseQueue; + } + + transition(I_C, {WBAck}, I){TagArrayWrite} { + dt_deallocateTBE; + pn_popTDResponseQueue; + } + + transition({I_M, I_O, I_E, I_S}, StaleWB, I){TagArrayWrite} { + un_sendUnblockNotValid; + dt_deallocateTBE; + i_invL2; + pr_popResponseQueue; + } + + transition({S_S, S_O, S_M, S_E}, StaleWB, S){TagArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({E_M, E_O, E_E, E_S}, StaleWB, E){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({O_M, O_O, O_E}, StaleWB, O){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({M_M, M_O, M_E, M_S}, StaleWB, M){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(D_I, StaleWB, I) {TagArrayWrite}{ + un_sendUnblockNotValid; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(MOD_I, StaleWB, MO_I) { + un_sendUnblockNotValid; + rf_resetFrom; + pr_popResponseQueue; + } + +} diff --git a/src/mem/protocol/GPU_RfO-TCCdir.sm b/src/mem/protocol/GPU_RfO-TCCdir.sm new file mode 100644 index 000000000..8f58d6ebb --- /dev/null +++ b/src/mem/protocol/GPU_RfO-TCCdir.sm @@ -0,0 +1,2672 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
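The uo_/us_/un_ unblock actions earlier in this file encode three situations in two booleans on the UnblockMsg, currentOwner and valid; the TCC directory (the file that follows) decodes them back into TCCUnblock, TCCUnblock_Sharer, and TCCUnblock_NotValid events in its w_TCCUnblock_in port. A sketch of that decoding, using an illustrative enum name:

    enum class TccUnblockKind { Owner, Sharer, NotValid };

    // Mirrors the w_TCCUnblock_in trigger logic in GPU_RfO-TCCdir.sm:
    // currentOwner wins, then valid; otherwise the unblock carries no valid copy
    // (the stale-writeback case handled by un_sendUnblockNotValid).
    TccUnblockKind decodeUnblock(bool currentOwner, bool valid) {
        if (currentOwner) return TccUnblockKind::Owner;
        if (valid)        return TccUnblockKind::Sharer;
        return TccUnblockKind::NotValid;
    }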
+ * + * Author: Mithuna Thottethodi + */ + +machine(MachineType:TCCdir, "AMD read-for-ownership directory for TCC (aka GPU L2)") +: CacheMemory * directory; + // Convention: wire buffers are prefixed with "w_" for clarity + WireBuffer * w_reqToTCCDir; + WireBuffer * w_respToTCCDir; + WireBuffer * w_TCCUnblockToTCCDir; + WireBuffer * w_reqToTCC; + WireBuffer * w_probeToTCC; + WireBuffer * w_respToTCC; + int TCC_select_num_bits; + Cycles response_latency := 5; + Cycles directory_latency := 6; + Cycles issue_latency := 120; + + // From the TCPs or SQCs + MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromTCP, network="From", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromTCP, network="From", virtual_network="5", vnet_type="unblock"; + + // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC. + MessageBuffer * probeToCore, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response"; + + // From the NB + MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response"; + // To the NB + MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * triggerQueue, random="false"; +{ + // STATES + state_declaration(State, desc="Directory states", default="TCCdir_State_I") { + // Base states + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Invalid, desc="Shared"; + E, AccessPermission:Invalid, desc="Shared"; + O, AccessPermission:Invalid, desc="Owner"; + M, AccessPermission:Invalid, desc="Modified"; + + CP_I, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to invalid"; + B_I, AccessPermission:Invalid, desc="Blocked, need not send data after acks are in, going to invalid"; + CP_O, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to owned"; + CP_S, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to shared"; + CP_OM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to O_M"; + CP_SM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to S_M"; + CP_ISM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M"; + CP_IOM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M"; + CP_OSIW, AccessPermission:Invalid, desc="Blocked, must send data after acks+CancelWB are in, going to I_C"; + + + // Transient states and busy states used for handling side (TCC-facing) interactions + BW_S, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + BW_E, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + BW_O, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + BW_M, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + + // Transient states and busy states used for handling upward (TCP-facing) interactions + I_M, AccessPermission:Invalid, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_ES, AccessPermission:Invalid, desc="Invalid, issued RdBlk, have not seen response 
yet"; + I_S, AccessPermission:Invalid, desc="Invalid, issued RdBlkS, have not seen response yet"; + BBS_S, AccessPermission:Invalid, desc="Blocked, going from S to S"; + BBO_O, AccessPermission:Invalid, desc="Blocked, going from O to O"; + BBM_M, AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for data to forward"; + BBM_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for data to forward"; + BB_M, AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for unblock"; + BB_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for unblock"; + BB_OO, AccessPermission:Invalid, desc="Blocked, going from O to O (adding sharers), waiting for unblock"; + BB_S, AccessPermission:Invalid, desc="Blocked, going to S, waiting for (possible multiple) unblock(s)"; + BBS_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M"; + BBO_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M"; + BBS_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade"; + BBO_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade"; + S_M, AccessPermission:Invalid, desc="Shared, issued CtoD, have not seen response yet"; + O_M, AccessPermission:Invalid, desc="Shared, issued CtoD, have not seen response yet"; + + // + BBB_S, AccessPermission:Invalid, desc="Blocked, going to S after core unblock"; + BBB_M, AccessPermission:Invalid, desc="Blocked, going to M after core unblock"; + BBB_E, AccessPermission:Invalid, desc="Blocked, going to E after core unblock"; + + VES_I, AccessPermission:Invalid, desc="TCC replacement, waiting for clean WB ack"; + VM_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack"; + VO_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack"; + VO_S, AccessPermission:Invalid, desc="TCC owner replacement, waiting for dirty WB ack"; + + ES_I, AccessPermission:Invalid, desc="L1 replacement, waiting for clean WB ack"; + MO_I, AccessPermission:Invalid, desc="L1 replacement, waiting for dirty WB ack"; + + I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB for canceled WB"; + I_W, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB; canceled WB raced with directory invalidation"; + + // Recall States + BRWD_I, AccessPermission:Invalid, desc="Recalling, waiting for WBAck and Probe Data responses"; + BRW_I, AccessPermission:Read_Write, desc="Recalling, waiting for WBAck"; + BRD_I, AccessPermission:Invalid, desc="Recalling, waiting for Probe Data responses"; + + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + + // EVENTS + enumeration(Event, desc="TCC Directory Events") { + // Upward facing events (TCCdir w.r.t. 
TCP/SQC and TCC behaves like NBdir behaves with TCP/SQC and L3 + + // Directory Recall + Recall, desc="directory cache is full"; + // CPU requests + CPUWrite, desc="Initial req from core, sent to TCC"; + NoCPUWrite, desc="Initial req from core, but non-exclusive clean data; can be discarded"; + CPUWriteCancel, desc="Initial req from core, sent to TCC"; + + // Requests from the TCPs + RdBlk, desc="RdBlk event"; + RdBlkM, desc="RdBlkM event"; + RdBlkS, desc="RdBlkS event"; + CtoD, desc="Change to Dirty request"; + + // TCC writebacks + VicDirty, desc="..."; + VicDirtyLast, desc="..."; + VicClean, desc="..."; + NoVic, desc="..."; + StaleVic, desc="..."; + CancelWB, desc="TCC got invalidating probe, canceled WB"; + + // Probe Responses from TCP/SQCs + CPUPrbResp, desc="Probe response from TCP/SQC"; + TCCPrbResp, desc="Probe response from TCC"; + + ProbeAcksComplete, desc="All acks received"; + ProbeAcksCompleteReissue, desc="All acks received, changing CtoD to reissue"; + + CoreUnblock, desc="unblock from TCP/SQC"; + LastCoreUnblock, desc="Last unblock from TCP/SQC"; + TCCUnblock, desc="unblock from TCC (current owner)"; + TCCUnblock_Sharer, desc="unblock from TCC (a sharer, not owner)"; + TCCUnblock_NotValid,desc="unblock from TCC (not valid...caused by stale writebacks)"; + + // Downward facing events + + // NB initiated + NB_AckS, desc="NB Ack to TCC Request"; + NB_AckE, desc="NB Ack to TCC Request"; + NB_AckM, desc="NB Ack to TCC Request"; + NB_AckCtoD, desc="NB Ack to TCC Request"; + NB_AckWB, desc="NB Ack for clean WB"; + + + // Incoming Probes from NB + PrbInvData, desc="Invalidating probe, return dirty data"; + PrbInv, desc="Invalidating probe, no need to return data"; + PrbShrData, desc="Downgrading probe, return data"; + } + + + // TYPES + + // Entry for directory + structure(Entry, desc="...", interface='AbstractCacheEntry') { + State CacheState, desc="Cache state (Cache of directory entries)"; + DataBlock DataBlk, desc="data for the block"; + NetDest Sharers, desc="Sharers for this block"; + NetDest Owner, desc="Owner of this block"; + NetDest MergedSharers, desc="Read sharers who are merged on a request"; + int WaitingUnblocks, desc="Number of acks we're waiting for"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="DataBlk"; + bool Dirty, desc="Is the data dirty?"; + MachineID Requestor, desc="requestor"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID UntransferredOwner, desc = "Untransferred owner for an upgrade transaction"; + bool UntransferredOwnerExists, desc = "1 if Untransferred owner exists for an upgrade transaction"; + bool Cached, desc="data hit in Cache"; + bool Shared, desc="victim hit by shared probe"; + bool Upgrade, desc="An upgrade request in progress"; + bool CtoD, desc="Saved sysack info"; + CoherenceState CohState, desc="Saved sysack info"; + MessageSizeType MessageSize, desc="Saved sysack info"; + MachineID Sender, desc="sender"; + } + + structure(TBETable, external = "yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + // ** OBJECTS ** + TBETable TBEs, template="<TCCdir_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + NetDest TCC_dir_subtree; + NetDest temp; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void 
unset_tbe(); + + + bool presentOrAvail(Addr addr) { + return directory.isTagPresent(addr) || directory.cacheAvail(addr); + } + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", directory.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + assert(false); + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCCdir_State_to_permission(state)); + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCCdir_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCCdir_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + + if (state == State:S) { + assert(cache_entry.Owner.count() == 0); + } + + if (state == State:O) { + assert(cache_entry.Owner.count() == 1); + assert(cache_entry.Sharers.isSuperset(cache_entry.Owner) == false); + } + + if (state == State:M) { + assert(cache_entry.Owner.count() == 1); + assert(cache_entry.Sharers.count() == 0); + } + + if (state == State:E) { + assert(cache_entry.Owner.count() == 0); + assert(cache_entry.Sharers.count() == 1); + } + } + } + + + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + directory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + directory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + directory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + directory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return directory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return directory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return directory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return 
directory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // ** OUT_PORTS ** + + // Three classes of ports + // Class 1: downward facing network links to NB + out_port(requestToNB_out, CPURequestMsg, requestToNB); + out_port(responseToNB_out, ResponseMsg, responseToNB); + out_port(unblockToNB_out, UnblockMsg, unblockToNB); + + + // Class 2: upward facing ports to GPU cores + out_port(probeToCore_out, TDProbeRequestMsg, probeToCore); + out_port(responseToCore_out, ResponseMsg, responseToCore); + + // Class 3: sideward facing ports (on "wirebuffer" links) to TCC + out_port(w_requestTCC_out, CPURequestMsg, w_reqToTCC); + out_port(w_probeTCC_out, NBProbeRequestMsg, w_probeToTCC); + out_port(w_respTCC_out, ResponseMsg, w_respToTCC); + + + // local trigger port + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + // + // request queue going to NB + // + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=8) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + assert(is_valid(tbe)); + Entry cache_entry := getCacheEntry(in_msg.addr); + if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == false)) { + trigger(Event:ProbeAcksComplete, in_msg.addr, cache_entry, tbe); + } else if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == true)) { + trigger(Event:ProbeAcksCompleteReissue, in_msg.addr, cache_entry, tbe); + } + } + } + } + + // Unblock Networks (TCCdir can receive unblocks from TCC, TCPs) + // Port on first (of three) wire buffers from TCC + in_port(w_TCCUnblock_in, UnblockMsg, w_TCCUnblockToTCCDir, rank=7) { + if (w_TCCUnblock_in.isReady(clockEdge())) { + peek(w_TCCUnblock_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.currentOwner) { + trigger(Event:TCCUnblock, in_msg.addr, cache_entry, tbe); + } else if (in_msg.valid) { + trigger(Event:TCCUnblock_Sharer, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:TCCUnblock_NotValid, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(unblockNetwork_in, UnblockMsg, unblockFromTCP, rank=6) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if(cache_entry.WaitingUnblocks == 1) { + trigger(Event:LastCoreUnblock, in_msg.addr, cache_entry, tbe); + } + else { + trigger(Event:CoreUnblock, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + //Responses from TCC, and Cores + // Port on second (of three) wire buffers from TCC + in_port(w_TCCResponse_in, ResponseMsg, w_respToTCCDir, rank=5) { + if (w_TCCResponse_in.isReady(clockEdge())) { + peek(w_TCCResponse_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:TCCPrbResp, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(responseNetwork_in, ResponseMsg, responseFromTCP, rank=4) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:CPUPrbResp, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + // 
Port on third (of three) wire buffers from TCC + in_port(w_TCCRequest_in, CPURequestMsg, w_reqToTCCDir, rank=3) { + if(w_TCCRequest_in.isReady(clockEdge())) { + peek(w_TCCRequest_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:WrCancel) { + trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) { + // if modified, or owner with no other sharers + if ((cache_entry.CacheState == State:M) || (cache_entry.Sharers.count() == 0)) { + assert(cache_entry.Owner.count()==1); + trigger(Event:VicDirtyLast, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:VicDirty, in_msg.addr, cache_entry, tbe); + } + } else { + trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe); + } + } else { + if (in_msg.Type == CoherenceRequestType:VicClean) { + if (is_valid(cache_entry) && cache_entry.Sharers.isElement(in_msg.Requestor)) { + if (cache_entry.Sharers.count() == 1) { + // Last copy, victimize to L3 + trigger(Event:VicClean, in_msg.addr, cache_entry, tbe); + } else { + // Either not the last copy or stall. No need to victimmize + // remove sharer from sharer list + assert(cache_entry.Sharers.count() > 1); + trigger(Event:NoVic, in_msg.addr, cache_entry, tbe); + } + } else { + trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe); + } + } + } + } + } + } + + in_port(responseFromNB_in, ResponseMsg, responseFromNB, rank=2) { + if (responseFromNB_in.isReady(clockEdge())) { + peek(responseFromNB_in, ResponseMsg, block_on="addr") { + + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if (in_msg.State == CoherenceState:Modified) { + if (in_msg.CtoD) { + trigger(Event:NB_AckCtoD, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.State == CoherenceState:Shared) { + trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Finally handling incoming requests (from TCP) and probes (from NB). 
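The w_TCCRequest_in port above classifies a TCC victim writeback before acting on it: a dirty victim from the recorded owner is VicDirtyLast when the line is Modified or no other sharers remain, and VicDirty otherwise; a clean victim from a recorded sharer is VicClean only when it is the last copy, and NoVic otherwise; anything from a core the directory no longer tracks becomes StaleVic. A condensed C++ sketch of that decision tree, with illustrative types and field names standing in for the directory entry:

    enum class VicEvent { VicDirtyLast, VicDirty, VicClean, NoVic, StaleVic };

    struct DirEntry {
        bool modified;     // CacheState == M
        bool ownerIsReq;   // Owner.isElement(requestor)
        bool sharerIsReq;  // Sharers.isElement(requestor)
        int  sharerCount;  // Sharers.count()
    };

    // Mirrors the VicDirty / VicClean classification in w_TCCRequest_in above.
    VicEvent classifyVictim(bool dirtyVictim, const DirEntry *e) {
        if (dirtyVictim) {
            if (e && e->ownerIsReq)
                return (e->modified || e->sharerCount == 0) ? VicEvent::VicDirtyLast
                                                            : VicEvent::VicDirty;
            return VicEvent::StaleVic;
        }
        if (e && e->sharerIsReq)
            return (e->sharerCount == 1) ? VicEvent::VicClean : VicEvent::NoVic;
        return VicEvent::StaleVic;
    }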
+ + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB, rank=1) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + DPRINTF(RubySlicc, "%s\n", in_msg); + DPRINTF(RubySlicc, "machineID: %s\n", machineID); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) { + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Type == CoherenceRequestType:VicDirty) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if(is_valid(cache_entry) && (cache_entry.Sharers.count() + cache_entry.Owner.count() ) >1) { + trigger(Event:NoCPUWrite, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WrCancel) { + trigger(Event:CPUWriteCancel, in_msg.addr, cache_entry, tbe); + } + } else { + // All requests require a directory entry + Addr victim := directory.cacheProbe(in_msg.addr); + trigger(Event:Recall, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + + + + + // Actions + + //Downward facing actions + + action(c_clearOwner, "c", desc="Clear the owner field") { + cache_entry.Owner.clear(); + } + + action(rS_removeRequesterFromSharers, "rS", desc="Remove unblocker from sharer list") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Sharers.remove(in_msg.Sender); + } + } + + action(rT_removeTCCFromSharers, "rT", desc="Remove TCC from sharer list") { + peek(w_TCCRequest_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + } + } + + action(rO_removeOriginalRequestorFromSharers, "rO", desc="Remove replacing core from sharer list") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + } + } + + action(rC_removeCoreFromSharers, "rC", desc="Remove replacing core from sharer list") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + } + } + + action(rCo_removeCoreFromOwner, "rCo", desc="Remove replacing core from sharer list") { + // Note that under some cases this action will try to remove a stale owner + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.Owner.remove(in_msg.Requestor); + } + } + + action(rR_removeResponderFromSharers, "rR", desc="Remove responder from sharer list") { + peek(responseNetwork_in, ResponseMsg) { + 
cache_entry.Sharers.remove(in_msg.Sender); + } + } + + action(nC_sendNullWBAckToCore, "nC", desc = "send a null WB Ack to release core") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBNack; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := in_msg.MessageSize; + } + } + } + + action(nT_sendNullWBAckToTCC, "nT", desc = "send a null WB Ack to release TCC") { + peek(w_TCCRequest_in, CPURequestMsg) { + enqueue(w_respTCC_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := in_msg.MessageSize; + } + } + } + + action(eto_moveExSharerToOwner, "eto", desc="move the current exclusive sharer to owner") { + assert(cache_entry.Sharers.count() == 1); + assert(cache_entry.Owner.count() == 0); + cache_entry.Owner := cache_entry.Sharers; + cache_entry.Sharers.clear(); + APPEND_TRANSITION_COMMENT(" new owner "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + + action(aT_addTCCToSharers, "aT", desc="Add TCC to sharer list") { + peek(w_TCCUnblock_in, UnblockMsg) { + cache_entry.Sharers.add(in_msg.Sender); + } + } + + action(as_addToSharers, "as", desc="Add unblocker to sharer list") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Sharers.add(in_msg.Sender); + } + } + + action(c_moveOwnerToSharer, "cc", desc="Move owner to sharers") { + cache_entry.Sharers.addNetDest(cache_entry.Owner); + cache_entry.Owner.clear(); + } + + action(cc_clearSharers, "\c", desc="Clear the sharers field") { + cache_entry.Sharers.clear(); + } + + action(e_ownerIsUnblocker, "e", desc="The owner is now the unblocker") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Owner.clear(); + cache_entry.Owner.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" tcp_ub owner "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + } + + action(eT_ownerIsUnblocker, "eT", desc="TCC (unblocker) is now owner") { + peek(w_TCCUnblock_in, UnblockMsg) { + cache_entry.Owner.clear(); + cache_entry.Owner.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" tcc_ub owner "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + } + + action(ctr_copyTCCResponseToTBE, "ctr", desc="Copy TCC probe response data to TBE") { + peek(w_TCCResponse_in, ResponseMsg) { + // Overwrite data if tbe does not hold dirty data. Stop once it is dirty. + if(tbe.Dirty == false) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.Sender := in_msg.Sender; + } + DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk)); + } + } + + action(ccr_copyCoreResponseToTBE, "ccr", desc="Copy core probe response data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + // Overwrite data if tbe does not hold dirty data. Stop once it is dirty. 
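+ // The Sender field is also updated (only while it still names this directory) so the eventual response to the requestor identifies the core that actually supplied the data.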
+ if(tbe.Dirty == false) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + + if(tbe.Sender == machineID) { + tbe.Sender := in_msg.Sender; + } + } + DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk)); + } + } + + action(cd_clearDirtyBitTBE, "cd", desc="Clear Dirty bit in TBE") { + tbe.Dirty := false; + } + + action(n_issueRdBlk, "n-", desc="Issue RdBlk") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(rU_rememberUpgrade, "rU", desc="Remember that this was an upgrade") { + tbe.Upgrade := true; + } + + action(ruo_rememberUntransferredOwner, "ruo", desc="Remember the untransferred owner") { + peek(responseNetwork_in, ResponseMsg) { + if(in_msg.UntransferredOwner == true) { + tbe.UntransferredOwner := in_msg.Sender; + tbe.UntransferredOwnerExists := true; + } + DPRINTF(RubySlicc, "%s\n", (in_msg)); + } + } + + action(ruoT_rememberUntransferredOwnerTCC, "ruoT", desc="Remember the untransferred owner") { + peek(w_TCCResponse_in, ResponseMsg) { + if(in_msg.UntransferredOwner == true) { + tbe.UntransferredOwner := in_msg.Sender; + tbe.UntransferredOwnerExists := true; + } + DPRINTF(RubySlicc, "%s\n", (in_msg)); + } + } + + action(vd_victim, "vd", desc="Victimize M/O Data") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Dirty := true; + } + } + + action(vc_victim, "vc", desc="Victimize E/S Data") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Dirty := false; + } + } + + + action(sT_sendRequestToTCC, "sT", desc="send request to TCC") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(w_requestTCC_out, CPURequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := 
in_msg.MessageSize; + } + APPEND_TRANSITION_COMMENT(" requestor "); + APPEND_TRANSITION_COMMENT(in_msg.Requestor); + + } + } + + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + + temp := cache_entry.Sharers; + temp.addNetDest(cache_entry.Owner); + if (temp.isElement(tcc)) { + temp.remove(tcc); + } + if (temp.count() > 0) { + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination := temp; + tbe.NumPendingAcks := temp.count(); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + + action(ls2_probeShrL2Data, "ls2", desc="local probe downgrade L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(s2_probeShrL2Data, "s2", desc="probe shared L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(ldc_probeInvCoreData, "ldc", desc="local probe to inv cores, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + peek(coreRequestNetwork_in, CPURequestMsg) { + NetDest dest:= cache_entry.Sharers; + dest.addNetDest(cache_entry.Owner); + if(dest.isElement(tcc)){ + dest.remove(tcc); + } + dest.remove(in_msg.Requestor); + tbe.NumPendingAcks := dest.count(); + if (dest.count()>0){ + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + + out_msg.Destination.addNetDest(dest); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + } + + action(ld2_probeInvL2Data, "ld2", desc="local probe inv L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", 
out_msg); + + } + } + } + + action(dc_probeInvCoreData, "dc", desc="probe inv cores + TCC, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + + out_msg.Destination.addNetDest(cache_entry.Sharers); + out_msg.Destination.addNetDest(cache_entry.Owner); + tbe.NumPendingAcks := cache_entry.Sharers.count() + cache_entry.Owner.count(); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + if (out_msg.Destination.isElement(tcc)) { + out_msg.Destination.remove(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + + action(d2_probeInvL2Data, "d2", desc="probe inv L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(lpc_probeInvCore, "lpc", desc="local probe inv cores, no data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + TCC_dir_subtree.broadcast(MachineType:TCP); + TCC_dir_subtree.broadcast(MachineType:SQC); + + temp := cache_entry.Sharers; + temp := temp.OR(cache_entry.Owner); + TCC_dir_subtree := TCC_dir_subtree.AND(temp); + tbe.NumPendingAcks := TCC_dir_subtree.count(); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + if(TCC_dir_subtree.isElement(in_msg.Requestor)) { + TCC_dir_subtree.remove(in_msg.Requestor); + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } + + if(TCC_dir_subtree.count() > 0) { + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.localCtoD := true; + + out_msg.Destination.addNetDest(TCC_dir_subtree); + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + } + + action(ipc_probeInvCore, "ipc", desc="probe inv cores, no data") { + TCC_dir_subtree.broadcast(MachineType:TCP); + TCC_dir_subtree.broadcast(MachineType:SQC); + + temp := cache_entry.Sharers; + temp := temp.OR(cache_entry.Owner); + TCC_dir_subtree := TCC_dir_subtree.AND(temp); + tbe.NumPendingAcks := TCC_dir_subtree.count(); + if(TCC_dir_subtree.count() > 0) { + + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + + out_msg.Destination.addNetDest(TCC_dir_subtree); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + + action(i2_probeInvL2, "i2", desc="probe inv L2, no data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, 
TDProbeRequestMsg, 1) { + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(mc_cancelWB, "mc", desc="send writeback cancel to NB directory") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:WrCancel; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + 
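+ // Upward-facing response actions: data and state collected in the TBE (from NB system acks or from local probe responses) are forwarded to the requesting TCP/SQC, either to the single original requestor or collectively to all merged sharers.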
action(sCS_sendCollectiveResponseS, "sCS", desc="send shared response to all merged TCP/SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := tbe.Sender; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.CtoD := false; + out_msg.State := CoherenceState:Shared; + out_msg.Destination.addNetDest(cache_entry.MergedSharers); + out_msg.Shared := tbe.Shared; + out_msg.Dirty := tbe.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sS_sendResponseS, "sS", desc="send shared response to TCP/SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := tbe.Sender; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.CtoD := false; + out_msg.State := CoherenceState:Shared; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := tbe.Shared; + out_msg.Dirty := tbe.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sM_sendResponseM, "sM", desc="send response to TCP/SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := tbe.Sender; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.CtoD := false; + out_msg.State := CoherenceState:Modified; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := tbe.Shared; + out_msg.Dirty := tbe.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + + + action(fw2_forwardWBAck, "fw2", desc="forward WBAck to TCC") { + peek(responseFromNB_in, ResponseMsg) { + if(tbe.OriginalRequestor != machineID) { + enqueue(w_respTCC_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Sender := machineID; + //out_msg.DataBlk := tbe.DataBlk; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := in_msg.MessageSize; + } + } + } + } + + action(sa_saveSysAck, "sa", desc="Save SysAck ") { + peek(responseFromNB_in, ResponseMsg) { + tbe.Dirty := in_msg.Dirty; + if (tbe.Dirty == false) { + tbe.DataBlk := in_msg.DataBlk; + } + else { + tbe.DataBlk := tbe.DataBlk; + } + tbe.CtoD := in_msg.CtoD; + tbe.CohState := in_msg.State; + tbe.Shared := in_msg.Shared; + tbe.MessageSize := in_msg.MessageSize; + } + } + + action(fsa_forwardSavedAck, "fsa", desc="forward saved SysAck to TCP or SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + if (tbe.Dirty == false) { + out_msg.DataBlk := tbe.DataBlk; + } + else { + out_msg.DataBlk := tbe.DataBlk; + } + out_msg.CtoD := tbe.CtoD; + out_msg.State := tbe.CohState; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := tbe.Shared; + out_msg.MessageSize := tbe.MessageSize; + out_msg.Dirty := tbe.Dirty; + out_msg.Sender := tbe.Sender; + } + } + + action(fa_forwardSysAck, "fa", desc="forward SysAck to TCP or SQC") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + if (tbe.Dirty == false) { + out_msg.DataBlk := in_msg.DataBlk; + tbe.Sender := machineID; + } + else { + out_msg.DataBlk := tbe.DataBlk; + } + out_msg.CtoD := 
in_msg.CtoD; + out_msg.State := in_msg.State; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Dirty := in_msg.Dirty; + out_msg.Sender := tbe.Sender; + DPRINTF(RubySlicc, "%s\n", (out_msg.DataBlk)); + } + } + } + + action(pso_probeSharedDataOwner, "pso", desc="probe shared data at owner") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if (cache_entry.Owner.isElement(tcc)) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + else { // i.e., owner is a core + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.addNetDest(cache_entry.Owner); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + tbe.NumPendingAcks := 1; + } + + action(i_popIncomingRequestQueue, "i", desc="Pop incoming request queue") { + coreRequestNetwork_in.dequeue(clockEdge()); + } + + action(j_popIncomingUnblockQueue, "j", desc="Pop incoming unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(pk_popResponseQueue, "pk", desc="Pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="Pop incoming probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(pR_popResponseFromNBQueue, "pR", desc="Pop incoming Response queue From NB") { + responseFromNB_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pl_popTCCRequestQueue, "pl", desc="pop TCC request queue") { + w_TCCRequest_in.dequeue(clockEdge()); + } + + action(plr_popTCCResponseQueue, "plr", desc="pop TCC response queue") { + w_TCCResponse_in.dequeue(clockEdge()); + } + + action(plu_popTCCUnblockQueue, "plu", desc="pop TCC unblock queue") { + w_TCCUnblock_in.dequeue(clockEdge()); + } + + + action(m_addUnlockerToSharers, "m", desc="Add the unlocker to the sharer list") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Sharers.add(in_msg.Sender); + cache_entry.MergedSharers.remove(in_msg.Sender); + assert(cache_entry.WaitingUnblocks >= 0); + cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks - 1; + } + } + + action(q_addOutstandingMergedSharer, "q", desc="Increment outstanding requests") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.MergedSharers.add(in_msg.Requestor); + cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks + 1; + } + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockToNB_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(zz_recycleRequest, "\z", desc="Recycle the request queue") { + coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(yy_recycleTCCRequestQueue, "yy", desc="recycle yy request queue") { + w_TCCRequest_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xz_recycleResponseQueue, "xz", 
desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xx_recycleTCCResponseQueue, "xx", desc="recycle TCC response queue") { + w_TCCResponse_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(vv_recycleTCCUnblockQueue, "vv", desc="Recycle the probe request queue") { + w_TCCUnblock_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xy_recycleUnblockQueue, "xy", desc="Recycle the probe request queue") { + w_TCCUnblock_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(ww_recycleProbeRequest, "ww", desc="Recycle the probe request queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } + + action(o_checkForAckCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" tbe acks "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(tp_allocateTBE, "tp", desc="allocate TBE Entry for upward transactions") { + check_allocate(TBEs); + peek(probeNetwork_in, NBProbeRequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.Dirty := false; + tbe.NumPendingAcks := 0; + tbe.UntransferredOwnerExists := false; + } + } + + action(tv_allocateTBE, "tv", desc="allocate TBE Entry for TCC transactions") { + check_allocate(TBEs); + peek(w_TCCRequest_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := in_msg.DataBlk; // Data only for WBs + tbe.Dirty := false; + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.UntransferredOwnerExists := false; + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs);//check whether resources are full + peek(coreRequestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := false; + tbe.Upgrade := false; + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.UntransferredOwnerExists := false; + tbe.Sender := machineID; + } + } + + action(tr_allocateTBE, "tr", desc="allocate TBE Entry for recall") { + check_allocate(TBEs);//check whether resources are full + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := false; + tbe.Upgrade := false; + tbe.OriginalRequestor := machineID; //Recall request, Self initiated + tbe.NumPendingAcks := 0; + tbe.UntransferredOwnerExists := false; + } + + action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + + action(d_allocateDir, "d", desc="allocate Directory Cache") { + if (is_invalid(cache_entry)) { + set_cache_entry(directory.allocate(address, new Entry)); + } + } + + action(dd_deallocateDir, "dd", desc="deallocate Directory Cache") { + if (is_valid(cache_entry)) { + directory.deallocate(address); + } + unset_cache_entry(); + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + 
out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(wb_data, "wb", desc="write back data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(y_writeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (!tbe.Dirty || in_msg.Dirty) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(ty_writeTCCDataToTBE, "ty", desc="write TCC Probe Data to TBE") { + peek(w_TCCResponse_in, ResponseMsg) { + if (!tbe.Dirty || in_msg.Dirty) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + directory.setMRU(address); + } + + // TRANSITIONS + + // Handling TCP/SQC requests (similar to how NB dir handles TCC events with some changes to account for stateful directory). + + + // transitions from base + transition(I, RdBlk, I_ES){TagArrayRead} { + d_allocateDir; + t_allocateTBE; + n_issueRdBlk; + i_popIncomingRequestQueue; + } + + transition(I, RdBlkS, I_S){TagArrayRead} { + d_allocateDir; + t_allocateTBE; + nS_issueRdBlkS; + i_popIncomingRequestQueue; + } + + + transition(I_S, NB_AckS, BBB_S) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(I_ES, NB_AckS, BBB_S) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(I_ES, NB_AckE, BBB_E) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition({S_M, O_M}, {NB_AckCtoD,NB_AckM}, BBB_M) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(I_M, NB_AckM, BBB_M) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(BBB_M, CoreUnblock, M){TagArrayWrite} { + c_clearOwner; + cc_clearSharers; + e_ownerIsUnblocker; + uu_sendUnblock; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(BBB_S, CoreUnblock, S){TagArrayWrite} { + as_addToSharers; + uu_sendUnblock; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(BBB_E, CoreUnblock, E){TagArrayWrite} { + as_addToSharers; + uu_sendUnblock; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + + transition(I, RdBlkM, I_M){TagArrayRead} { + d_allocateDir; + t_allocateTBE; + nM_issueRdBlkM; + i_popIncomingRequestQueue; + } + + // + transition(S, {RdBlk, RdBlkS}, BBS_S){TagArrayRead} { + t_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + // Merging of read sharing into a single request + transition(BBS_S, {RdBlk, RdBlkS}) { + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + // Wait for probe acks to be complete + transition(BBS_S, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + 
o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBS_S, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + // Window for merging complete with this transition + // Send responses to all outstanding + transition(BBS_S, ProbeAcksComplete, BB_S) { + sCS_sendCollectiveResponseS; + pt_popTriggerQueue; + } + + transition(BB_S, CoreUnblock, BB_S) { + m_addUnlockerToSharers; + j_popIncomingUnblockQueue; + } + + transition(BB_S, LastCoreUnblock, S) { + m_addUnlockerToSharers; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(O, {RdBlk, RdBlkS}, BBO_O){TagArrayRead} { + t_allocateTBE; + pso_probeSharedDataOwner; + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + // Merging of read sharing into a single request + transition(BBO_O, {RdBlk, RdBlkS}) { + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + + // Wait for probe acks to be complete + transition(BBO_O, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBO_O, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + // Window for merging complete with this transition + // Send responses to all outstanding + transition(BBO_O, ProbeAcksComplete, BB_OO) { + sCS_sendCollectiveResponseS; + pt_popTriggerQueue; + } + + transition(BB_OO, CoreUnblock) { + m_addUnlockerToSharers; + j_popIncomingUnblockQueue; + } + + transition(BB_OO, LastCoreUnblock, O){TagArrayWrite} { + m_addUnlockerToSharers; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(S, CPUWrite, BW_S){TagArrayRead} { + t_allocateTBE; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(E, CPUWrite, BW_E){TagArrayRead} { + t_allocateTBE; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(O, CPUWrite, BW_O){TagArrayRead} { + t_allocateTBE; + rCo_removeCoreFromOwner; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(M, CPUWrite, BW_M){TagArrayRead} { + t_allocateTBE; + rCo_removeCoreFromOwner; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(BW_S, TCCUnblock_Sharer, S){TagArrayWrite} { + aT_addTCCToSharers; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_S, TCCUnblock_NotValid, S){TagArrayWrite} { + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_E, TCCUnblock, E){TagArrayWrite} { + cc_clearSharers; + aT_addTCCToSharers; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_E, TCCUnblock_NotValid, E) { + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_M, TCCUnblock, M) { + c_clearOwner; + cc_clearSharers; + eT_ownerIsUnblocker; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_M, TCCUnblock_NotValid, M) { + // Note this transition should only be executed if we received a stale wb + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_O, TCCUnblock, O) { + c_clearOwner; + eT_ownerIsUnblocker; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_O, TCCUnblock_NotValid, O) { + // Note this transition should only be executed if we received a stale wb + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + // We lost the owner likely do to an invalidation racing with a 'O' wb + transition(BW_O, 
TCCUnblock_Sharer, S) { + c_clearOwner; + aT_addTCCToSharers; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition({BW_M, BW_S, BW_E, BW_O}, {PrbInv,PrbInvData,PrbShrData}) { + ww_recycleProbeRequest; + } + + transition(BRWD_I, {PrbInvData, PrbInv, PrbShrData}) { + ww_recycleProbeRequest; + } + + // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD + transition(S, CtoD, BBS_UM) {TagArrayRead} { + t_allocateTBE; + lpc_probeInvCore; + i2_probeInvL2; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBS_UM, CPUPrbResp, BBS_UM) { + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBS_UM, TCCPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBS_UM, ProbeAcksComplete, S_M) { + rU_rememberUpgrade; + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD + transition(O, CtoD, BBO_UM){TagArrayRead} { + t_allocateTBE; + lpc_probeInvCore; + i2_probeInvL2; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBO_UM, CPUPrbResp, BBO_UM) { + ruo_rememberUntransferredOwner; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBO_UM, TCCPrbResp) { + ruoT_rememberUntransferredOwnerTCC; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBO_UM, ProbeAcksComplete, O_M) { + rU_rememberUpgrade; + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + transition({S,E}, RdBlkM, BBS_M){TagArrayWrite} { + t_allocateTBE; + ldc_probeInvCoreData; + ld2_probeInvL2Data; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBS_M, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + rR_removeResponderFromSharers; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBS_M, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBS_M, ProbeAcksComplete, S_M) { + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + transition(O, RdBlkM, BBO_M){TagArrayRead} { + t_allocateTBE; + ldc_probeInvCoreData; + ld2_probeInvL2Data; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBO_M, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + rR_removeResponderFromSharers; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBO_M, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBO_M, ProbeAcksComplete, O_M) { + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + // + transition(M, RdBlkM, BBM_M){TagArrayRead} { + t_allocateTBE; + ldc_probeInvCoreData; + ld2_probeInvL2Data; + i_popIncomingRequestQueue; + } + + transition(BBM_M, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + // TCP recalled block before receiving probe + transition({BBM_M, BBS_M, BBO_M}, {CPUWrite,NoCPUWrite}) { + zz_recycleRequest; + } + + transition(BBM_M, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBM_M, ProbeAcksComplete, BB_M) { + sM_sendResponseM; + pt_popTriggerQueue; + } + + transition(BB_M, CoreUnblock, M){TagArrayWrite} { + e_ownerIsUnblocker; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(M, {RdBlkS, RdBlk}, 
BBM_O){TagArrayRead} { + t_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + i_popIncomingRequestQueue; + } + + transition(E, {RdBlkS, RdBlk}, BBM_O){TagArrayRead} { + t_allocateTBE; + eto_moveExSharerToOwner; + sc_probeShrCoreData; + s2_probeShrL2Data; + i_popIncomingRequestQueue; + } + + transition(BBM_O, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + transition(BBM_O, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + transition(BBM_O, ProbeAcksComplete, BB_O) { + sS_sendResponseS; + pt_popTriggerQueue; + } + + transition(BB_O, CoreUnblock, O){TagArrayWrite} { + as_addToSharers; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition({BBO_O, BBM_M, BBS_S, BBM_O, BB_M, BB_O, BB_S, BBO_UM, BBS_UM, BBS_M, BBO_M, BB_OO}, {PrbInvData, PrbInv,PrbShrData}) { + ww_recycleProbeRequest; + } + + transition({BBM_O, BBS_S, CP_S, CP_O, CP_SM, CP_OM, BBO_O}, {CPUWrite,NoCPUWrite}) { + zz_recycleRequest; + } + + // stale CtoD raced with external invalidation + transition({I, CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, CtoD) { + i_popIncomingRequestQueue; + } + + // stale CtoD raced with internal RdBlkM + transition({BBM_M, BBS_M, BBO_M, BBB_M, BBS_UM, BBO_UM}, CtoD) { + i_popIncomingRequestQueue; + } + + transition({E, M}, CtoD) { + i_popIncomingRequestQueue; + } + + + // TCC-directory has sent out (And potentially received acks for) probes. + // TCP/SQC replacement (known to be stale subsequent) are popped off. + transition({BBO_UM, BBS_UM}, {CPUWrite,NoCPUWrite}) { + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + transition(S_M, {NoCPUWrite, CPUWrite}) { + zz_recycleRequest; + } + + transition(O_M, {NoCPUWrite, CPUWrite}) { + zz_recycleRequest; + } + + + transition({BBM_M, BBS_M, BBO_M, BBO_UM, BBS_UM}, {VicDirty, VicClean, VicDirtyLast, NoVic}) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition({CP_S, CP_O, CP_OM, CP_SM}, {VicDirty, VicClean, VicDirtyLast, CancelWB, NoVic}) { + yy_recycleTCCRequestQueue; + } + + // However, when TCCdir has sent out PrbSharedData, one cannot ignore. + transition({BBS_S, BBO_O, BBM_O, S_M, O_M, BBB_M, BBB_S, BBB_E}, {VicDirty, VicClean, VicDirtyLast,CancelWB}) { + yy_recycleTCCRequestQueue; + } + + transition({BW_S,BW_E,BW_O, BW_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) { + yy_recycleTCCRequestQueue; + } + + transition({BW_S,BW_E,BW_O, BW_M}, CancelWB) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + + /// recycle if waiting for unblocks. + transition({BB_M,BB_O,BB_S,BB_OO}, {VicDirty, VicClean, VicDirtyLast,NoVic,CancelWB}) { + yy_recycleTCCRequestQueue; + } + + transition({BBS_S, BBO_O}, NoVic) { + rT_removeTCCFromSharers; + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + // stale. Pop message and send dummy ack. 
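+ // The directory no longer tracks the TCC's copy (a fresh fetch for the block is already in flight), so the victim is stale; a null WB ack releases the TCC without any data being written back.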
+ transition({I_S, I_ES, I_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition(M, VicDirtyLast, VM_I){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition(E, VicDirty, VM_I){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition(O, VicDirty, VO_S){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition(O, {VicDirtyLast, VicClean}, VO_I){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition({E, S}, VicClean, VES_I){TagArrayRead} { + tv_allocateTBE; + vc_victim; + pl_popTCCRequestQueue; + } + + transition({O, S}, NoVic){TagArrayRead} { + rT_removeTCCFromSharers; + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition({O,S}, NoCPUWrite){TagArrayRead} { + rC_removeCoreFromSharers; + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + transition({M,E}, NoCPUWrite){TagArrayRead} { + rC_removeCoreFromSharers; + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + // This can only happen if it is race. (TCCdir sent out probes which caused this cancel in the first place.) + transition({VM_I, VES_I, VO_I}, CancelWB) { + pl_popTCCRequestQueue; + } + + transition({VM_I, VES_I, VO_I}, NB_AckWB, I){TagArrayWrite} { + c_clearOwner; + cc_clearSharers; + wb_data; + fw2_forwardWBAck; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + transition(VO_S, NB_AckWB, S){TagArrayWrite} { + c_clearOwner; + wb_data; + fw2_forwardWBAck; + dt_deallocateTBE; + pR_popResponseFromNBQueue; + } + + transition(I_C, NB_AckWB, I){TagArrayWrite} { + c_clearOwner; + cc_clearSharers; + ss_sendStaleNotification; + fw2_forwardWBAck; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + transition(I_W, NB_AckWB, I) { + ss_sendStaleNotification; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + + + // Do not handle replacements, reads of any kind or writebacks from transients; recycle + transition({I_M, I_ES, I_S, MO_I, ES_I, S_M, O_M, VES_I, VO_I, VO_S, VM_I, I_C, I_W}, {RdBlkS,RdBlkM,RdBlk,CtoD}) { + zz_recycleRequest; + } + + transition( VO_S, NoCPUWrite) { + zz_recycleRequest; + } + + transition({BW_M, BW_S, BW_O, BW_E}, {RdBlkS,RdBlkM,RdBlk,CtoD,NoCPUWrite, CPUWrite}) { + zz_recycleRequest; + } + + transition({BBB_M, BBB_S, BBB_E, BB_O, BB_M, BB_S, BB_OO}, { RdBlk, RdBlkS, RdBlkM, CPUWrite, NoCPUWrite}) { + zz_recycleRequest; + } + + transition({BBB_S, BBB_E, BB_O, BB_S, BB_OO}, { CtoD}) { + zz_recycleRequest; + } + + transition({BBS_UM, BBO_UM, BBM_M, BBM_O, BBS_M, BBO_M}, { RdBlk, RdBlkS, RdBlkM}) { + zz_recycleRequest; + } + + transition(BBM_O, CtoD) { + zz_recycleRequest; + } + + transition({BBS_S, BBO_O}, {RdBlkM, CtoD}) { + zz_recycleRequest; + } + + transition({B_I, CP_I, CP_S, CP_O, CP_OM, CP_SM, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {RdBlk, RdBlkS, RdBlkM}) { + zz_recycleRequest; + } + + transition({CP_O, CP_S, CP_OM}, CtoD) { + zz_recycleRequest; + } + + // Ignore replacement related messages after probe got in. + transition({CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {CPUWrite, NoCPUWrite}) { + zz_recycleRequest; + } + + // Ignore replacement related messages after probes processed + transition({I, I_S, I_ES, I_M, I_C, I_W}, {CPUWrite,NoCPUWrite}) { + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + // cannot ignore cancel... 
otherwise TCP/SQC will be stuck in I_C + transition({I, I_S, I_ES, I_M, I_C, I_W, S_M, M, O, E, S}, CPUWriteCancel){TagArrayRead} { + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, {NoVic, VicClean, VicDirty, VicDirtyLast}){ + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + // Handling Probes from NB (General process: (1) propagate up, go to blocking state (2) process acks (3) on last ack downward.) + + // step 1 + transition({M, O, E, S}, PrbInvData, CP_I){TagArrayRead} { + tp_allocateTBE; + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + // step 2a + transition(CP_I, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_I, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_I, ProbeAcksComplete, I){TagArrayWrite} { + pd_sendProbeResponseData; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // step 1 + transition({M, O, E, S}, PrbInv, B_I){TagArrayWrite} { + tp_allocateTBE; + ipc_probeInvCore; + i2_probeInvL2; + pp_popProbeQueue; + } + // step 2 + transition(B_I, CPUPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(B_I, TCCPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(B_I, ProbeAcksComplete, I){TagArrayWrite} { + // send response down to NB + pi_sendProbeResponseInv; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + + // step 1 + transition({M, O}, PrbShrData, CP_O){TagArrayRead} { + tp_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + + transition(E, PrbShrData, CP_O){TagArrayRead} { + tp_allocateTBE; + eto_moveExSharerToOwner; + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + // step 2 + transition(CP_O, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_O, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_O, ProbeAcksComplete, O){TagArrayWrite} { + // send response down to NB + pd_sendProbeResponseData; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + //step 1 + transition(S, PrbShrData, CP_S) { + tp_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + // step 2 + transition(CP_S, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_S, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_S, ProbeAcksComplete, S) { + // send response down to NB + pd_sendProbeResponseData; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + // step 1 + transition(O_M, PrbInvData, CP_IOM) { + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + // step 2a + transition(CP_IOM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_IOM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_IOM, 
ProbeAcksComplete, I_M) { + pdm_sendProbeResponseDataMs; + c_clearOwner; + cc_clearSharers; + cd_clearDirtyBitTBE; + pt_popTriggerQueue; + } + + transition(CP_IOM, ProbeAcksCompleteReissue, I){TagArrayWrite} { + pdm_sendProbeResponseDataMs; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // step 1 + transition(S_M, PrbInvData, CP_ISM) { + dc_probeInvCoreData; + d2_probeInvL2Data; + o_checkForAckCompletion; + pp_popProbeQueue; + } + // step 2a + transition(CP_ISM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_ISM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_ISM, ProbeAcksComplete, I_M) { + pdm_sendProbeResponseDataMs; + c_clearOwner; + cc_clearSharers; + cd_clearDirtyBitTBE; + + //dt_deallocateTBE; + pt_popTriggerQueue; + } + transition(CP_ISM, ProbeAcksCompleteReissue, I){TagArrayWrite} { + pim_sendProbeResponseInvMs; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // step 1 + transition({S_M, O_M}, {PrbInv}, CP_ISM) { + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + // next steps inherited from BS_ISM + + // Simpler cases + + transition({I_C, I_W}, {PrbInvData, PrbInv, PrbShrData}) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + //If the directory is certain that the block is not present, one can send an acknowledgement right away. + // No need for three step process. + transition(I, {PrbInv,PrbShrData,PrbInvData}){TagArrayRead} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({I_M, I_ES, I_S}, {PrbInv, PrbInvData}) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({I_M, I_ES, I_S}, PrbShrData) { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + //step 1 + transition(S_M, PrbShrData, CP_SM) { + sc_probeShrCoreData; + s2_probeShrL2Data; + o_checkForAckCompletion; + pp_popProbeQueue; + } + // step 2 + transition(CP_SM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_SM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_SM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, S_M){DataArrayRead} { + // send response down to NB + pd_sendProbeResponseData; + pt_popTriggerQueue; + } + + //step 1 + transition(O_M, PrbShrData, CP_OM) { + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + // step 2 + transition(CP_OM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_OM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_OM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, O_M) { + // send response down to NB + pd_sendProbeResponseData; + pt_popTriggerQueue; + } + + transition(BRW_I, PrbInvData, I_W) { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({VM_I,VO_I}, PrbInvData, I_C) { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(VES_I, {PrbInvData,PrbInv}, I_C) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({VM_I, VO_I, BRW_I}, PrbInv, I_W) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + 
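+ // A shared probe that hits a block mid-writeback still returns data and sets the Shared flip bit, so the writeback already in flight is marked shared when it reaches the NB.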
transition({VM_I, VO_I, VO_S, VES_I, BRW_I}, PrbShrData) { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } + + transition(VO_S, PrbInvData, CP_OSIW) { + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + + transition(CP_OSIW, TCCPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + transition(CP_OSIW, CPUPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(CP_OSIW, ProbeAcksComplete, I_C) { + pd_sendProbeResponseData; + cd_clearDirtyBitTBE; + pt_popTriggerQueue; + } + + transition({I, S, E, O, M, CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W}, StaleVic) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, StaleVic) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + // Recall Transistions + // transient states still require the directory state + transition({M, O}, Recall, BRWD_I) { + tr_allocateTBE; + vd_victim; + dc_probeInvCoreData; + d2_probeInvL2Data; + } + + transition({E, S}, Recall, BRWD_I) { + tr_allocateTBE; + vc_victim; + dc_probeInvCoreData; + d2_probeInvL2Data; + } + + transition(I, Recall) { + dd_deallocateDir; + } + + transition({BRWD_I, BRD_I}, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition({BRWD_I, BRD_I}, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BRWD_I, NB_AckWB, BRD_I) { + pR_popResponseFromNBQueue; + } + + transition(BRWD_I, ProbeAcksComplete, BRW_I) { + pt_popTriggerQueue; + } + + transition(BRW_I, NB_AckWB, I) { + wb_data; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + transition(BRD_I, ProbeAcksComplete, I) { + wb_data; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // wait for stable state for Recall + transition({BRWD_I,BRD_I,BRW_I,CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W, CP_I}, Recall) { + zz_recycleRequest; // stall and wait would be for the wrong address + ut_updateTag; // try to find an easier recall + } + +} diff --git a/src/mem/protocol/GPU_RfO-TCP.sm b/src/mem/protocol/GPU_RfO-TCP.sm new file mode 100644 index 000000000..6cf9224a6 --- /dev/null +++ b/src/mem/protocol/GPU_RfO-TCP.sm @@ -0,0 +1,1009 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") + : GPUCoalescer* coalescer; + Sequencer* sequencer; + bool use_seq_not_coal; + CacheMemory * L1cache; + int TCC_select_num_bits; + Cycles issue_latency := 40; // time to send data down to TCC + Cycles l2_hit_latency := 18; + + MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock"; + + MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response"; + + MessageBuffer * mandatoryQueue; +{ + state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + E, AccessPermission:Read_Write, desc="Exclusive"; + O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line"; + M, AccessPermission:Read_Write, desc="Modified"; + + I_M, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_ES, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + S_M, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + O_M, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + + ES_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack"; + MO_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for dirty WB ack"; + + MO_PI, AccessPermission:Read_Only, desc="L1 downgrade, waiting for CtoD ack (or ProbeInvalidateData)"; + + I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCC for canceled WB"; + } + + enumeration(Event, desc="TCP Events") { + // Core initiated + Load, desc="Load"; + Store, desc="Store"; + + // TCC initiated + TCC_AckS, desc="TCC Ack to Core Request"; + TCC_AckE, desc="TCC Ack to Core Request"; + TCC_AckM, desc="TCC Ack to Core Request"; + TCC_AckCtoD, desc="TCC Ack to Core Request"; + TCC_AckWB, desc="TCC Ack for clean WB"; + TCC_NackWB, desc="TCC Nack for clean WB"; + + // Mem sys initiated + Repl, desc="Replacing block from cache"; + + // Probe Events + PrbInvData, desc="probe, return O or M data"; + PrbInv, desc="probe, no need for data"; + LocalPrbInv, desc="local probe, no need for 
data"; + PrbShrData, desc="probe downgrade, return O or M data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCP_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCP_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + bool isValid(Addr addr) { + AccessPermission perm := getAccessPermission(addr); + if (perm == AccessPermission:NotPresent || + perm == AccessPermission:Invalid || + perm == AccessPermission:Busy) { + return false; + } else { + return true; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCP_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := 
num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + MachineType getCoherenceType(MachineID myMachID, + MachineID senderMachID) { + if(myMachID == senderMachID) { + return MachineType:TCP; + } else if(machineIDToMachineType(senderMachID) == MachineType:TCP) { + return MachineType:L1Cache_wCC; + } else if(machineIDToMachineType(senderMachID) == MachineType:TCC) { + return MachineType:TCC; + } else { + return MachineType:TCCdir; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromTCP); + out_port(responseNetwork_out, ResponseMsg, responseFromTCP); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // In Ports + + in_port(probeNetwork_in, TDProbeRequestMsg, probeToTCP) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") { + DPRINTF(RubySlicc, "%s\n", in_msg); + DPRINTF(RubySlicc, "machineID: %s\n", machineID); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + if(in_msg.localCtoD) { + trigger(Event:LocalPrbInv, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(responseToTCP_in, ResponseMsg, responseToTCP) { + if (responseToTCP_in.isReady(clockEdge())) { + peek(responseToTCP_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + if (in_msg.State == CoherenceState:Modified) { + if (in_msg.CtoD) { + trigger(Event:TCC_AckCtoD, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:TCC_AckM, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.State == CoherenceState:Shared) { + trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:TCC_AckE, in_msg.addr, cache_entry, tbe); + } + } else if 
(in_msg.Type == CoherenceResponseType:TDSysWBAck) { + trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) { + trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + DPRINTF(RubySlicc, "%s\n", in_msg); + if (in_msg.Type == RubyRequestType:LD) { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + } + + // Actions + + action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(vd_victim, "vd", desc="Victimize M/O Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Dirty := cache_entry.Dirty; + } + } + + action(vc_victim, "vc", desc="Victimize E/S Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + 
out_msg.Shared := false; + } + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToTCP_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + sequencer.readCallback(address, cache_entry.DataBlk, + false, MachineType:TCP); + } else { + coalescer.readCallback(address, MachineType:TCP, cache_entry.DataBlk); + } + } + + action(xl_loadDone, "xl", desc="remote load done") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + coalescer.recordCPReadCallBack(machineID, in_msg.Sender); + sequencer.readCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } else { + MachineType cc_mach_type := getCoherenceType(machineID, + in_msg.Sender); + coalescer.readCallback(address, + cc_mach_type, + cache_entry.DataBlk, + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + } + + action(s_storeDone, "s", desc="local store done") { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + coalescer.recordCPWriteCallBack(machineID, machineID); + sequencer.writeCallback(address, cache_entry.DataBlk, + false, MachineType:TCP); + } else { + coalescer.writeCallback(address, MachineType:TCP, cache_entry.DataBlk); + } + cache_entry.Dirty := true; + } + + action(xs_storeDone, "xs", desc="remote store done") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + coalescer.recordCPWriteCallBack(machineID, in_msg.Sender); + sequencer.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } else { + MachineType cc_mach_type := getCoherenceType(machineID, + in_msg.Sender); + coalescer.writeCallback(address, + cc_mach_type, + cache_entry.DataBlk, + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + cache_entry.Dirty := true; + } + } + + action(w_writeCache, "w", desc="write data to cache") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToTCP_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + 
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToTCP_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(piu_sendProbeResponseInvUntransferredOwnership, "piu", desc="send probe ack inv, no data, retain ownership") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.UntransferredOwner :=true; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + 
out_msg.isValid := isValid(address); + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + APPEND_TRANSITION_COMMENT("Sending ack with dirty "); + APPEND_TRANSITION_COMMENT(out_msg.Dirty); + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + APPEND_TRANSITION_COMMENT("Sending ack with dirty "); + APPEND_TRANSITION_COMMENT(out_msg.Dirty); + DPRINTF(RubySlicc, "Data is %s\n", out_msg.DataBlk); + } + } + + action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(mru_updateMRU, "mru", desc="Touch block for replacement policy") { + L1cache.setMRU(address); + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.wasValid := isValid(address); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // Transitions + + // transitions from base + transition(I, Load, I_ES) {TagArrayRead} { + a_allocate; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, Store, I_M) {TagArrayRead, TagArrayWrite} { + a_allocate; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(S, Store, S_M) {TagArrayRead} { + mru_updateMRU; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(E, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + transition(O, Store, O_M) {TagArrayRead, DataArrayWrite} { + mru_updateMRU; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(M, Store) {TagArrayRead, DataArrayWrite} { + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + 
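The request and probe-response actions above all route messages with mapAddressToRange(address, MachineType:TCCdir, TCC_select_low_bit, TCC_select_num_bits); the target TCC/TCCdir bank is chosen from a field of address bits, and TCC_select_low_bit defaults to RubySystem::getBlockSizeBits(), so consecutive cache lines rotate across banks. The sketch below is an illustrative stand-in for that selection, not the Ruby implementation; selectBank and the 64-byte-line / 4-bank parameters are assumptions.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the bank selection implied by
// mapAddressToRange(addr, ..., TCC_select_low_bit, TCC_select_num_bits):
// take 'numBits' of the address starting at 'lowBit' as the bank index.
uint64_t selectBank(uint64_t addr, unsigned lowBit, unsigned numBits)
{
    if (numBits == 0)
        return 0;                               // only one bank to pick
    const uint64_t mask = (uint64_t{1} << numBits) - 1;
    return (addr >> lowBit) & mask;
}

int main()
{
    const unsigned blockBits = 6;               // assume 64 B lines
    const unsigned bankBits  = 2;               // assume 4 TCC banks
    for (uint64_t line = 0; line < 8; ++line) {
        const uint64_t addr = line << blockBits;
        std::cout << "line addr 0x" << std::hex << addr << std::dec
                  << " -> bank " << selectBank(addr, blockBits, bankBits)
                  << "\n";                      // consecutive lines rotate banks
    }
    return 0;
}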
// simple hit transitions + transition({S, E, O, M}, Load) {TagArrayRead, DataArrayRead} { + l_loadDone; + mru_updateMRU; + p_popMandatoryQueue; + } + + // recycles from transients + transition({I_M, I_ES, ES_I, MO_I, S_M, O_M, MO_PI, I_C}, {Load, Store, Repl}) {} { + zz_recycleMandatoryQueue; + } + + transition({S, E}, Repl, ES_I) {TagArrayRead} { + t_allocateTBE; + vc_victim; + ic_invCache; + } + + transition({O, M}, Repl, MO_I) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + vd_victim; + ic_invCache; + } + + // TD event transitions + transition(I_M, {TCC_AckM, TCC_AckCtoD}, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + w_writeCache; + xs_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, TCC_AckS, S) {TagArrayWrite, DataArrayWrite} { + w_writeCache; + xl_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, TCC_AckE, E) {TagArrayWrite, DataArrayWrite} { + w_writeCache; + xl_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({S_M, O_M}, TCC_AckM, M) {TagArrayWrite, DataArrayWrite} { + xs_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({MO_I, ES_I}, TCC_NackWB, I){TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + transition({MO_I, ES_I}, TCC_AckWB, I) {TagArrayWrite, DataArrayRead} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_AckWB, I) {TagArrayWrite} { + ss_sendStaleNotification; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_NackWB, I) {TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + // Probe transitions + transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(I, PrbInvData) {TagArrayRead, TagArrayWrite} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition({E, S}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(I_C, PrbInvData, I_C) {} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + // Needed for TCC-based protocols. Must hold on to ownership till transfer complete + transition({M, O}, LocalPrbInv, MO_PI){TagArrayRead, TagArrayWrite} { + piu_sendProbeResponseInvUntransferredOwnership; + pp_popProbeQueue; + } + + // If there is a race and we see a probe invalidate, handle normally. 
+ transition(MO_PI, PrbInvData, I){TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(MO_PI, PrbInv, I){TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + // normal exit when ownership is successfully transferred + transition(MO_PI, TCC_AckCtoD, I) {TagArrayWrite} { + ic_invCache; + pr_popResponseQueue; + } + + transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({E, S, I}, LocalPrbInv, I){TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + + transition({M, E, O}, PrbShrData, O) {TagArrayRead, TagArrayWrite, DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(MO_PI, PrbShrData) {DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + + transition(S, PrbShrData, S) {TagArrayRead, DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({I, I_C}, PrbShrData) {TagArrayRead} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({I_M, I_ES}, {PrbInv, PrbInvData}){TagArrayRead} { + pi_sendProbeResponseInv; + ic_invCache; + a_allocate; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M, I_ES}, PrbShrData) {} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(S_M, PrbInvData, I_M) {TagArrayRead} { + pim_sendProbeResponseInvMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition(O_M, PrbInvData, I_M) {TagArrayRead,DataArrayRead} { + pdm_sendProbeResponseDataMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition({S_M, O_M}, {PrbInv}, I_M) {TagArrayRead} { + pim_sendProbeResponseInvMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition(S_M, {LocalPrbInv}, I_M) {TagArrayRead} { + pim_sendProbeResponseInvMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition(O_M, LocalPrbInv, I_M) {TagArrayRead} { + piu_sendProbeResponseInvUntransferredOwnership; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition({S_M, O_M}, PrbShrData) {DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(ES_I, PrbInvData, I_C){ + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) {DataArrayRead} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(ES_I, PrbInv, I_C) { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(ES_I, PrbShrData, ES_I) {DataArrayRead} { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_I, PrbShrData, MO_I) {DataArrayRead} { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } + +} diff --git a/src/mem/protocol/GPU_RfO.slicc b/src/mem/protocol/GPU_RfO.slicc new file mode 100644 index 000000000..7773ce6e0 --- /dev/null +++ b/src/mem/protocol/GPU_RfO.slicc @@ -0,0 +1,11 @@ +protocol "GPU_AMD_Base"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-dir.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_RfO-TCP.sm"; +include "GPU_RfO-SQC.sm"; +include "GPU_RfO-TCC.sm"; +include 
"GPU_RfO-TCCdir.sm"; +include "MOESI_AMD_Base-L3cache.sm"; +include "MOESI_AMD_Base-RegionBuffer.sm"; diff --git a/src/mem/protocol/GPU_VIPER-SQC.sm b/src/mem/protocol/GPU_VIPER-SQC.sm new file mode 100644 index 000000000..8d5b5699a --- /dev/null +++ b/src/mem/protocol/GPU_VIPER-SQC.sm @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Blake Hechtman + */ + +machine(MachineType:SQC, "GPU SQC (L1 I Cache)") + : Sequencer* sequencer; + CacheMemory * L1cache; + int TCC_select_num_bits; + Cycles issue_latency := 80; // time to send data down to TCC + Cycles l2_hit_latency := 18; // for 1MB L2, 20 for 2MB + + MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request"; + + MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response"; + + MessageBuffer * mandatoryQueue; +{ + state_declaration(State, desc="SQC Cache States", default="SQC_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + V, AccessPermission:Read_Only, desc="Valid"; + } + + enumeration(Event, desc="SQC Events") { + // Core initiated + Fetch, desc="Fetch"; + // Mem sys initiated + Repl, desc="Replacing block from cache"; + Data, desc="Received Data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Tick clockEdge(); + + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := 
num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return SQC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return SQC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(SQC_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromSQC); + + // In Ports + + in_port(responseToSQC_in, ResponseMsg, responseToSQC) { + if (responseToSQC_in.isReady(clockEdge())) { + peek(responseToSQC_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) { + trigger(Event:Data, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.addr); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + assert(in_msg.Type == RubyRequestType:IFETCH); + trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + } + } + } + + // Actions + + action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + 
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToSQC_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); + } + + action(w_writeCache, "w", desc="write data to cache") { + peek(responseToSQC_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := false; + } + } + + // Transitions + + // transitions from base + transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { + ic_invCache + } + + transition(I, Data, V) {TagArrayRead, TagArrayWrite, DataArrayRead} { + a_allocate; + w_writeCache + l_loadDone; + pr_popResponseQueue; + } + + transition(I, Fetch) {TagArrayRead, TagArrayWrite} { + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // simple hit transitions + transition(V, Fetch) {TagArrayRead, DataArrayRead} { + l_loadDone; + p_popMandatoryQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER-TCC.sm b/src/mem/protocol/GPU_VIPER-TCC.sm new file mode 100644 index 000000000..f62df9f4f --- /dev/null +++ b/src/mem/protocol/GPU_VIPER-TCC.sm @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Blake Hechtman + */ + +machine(MachineType:TCC, "TCC Cache") + : CacheMemory * L2cache; + bool WB; /*is this cache Writeback?*/ + Cycles l2_request_latency := 50; + Cycles l2_response_latency := 20; + + // From the TCPs or SQCs + MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request"; + // To the Cores. TCC deals only with TCPs/SQCs. + MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response"; + // From the NB + MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response"; + // To the NB + MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * triggerQueue; + +{ + // EVENTS + enumeration(Event, desc="TCC Events") { + // Requests coming from the Cores + RdBlk, desc="RdBlk event"; + WrVicBlk, desc="L1 Write Through"; + WrVicBlkBack, desc="L1 Write Through(dirty cache)"; + Atomic, desc="Atomic Op"; + AtomicDone, desc="AtomicOps Complete"; + AtomicNotDone, desc="AtomicOps not Complete"; + Data, desc="data messgae"; + // Coming from this TCC + L2_Repl, desc="L2 Replacement"; + // Probes + PrbInv, desc="Invalidating probe"; + // Coming from Memory Controller + WBAck, desc="writethrough ack from memory"; + } + + // STATES + state_declaration(State, desc="TCC State", default="TCC_State_I") { + M, AccessPermission:Read_Write, desc="Modified(dirty cache only)"; + W, AccessPermission:Read_Write, desc="Written(dirty cache only)"; + V, AccessPermission:Read_Only, desc="Valid"; + I, AccessPermission:Invalid, desc="Invalid"; + IV, AccessPermission:Busy, desc="Waiting for Data"; + WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack"; + A, AccessPermission:Busy, desc="Invalid waiting on atomici Data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + // STRUCTURES + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff from memory?)"; + DataBlock DataBlk, desc="Data for the block"; + WriteMask writeMask, desc="Dirty byte mask"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + bool Shared, desc="Victim hit by shared probe"; + MachineID From, desc="Waiting for writeback from..."; + NetDest Destination, desc="Data destination"; + int numAtomics, desc="number remaining atomics"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + Tick clockEdge(); + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(addr)); + } + + 
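As in the TCP and SQC machines earlier in this patch, the TCC's getState()/setState() give the TBE priority over the cache entry: a block's transient state shadows its stable tag state while a request is in flight, and an absent block reads as Invalid. A minimal stand-alone sketch of that lookup order follows; Machine, CacheEntry and TBE here are illustrative stand-ins, not the gem5 classes.

#include <cstdint>
#include <iostream>
#include <unordered_map>

// Illustrative stand-ins only; they mirror the getState()/setState() order.
enum class State { I, V, M, W, IV, WI };        // subset of the TCC states

struct CacheEntry { State state = State::I; };  // stable (tag array) state
struct TBE        { State state = State::I; };  // transient, per in-flight req

struct Machine {
    std::unordered_map<uint64_t, CacheEntry> cache;  // stand-in for L2cache
    std::unordered_map<uint64_t, TBE> tbes;          // stand-in for TBEs

    State getState(uint64_t addr) const {
        if (auto t = tbes.find(addr); t != tbes.end())
            return t->second.state;             // a TBE shadows the tag state
        if (auto c = cache.find(addr); c != cache.end())
            return c->second.state;             // otherwise the stable state
        return State::I;                        // not present => Invalid
    }

    void setState(uint64_t addr, State s) {
        if (auto t = tbes.find(addr); t != tbes.end())
            t->second.state = s;                // update both views, exactly
        if (auto c = cache.find(addr); c != cache.end())
            c->second.state = s;                // as setState() above does
    }
};

int main() {
    Machine m;
    m.cache[0x40].state = State::V;             // block is Valid in the tags
    m.tbes[0x40].state  = State::IV;            // but a miss is in flight
    std::cout << (m.getState(0x40) == State::IV) << "\n";  // 1: TBE wins
    return 0;
}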
DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCC_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + + // ** OUT_PORTS ** + + // Three classes of ports + // Class 1: downward facing network links to NB + out_port(requestToNB_out, CPURequestMsg, requestToNB); + out_port(responseToNB_out, ResponseMsg, responseToNB); + out_port(unblockToNB_out, UnblockMsg, unblockToNB); + + // Class 2: upward facing ports to GPU cores + out_port(responseToCore_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + // + // request queue going to NB + // + + +// ** IN_PORTS ** + in_port(triggerQueue_in, TiggerMsg, triggerQueue) { + if (triggerQueue_in.isReady(clockEdge())) { + 
peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (tbe.numAtomics == 0) { + trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + + in_port(responseFromNB_in, ResponseMsg, responseFromNB) { + if (responseFromNB_in.isReady(clockEdge())) { + peek(responseFromNB_in, ResponseMsg, block_on="addr") { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:Data, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Finally handling incoming requests (from TCP) and probes (from NB). + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + DPRINTF(RubySlicc, "%s\n", in_msg); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } + } + + in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) { + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + if(WB) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Unexpected Response Message to Core"); + } + } + } + } + // BEGIN ACTIONS + + action(i_invL2, "i", desc="invalidate TCC cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(sd_sendData, "sd", desc="send Shared response") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + action(sdr_sendDataResponse, "sdr", desc="send Shared response") { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination := tbe.Destination; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := 
MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + enqueue(unblockToNB_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + + action(rd_requestData, "r", desc="Miss in L2, pass on") { + if(tbe.Destination.count()==1){ + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(swb_sendWBAck, "swb", desc="send WB Ack") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.DataBlk := in_msg.DataBlk; + } + } + } + + action(a_allocateBlock, "a", desc="allocate TCC block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + cache_entry.writeMask.clear(); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + if (is_invalid(tbe)) { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.Destination.clear(); + tbe.numAtomics := 0; + } + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){ + tbe.Destination.add(in_msg.Requestor); + } + } + } + } + + action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { + tbe.Destination.clear(); + TBEs.deallocate(address); + unset_tbe(); + } + + action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") { + peek(responseFromNB_in, ResponseMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); + cache_entry.writeMask.orMask(in_msg.writeMask); + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wt_writeThrough, "wt", desc="write back data") { + 
peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(wb_writeBack, "wb", desc="write back data") { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.orMask(cache_entry.writeMask); + } + } + + action(at_atomicThrough, "at", desc="write back data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:Atomic; + out_msg.Dirty := true; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L2cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + coreRequestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseFromNB_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(z_stall, "z", desc="stall") { + // built-in + } + + + action(ina_incrementNumAtomics, "ina", desc="inc num atomics") { + tbe.numAtomics := tbe.numAtomics + 1; + } + + + action(dna_decrementNumAtomics, "dna", desc="inc num atomics") { + tbe.numAtomics := tbe.numAtomics - 1; + if (tbe.numAtomics==0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AtomicDone; + } + } + } + + action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") { + triggerQueue_in.dequeue(clockEdge()); + } + + // END ACTIONS + + // BEGIN TRANSITIONS + // transitions from base + // Assumptions for ArrayRead/Write + // TBE checked before tags + // Data Read/Write requires Tag Read + + // Stalling transitions do NOT check the tag array...and if they do, + // they can cause a resource stall deadlock! 
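The numAtomics bookkeeping above (ina_incrementNumAtomics / dna_decrementNumAtomics) is what turns a stream of atomic responses into the single AtomicDone trigger consumed by the trigger queue: the TBE counts outstanding atomics for the block and only enqueues the trigger once the count drains to zero. A minimal standalone C++ sketch of that pattern follows; AtomicTracker and its callback are illustrative names for this note, not gem5 classes.

// Standalone sketch (not gem5 code) of the numAtomics bookkeeping used by
// ina_incrementNumAtomics / dna_decrementNumAtomics: the TCC counts
// outstanding atomics per block and raises an "AtomicDone" trigger only
// when the last response drains.
#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>

class AtomicTracker {
  public:
    explicit AtomicTracker(std::function<void(uint64_t)> onDone)
        : numAtomics(0), onDone(std::move(onDone)) {}

    // Mirrors ina_incrementNumAtomics: called when an Atomic request is
    // forwarded toward the directory.
    void issue() { ++numAtomics; }

    // Mirrors dna_decrementNumAtomics: called when atomic data returns.
    // Fires the "AtomicDone" notification only when the count reaches zero.
    void complete(uint64_t addr) {
        assert(numAtomics > 0);
        if (--numAtomics == 0)
            onDone(addr);   // stands in for the TriggerMsg enqueue
    }

  private:
    int numAtomics;
    std::function<void(uint64_t)> onDone;
};

int main() {
    AtomicTracker tbe([](uint64_t addr) {
        std::cout << "AtomicDone trigger for 0x" << std::hex << addr << "\n";
    });
    tbe.issue();           // first atomic to the block
    tbe.issue();           // second atomic, same block, same TBE
    tbe.complete(0x1000);  // no trigger yet: one atomic still outstanding
    tbe.complete(0x1000);  // count hits zero -> AtomicDone fires
    return 0;
}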
+ + transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} { + z_stall; + } + transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) { //TagArrayRead} { + z_stall; + } + transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} { + z_stall; + } + transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} { + sd_sendData; + ut_updateTag; + p_popRequestQueue; + } + transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + transition(I, RdBlk, IV) {TagArrayRead} { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition(IV, RdBlk) { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition({V, I},Atomic, A) {TagArrayRead} { + i_invL2; + t_allocateTBE; + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition(A, Atomic) { + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition({M, W}, Atomic, WI) {TagArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + transition(I, WrVicBlk) {TagArrayRead} { + wt_writeThrough; + p_popRequestQueue; + } + + transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} { + ut_updateTag; + wdb_writeDirtyBytes; + wt_writeThrough; + p_popRequestQueue; + } + + transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + i_invL2; + } + + transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} { + i_invL2; + } + + transition({A, IV, WI}, L2_Repl) { + i_invL2; + } + + transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(W, PrbInv) {TagArrayRead} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({A, IV, WI}, PrbInv) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + wcb_writeCacheBlock; + sdr_sendDataResponse; + pr_popResponseQueue; + dt_deallocateTBE; + } + + transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ar_sendAtomicResponse; + dna_decrementNumAtomics; + pr_popResponseQueue; + } + + transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} { + dt_deallocateTBE; + ptr_popTriggerQueue; + } + + transition(A, AtomicNotDone) {TagArrayRead} { + ptr_popTriggerQueue; + } + + //M,W should not see WBAck as the cache is in WB mode + //WBAcks do not need to check tags + transition({I, V, IV, A}, WBAck) { + w_sendResponseWBAck; + pr_popResponseQueue; + } + + transition(WI, WBAck,I) { + dt_deallocateTBE; + pr_popResponseQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER-TCP.sm b/src/mem/protocol/GPU_VIPER-TCP.sm new file mode 100644 index 000000000..d81196b17 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER-TCP.sm @@ -0,0 +1,747 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Blake Hechtman + */ + +machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") + : VIPERCoalescer* coalescer; + Sequencer* sequencer; + bool use_seq_not_coal; + CacheMemory * L1cache; + bool WB; /*is this cache Writeback?*/ + bool disableL1; /* bypass L1 cache? 
*/ + int TCC_select_num_bits; + Cycles issue_latency := 40; // time to send data down to TCC + Cycles l2_hit_latency := 18; + + MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock"; + + MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response"; + MessageBuffer * mandatoryQueue; + +{ + state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + V, AccessPermission:Read_Only, desc="Valid"; + W, AccessPermission:Read_Write, desc="Written"; + M, AccessPermission:Read_Write, desc="Written and Valid"; + L, AccessPermission:Read_Write, desc="Local access is modifable"; + A, AccessPermission:Invalid, desc="Waiting on Atomic"; + } + + enumeration(Event, desc="TCP Events") { + // Core initiated + Load, desc="Load"; + Store, desc="Store to L1 (L1 is dirty)"; + StoreThrough, desc="Store directly to L2(L1 is clean)"; + StoreLocal, desc="Store to L1 but L1 is clean"; + Atomic, desc="Atomic"; + Flush, desc="Flush if dirty(wbL1 for Store Release)"; + Evict, desc="Evict if clean(invL1 for Load Acquire)"; + // Mem sys initiated + Repl, desc="Replacing block from cache"; + + // TCC initiated + TCC_Ack, desc="TCC Ack to Core Request"; + TCC_AckWB, desc="TCC Ack for WB"; + // Disable L1 cache + Bypass, desc="Bypass the entire L1 cache"; + } + + enumeration(RequestType, + desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + TagArrayFlash, desc="Flash clear the data array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + WriteMask writeMask, desc="written bytes masks"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs,desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int WTcnt, default="0"; + int Fcnt, default="0"; + bool inFlush, default="false"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE 
tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCP_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCP_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + bool isValid(Addr addr) { + AccessPermission perm := getAccessPermission(addr); + if (perm == AccessPermission:NotPresent || + perm == AccessPermission:Invalid || + perm == AccessPermission:Busy) { + return false; + } else { + return true; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCP_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayFlash) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayFlash) { + // FIXME should check once per cache, rather than once per cacheline + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromTCP); + + // In Ports + + 
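The lookup helpers above (getState, getDataBlock, getAccessPermission) all follow the same precedence: a transient TBE, if one is allocated for the address, shadows the stable cache-entry state, and a miss in both falls back to Invalid / NotPresent. Before the in-ports that rely on this, here is a minimal C++ sketch of that precedence; StateTable, SketchTBE, and SketchEntry are illustrative stand-ins, not Ruby classes.

// Minimal sketch (plain C++, not gem5's Ruby classes) of the lookup
// precedence used by getState/getAccessPermission: TBE first, then the
// cache entry, then a default of Invalid.
#include <cstdint>
#include <iostream>
#include <unordered_map>

enum class State { I, V, W, M, A };  // illustrative subset of the TCP states

struct SketchTBE { State transientState; };
struct SketchEntry { State cacheState; };

class StateTable {
  public:
    State getState(uint64_t addr) const {
        if (auto it = tbes.find(addr); it != tbes.end())
            return it->second.transientState;    // TBE wins if present
        if (auto it = entries.find(addr); it != entries.end())
            return it->second.cacheState;        // otherwise the tag array
        return State::I;                         // otherwise Invalid
    }

    std::unordered_map<uint64_t, SketchTBE> tbes;
    std::unordered_map<uint64_t, SketchEntry> entries;
};

int main() {
    StateTable t;
    t.entries[0x40] = {State::V};
    std::cout << int(t.getState(0x40)) << "\n";  // V: stable state
    t.tbes[0x40] = {State::A};
    std::cout << int(t.getState(0x40)) << "\n";  // A: TBE shadows the entry
    std::cout << int(t.getState(0x80)) << "\n";  // I: not present anywhere
    return 0;
}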
in_port(responseToTCP_in, ResponseMsg, responseToTCP) { + if (responseToTCP_in.isReady(clockEdge())) { + peek(responseToTCP_in, ResponseMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + // disable L1 cache + if (disableL1) { + trigger(Event:Bypass, in_msg.addr, cache_entry, tbe); + } else { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) { + trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.addr); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck || + in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + DPRINTF(RubySlicc, "%s\n", in_msg); + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:ATOMIC) { + trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:ST) { + if(disableL1) { + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); + } else { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + if (in_msg.segment == HSASegment:SPILL) { + trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe); + } else if (WB) { + trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); + } + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } // end if (disableL1) + } else if (in_msg.Type == RubyRequestType:FLUSH) { + trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:REPLACEMENT){ + trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe); + } else { + error("Unexpected Request Message from VIC"); + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + if (WB) { + trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); + } + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + } + + // Actions + + action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + cache_entry.writeMask.clear(); + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(rb_bypassDone, "rb", desc="bypass L1 of read access") { + 
peek(responseToTCP_in, ResponseMsg) { + DataBlock tmp:= in_msg.DataBlk; + if (use_seq_not_coal) { + sequencer.readCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, tmp); + } + if(is_valid(cache_entry)) { + unset_cache_entry(); + } + } + } + + action(wab_bypassDone, "wab", desc="bypass L1 of write access") { + peek(responseToTCP_in, ResponseMsg) { + DataBlock tmp := in_msg.DataBlk; + if (use_seq_not_coal) { + sequencer.writeCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, tmp); + } + } + } + + action(norl_issueRdBlkOrloadDone, "norl", desc="local load done") { + peek(mandatoryQueue_in, RubyRequest){ + if (cache_entry.writeMask.cmpMask(in_msg.writeMask)) { + if (use_seq_not_coal) { + sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + } else { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + } + } + + action(wt_writeThrough, "wt", desc="Flush dirty data") { + WTcnt := WTcnt + 1; + APPEND_TRANSITION_COMMENT("write++ = "); + APPEND_TRANSITION_COMMENT(WTcnt); + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.clear(); + out_msg.writeMask.orMask(cache_entry.writeMask); + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + } + } + + action(at_atomicThrough, "at", desc="send Atomic") { + peek(mandatoryQueue_in, RubyRequest) { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.writeMask.clear(); + out_msg.writeMask.orMask(in_msg.writeMask); + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:Atomic; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + } + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + cache_entry.writeMask.clear(); + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(sf_setFlush, "sf", desc="set flush") { + inFlush := true; + APPEND_TRANSITION_COMMENT(" inFlush is true"); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToTCP_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", 
desc="local load done") { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + } + + action(s_storeDone, "s", desc="local store done") { + assert(is_valid(cache_entry)); + + if (use_seq_not_coal) { + sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + cache_entry.Dirty := true; + } + + action(inv_invDone, "inv", desc="local inv done") { + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n"); + assert(false); + } else { + coalescer.invCallback(address); + } + } + + action(wb_wbDone, "wb", desc="local wb done") { + if (inFlush == true) { + Fcnt := Fcnt + 1; + if (Fcnt > WTcnt) { + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n"); + assert(false); + } else { + coalescer.wbCallback(address); + } + Fcnt := Fcnt - 1; + } + if (WTcnt == 0 && Fcnt == 0) { + inFlush := false; + APPEND_TRANSITION_COMMENT(" inFlush is false"); + } + } + } + + action(wd_wtDone, "wd", desc="writethrough done") { + WTcnt := WTcnt - 1; + if (inFlush == true) { + Fcnt := Fcnt -1; + } + assert(WTcnt >= 0); + APPEND_TRANSITION_COMMENT("write-- = "); + APPEND_TRANSITION_COMMENT(WTcnt); + } + + action(dw_dirtyWrite, "dw", desc="update write mask"){ + peek(mandatoryQueue_in, RubyRequest) { + cache_entry.DataBlk.copyPartial(in_msg.WTData,in_msg.writeMask); + cache_entry.writeMask.orMask(in_msg.writeMask); + } + } + action(w_writeCache, "w", desc="write data to cache") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(cache_entry.DataBlk,cache_entry.writeMask); + cache_entry.DataBlk := tmp; + } + } + + action(mru_updateMRU, "mru", desc="Touch block for replacement policy") { + L1cache.setMRU(address); + } + +// action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { +// mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); +// } + + action(z_stall, "z", desc="stall; built-in") { + // built-int action + } + + // Transitions + // ArrayRead/Write assumptions: + // All requests read Tag Array + // TBE allocation write the TagArray to I + // TBE only checked on misses + // Stores will also write dirty bits in the tag + // WriteThroughs still need to use cache entry as staging buffer for wavefront + + // Stalling transitions do NOT check the tag array...and if they do, + // they can cause a resource stall deadlock! 
+ + transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} { + z_stall; + } + + transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} { + l_loadDone; + mru_updateMRU; + p_popMandatoryQueue; + } + + transition(I, Load) {TagArrayRead} { + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + mru_updateMRU; + at_atomicThrough; + p_popMandatoryQueue; + } + + transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} { + wt_writeThrough; + t_allocateTBE; + at_atomicThrough; + ic_invCache; + } + + transition(W, Load, I) {TagArrayRead, DataArrayRead} { + wt_writeThrough; + norl_issueRdBlkOrloadDone; + p_popMandatoryQueue; + } + + transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocate; + dw_dirtyWrite; + s_storeDone; + p_popMandatoryQueue; + } + + transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocate; + dw_dirtyWrite; + s_storeDone; + p_popMandatoryQueue; + } + + transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + //M,W should not see storeThrough + transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocate; + dw_dirtyWrite; + s_storeDone; + wt_writeThrough; + ic_invCache; + p_popMandatoryQueue; + } + + transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + s_storeDone; + wt_writeThrough; + ic_invCache; + p_popMandatoryQueue; + } + + transition(I, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { + a_allocate; + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + transition(I, Bypass, I) { + rb_bypassDone; + pr_popResponseQueue; + } + + transition(A, Bypass, I){ + d_deallocateTBE; + wab_bypassDone; + pr_popResponseQueue; + } + + transition(A, TCC_Ack, I) {TagArrayRead, DataArrayRead, DataArrayWrite} { + d_deallocateTBE; + a_allocate; + w_writeCache; + s_storeDone; + pr_popResponseQueue; + ic_invCache; + } + + transition(V, TCC_Ack, V) {TagArrayRead, DataArrayRead, DataArrayWrite} { + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { + ic_invCache; + } + + transition({A}, Repl) {TagArrayRead, TagArrayWrite} { + ic_invCache; + } + + transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + wt_writeThrough; + ic_invCache; + } + + transition(L, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + wt_writeThrough; + ic_invCache; + } + + transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + sf_setFlush; + wt_writeThrough; + ic_invCache; + p_popMandatoryQueue; + } + + transition({V, I, A, L},Flush) {TagArrayFlash} { + sf_setFlush; + wb_wbDone; + p_popMandatoryQueue; + } + + transition({I, V}, Evict, I) {TagArrayFlash} { + inv_invDone; + p_popMandatoryQueue; + ic_invCache; + } + + transition({W, M}, Evict, W) {TagArrayFlash} { + inv_invDone; + p_popMandatoryQueue; + 
} + + transition({A, L}, Evict) {TagArrayFlash} { + inv_invDone; + p_popMandatoryQueue; + } + + // TCC_AckWB only snoops TBE + transition({V, I, A, M, W, L}, TCC_AckWB) { + wd_wtDone; + wb_wbDone; + pr_popResponseQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER.slicc b/src/mem/protocol/GPU_VIPER.slicc new file mode 100644 index 000000000..45f7f3477 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER.slicc @@ -0,0 +1,9 @@ +protocol "GPU_VIPER"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-dir.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_VIPER-TCP.sm"; +include "GPU_VIPER-SQC.sm"; +include "GPU_VIPER-TCC.sm"; +include "MOESI_AMD_Base-L3cache.sm"; diff --git a/src/mem/protocol/GPU_VIPER_Baseline.slicc b/src/mem/protocol/GPU_VIPER_Baseline.slicc new file mode 100644 index 000000000..49bdce38c --- /dev/null +++ b/src/mem/protocol/GPU_VIPER_Baseline.slicc @@ -0,0 +1,9 @@ +protocol "GPU_VIPER"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-probeFilter.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_VIPER-TCP.sm"; +include "GPU_VIPER-SQC.sm"; +include "GPU_VIPER-TCC.sm"; +include "MOESI_AMD_Base-L3cache.sm"; diff --git a/src/mem/protocol/GPU_VIPER_Region-TCC.sm b/src/mem/protocol/GPU_VIPER_Region-TCC.sm new file mode 100644 index 000000000..c3aef15a3 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER_Region-TCC.sm @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor, Blake Hechtman + */ + +/* + * This file is inherited from GPU_VIPER-TCC.sm and retains its structure. 
+ * There are very few modifications in this file from the original VIPER TCC
+ */
+
+machine(MachineType:TCC, "TCC Cache")
+ : CacheMemory * L2cache;
+ bool WB; /*is this cache Writeback?*/
+ int regionBufferNum;
+ Cycles l2_request_latency := 50;
+ Cycles l2_response_latency := 20;
+
+ // From the TCPs or SQCs
+ MessageBuffer * requestFromTCP, network="From", virtual_network="1", ordered="true", vnet_type="request";
+ // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC.
+ MessageBuffer * responseToCore, network="To", virtual_network="3", ordered="true", vnet_type="response";
+ // From the NB
+ MessageBuffer * probeFromNB, network="From", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseFromNB, network="From", virtual_network="2", ordered="false", vnet_type="response";
+ // To the NB
+ MessageBuffer * requestToNB, network="To", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseToNB, network="To", virtual_network="2", ordered="false", vnet_type="response";
+ MessageBuffer * unblockToNB, network="To", virtual_network="4", ordered="false", vnet_type="unblock";
+
+ MessageBuffer * triggerQueue, ordered="true", random="false";
+{
+ // EVENTS
+ enumeration(Event, desc="TCC Events") {
+ // Requests coming from the Cores
+ RdBlk, desc="RdBlk event";
+ WrVicBlk, desc="L1 Write Through";
+ WrVicBlkBack, desc="L1 Write Back(dirty cache)";
+ Atomic, desc="Atomic Op";
+ AtomicDone, desc="AtomicOps Complete";
+ AtomicNotDone, desc="AtomicOps not Complete";
+ Data, desc="data message";
+ // Coming from this TCC
+ L2_Repl, desc="L2 Replacement";
+ // Probes
+ PrbInv, desc="Invalidating probe";
+ // Coming from Memory Controller
+ WBAck, desc="writethrough ack from memory";
+ }
+
+ // STATES
+ state_declaration(State, desc="TCC State", default="TCC_State_I") {
+ M, AccessPermission:Read_Write, desc="Modified(dirty cache only)";
+ W, AccessPermission:Read_Write, desc="Written(dirty cache only)";
+ V, AccessPermission:Read_Only, desc="Valid";
+ I, AccessPermission:Invalid, desc="Invalid";
+ IV, AccessPermission:Busy, desc="Waiting for Data";
+ WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack";
+ A, AccessPermission:Busy, desc="Invalid waiting on atomic Data";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+ TagArrayRead, desc="Read the tag array";
+ TagArrayWrite, desc="Write the tag array";
+ }
+
+
+ // STRUCTURES
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff from memory?)";
+ DataBlock DataBlk, desc="Data for the block";
+ WriteMask writeMask, desc="Dirty byte mask";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ bool Shared, desc="Victim hit by shared probe";
+ MachineID From, desc="Waiting for writeback from...";
+ NetDest Destination, desc="Data destination";
+ int numAtomics, desc="number remaining atomics";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE
b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + MachineID getPeer(MachineID mach) { + return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum)); + } + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCC_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead,addr); + } else if (request_type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite,addr); + } else if (request_type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead,addr); + } else if (request_type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite,addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + + // ** OUT_PORTS ** + + // Three classes of ports + // Class 1: downward facing network links to NB + out_port(requestToNB_out, CPURequestMsg, requestToNB); + out_port(responseToNB_out, ResponseMsg, 
responseToNB); + out_port(unblockToNB_out, UnblockMsg, unblockToNB); + + // Class 2: upward facing ports to GPU cores + out_port(responseToCore_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + // + // request queue going to NB + // + + +// ** IN_PORTS ** + in_port(triggerQueue_in, TiggerMsg, triggerQueue) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (tbe.numAtomics == 0) { + trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + + in_port(responseFromNB_in, ResponseMsg, responseFromNB) { + if (responseFromNB_in.isReady(clockEdge())) { + peek(responseFromNB_in, ResponseMsg, block_on="addr") { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:Data, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Finally handling incoming requests (from TCP) and probes (from NB). + + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + DPRINTF(RubySlicc, "%s\n", in_msg); + DPRINTF(RubySlicc, "machineID: %s\n", machineID); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } + } + + + in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) { + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + if(WB) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Unexpected Response Message to Core"); + } + } + } + } + // BEGIN ACTIONS + + action(i_invL2, "i", desc="invalidate TCC cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + // Data available at TCC. 
Send the DATA to TCP + action(sd_sendData, "sd", desc="send Shared response") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + // Data was not available at TCC. So, TCC forwarded the request to + // directory and directory responded back with data. Now, forward the + // DATA to TCP and send the unblock ack back to directory. + action(sdr_sendDataResponse, "sdr", desc="send Shared response") { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination := tbe.Destination; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + enqueue(unblockToNB_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + + action(rd_requestData, "r", desc="Miss in L2, pass on") { + if(tbe.Destination.count()==1){ + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(swb_sendWBAck, "swb", desc="send WB Ack") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.DataBlk := in_msg.DataBlk; + } + } + } + action(sd2rb_sendDone2RegionBuffer, "sd2rb", desc="Request finished, send done ack") { + enqueue(unblockToNB_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(getPeer(machineID)); + out_msg.DoneAck := true; + out_msg.MessageSize := 
MessageSizeType:Unblock_Control; + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := false; + } + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(a_allocateBlock, "a", desc="allocate TCC block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + cache_entry.writeMask.clear(); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + if (is_invalid(tbe)) { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.Destination.clear(); + tbe.numAtomics := 0; + } + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){ + tbe.Destination.add(in_msg.Requestor); + } + } + } + } + + action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { + tbe.Destination.clear(); + TBEs.deallocate(address); + unset_tbe(); + } + + action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") { + peek(responseFromNB_in, ResponseMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); + cache_entry.writeMask.orMask(in_msg.writeMask); + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wt_writeThrough, "wt", desc="write through data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(wb_writeBack, "wb", desc="write back data") { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.orMask(cache_entry.writeMask); + } + } + + action(at_atomicThrough, "at", desc="write back data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:Atomic; + out_msg.Dirty := true; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + 
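wdb_writeDirtyBytes above merges a write-through into the cached block one byte at a time: copyPartial copies only the bytes named by the incoming write mask, and orMask accumulates those bytes into the block's own dirty-byte mask so a later write-back or write-through pushes exactly the bytes that were written. A standalone C++ sketch of that merge follows, assuming a 64-byte block; the types here are illustrative, not Ruby's DataBlock/WriteMask API.

// Standalone sketch (not Ruby's DataBlock/WriteMask) of the partial-write
// merge performed by wdb_writeDirtyBytes: copy only the masked bytes and
// OR the incoming mask into the block's dirty-byte mask.
#include <array>
#include <bitset>
#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr std::size_t kBlockBytes = 64;   // assumed Ruby block size

struct SketchBlock {
    std::array<uint8_t, kBlockBytes> data{};
    std::bitset<kBlockBytes> dirtyMask;   // corresponds to Entry.writeMask

    // copyPartial + orMask, rolled into one helper for the sketch
    void mergePartialWrite(const std::array<uint8_t, kBlockBytes>& wtData,
                           const std::bitset<kBlockBytes>& wtMask) {
        for (std::size_t i = 0; i < kBlockBytes; ++i)
            if (wtMask.test(i))
                data[i] = wtData[i];
        dirtyMask |= wtMask;
    }
};

int main() {
    SketchBlock block;
    std::array<uint8_t, kBlockBytes> incoming{};
    std::bitset<kBlockBytes> mask;
    incoming[3] = 0xAB;
    mask.set(3);                       // the store touched byte 3 only
    block.mergePartialWrite(incoming, mask);
    std::cout << std::hex << int(block.data[3]) << " "
              << std::dec << block.dirtyMask.count() << " dirty byte(s)\n";
    return 0;
}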
action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L2cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + coreRequestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseFromNB_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + action(zz_recycleRequestQueue, "z", desc="stall"){ + coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + + action(ina_incrementNumAtomics, "ina", desc="inc num atomics") { + tbe.numAtomics := tbe.numAtomics + 1; + } + + + action(dna_decrementNumAtomics, "dna", desc="dec num atomics") { + tbe.numAtomics := tbe.numAtomics - 1; + if (tbe.numAtomics==0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AtomicDone; + } + } + } + + action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") { + triggerQueue_in.dequeue(clockEdge()); + } + + // END ACTIONS + + // BEGIN TRANSITIONS + // transitions from base + // Assumptions for ArrayRead/Write + // TBE checked before tags + // Data Read/Write requires Tag Read + + transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} { + zz_recycleRequestQueue; + } + transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) {TagArrayRead} { + zz_recycleRequestQueue; + } + transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} { + zz_recycleRequestQueue; + } + transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} { + sd_sendData; + ut_updateTag; + p_popRequestQueue; + } + transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + transition(I, RdBlk, IV) {TagArrayRead} { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition(IV, RdBlk) { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition({V, I},Atomic, A) {TagArrayRead} { + i_invL2; + t_allocateTBE; + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition(A, Atomic) { + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition({M, W}, Atomic, WI) {TagArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + // Cahceblock stays in I state which implies + // this TCC is a write-no-allocate cache + transition(I, WrVicBlk) {TagArrayRead} { + wt_writeThrough; + p_popRequestQueue; + } + + transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} { + ut_updateTag; + wdb_writeDirtyBytes; + wt_writeThrough; + p_popRequestQueue; + } + + transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + i_invL2; + } + + transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} { + i_invL2; + } + + transition({A, IV, WI}, L2_Repl) { + i_invL2; + } + + transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} { + 
pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(W, PrbInv) {TagArrayRead} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({A, IV, WI}, PrbInv) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + wcb_writeCacheBlock; + sdr_sendDataResponse; + sd2rb_sendDone2RegionBuffer; + pr_popResponseQueue; + dt_deallocateTBE; + } + + transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ar_sendAtomicResponse; + sd2rb_sendDone2RegionBuffer; + dna_decrementNumAtomics; + pr_popResponseQueue; + } + + transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} { + dt_deallocateTBE; + ptr_popTriggerQueue; + } + + transition(A, AtomicNotDone) {TagArrayRead} { + ptr_popTriggerQueue; + } + + //M,W should not see WBAck as the cache is in WB mode + //WBAcks do not need to check tags + transition({I, V, IV, A}, WBAck) { + w_sendResponseWBAck; + sd2rb_sendDone2RegionBuffer; + pr_popResponseQueue; + } + + transition(WI, WBAck,I) { + sd2rb_sendDone2RegionBuffer; + dt_deallocateTBE; + pr_popResponseQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER_Region.slicc b/src/mem/protocol/GPU_VIPER_Region.slicc new file mode 100644 index 000000000..cbfef9de3 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER_Region.slicc @@ -0,0 +1,11 @@ +protocol "GPU_VIPER_Region"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-Region-CorePair.sm"; +include "MOESI_AMD_Base-L3cache.sm"; +include "MOESI_AMD_Base-Region-dir.sm"; +include "GPU_VIPER_Region-TCC.sm"; +include "GPU_VIPER-TCP.sm"; +include "GPU_VIPER-SQC.sm"; +include "MOESI_AMD_Base-RegionDir.sm"; +include "MOESI_AMD_Base-RegionBuffer.sm"; diff --git a/src/mem/protocol/MOESI_AMD_Base-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm new file mode 100644 index 000000000..76fe77230 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm @@ -0,0 +1,2904 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:CorePair, "CP-like Core Coherence") + : Sequencer * sequencer; + Sequencer * sequencer1; + CacheMemory * L1Icache; + CacheMemory * L1D0cache; + CacheMemory * L1D1cache; + CacheMemory * L2cache; // func mem logic looks in this CacheMemory + bool send_evictions := "False"; + Cycles issue_latency := 5; // time to send data down to NB + Cycles l2_hit_latency := 18; + + // BEGIN Core Buffers + + // To the Network + MessageBuffer * requestFromCore, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCore, network="To", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="4", vnet_type="unblock"; + + // From the Network + MessageBuffer * probeToCore, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToCore, network="From", virtual_network="2", vnet_type="response"; + + MessageBuffer * mandatoryQueue; + + MessageBuffer * triggerQueue, ordered="true"; + + // END Core Buffers + +{ + // BEGIN STATES + state_declaration(State, desc="Cache states", default="CorePair_State_I") { + + // Base States + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership"; + E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership"; + Es, AccessPermission:Read_Write, desc="Exclusive in core"; + O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line"; + Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line"; + M0, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + M1, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + + // Transient States + I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well"; + I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well"; + I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well"; + I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well"; + I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters"; + + IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive"; + IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but 
expecting a L2_to_L1D1 trigger, just drop when receive"; + IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills"; + IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received"; + F_S1, AccessPermission:Busy, desc="same, but going to S1 when trigger received"; + + ES_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for clean writeback ack"; + MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack"; + MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS"; + MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS"; + S_F0, AccessPermission:Read_Only, desc="Shared, filling L1"; + S_F1, AccessPermission:Read_Only, desc="Shared, filling L1"; + S_F, AccessPermission:Read_Only, desc="Shared, filling L1"; + O_F0, AccessPermission:Read_Only, desc="Owned, filling L1"; + O_F1, AccessPermission:Read_Only, desc="Owned, filling L1"; + O_F, AccessPermission:Read_Only, desc="Owned, filling L1"; + Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache"; + Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache"; + S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + O_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + O_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response"; + S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response"; + + Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling"; + Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling"; + Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling"; + E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling"; + E1_F, AccessPermission:Read_Write, desc="..."; + E0_Es, AccessPermission:Read_Write, desc="..."; + E1_Es, AccessPermission:Read_Write, desc="..."; + Ms_F0, AccessPermission:Read_Write, desc="..."; + Ms_F1, AccessPermission:Read_Write, desc="..."; + Ms_F, AccessPermission:Read_Write, desc="..."; + M0_F, AccessPermission:Read_Write, desc="..."; + M0_Ms, AccessPermission:Read_Write, desc="..."; + M1_F, AccessPermission:Read_Write, desc="..."; + M1_Ms, AccessPermission:Read_Write, desc="..."; + + I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback"; + S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck form NB for canceled WB"; + S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck form NB for canceled WB"; + S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck"; + + } // END STATES + + // BEGIN EVENTS + enumeration(Event, desc="CP Events") { + // CP Initiated events + C0_Load_L1miss, desc="Cluster 0 load, L1 missed"; + C0_Load_L1hit, desc="Cluster 0 load, L1 hit"; + C1_Load_L1miss, desc="Cluster 1 load L1 missed"; + C1_Load_L1hit, desc="Cluster 1 load L1 hit"; + Ifetch0_L1hit, desc="Instruction fetch, hit in the L1"; + Ifetch1_L1hit, desc="Instruction fetch, hit in the L1"; + Ifetch0_L1miss, 
desc="Instruction fetch, missed in the L1"; + Ifetch1_L1miss, desc="Instruction fetch, missed in the L1"; + C0_Store_L1miss, desc="Cluster 0 store missed in L1"; + C0_Store_L1hit, desc="Cluster 0 store hit in L1"; + C1_Store_L1miss, desc="Cluster 1 store missed in L1"; + C1_Store_L1hit, desc="Cluster 1 store hit in L1"; + // NB Initiated events + NB_AckS, desc="NB Ack to Core Request"; + NB_AckM, desc="NB Ack to Core Request"; + NB_AckE, desc="NB Ack to Core Request"; + + NB_AckWB, desc="NB Ack for writeback"; + + // Memory System initiatied events + L1I_Repl, desc="Replace address from L1I"; // Presumed clean + L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean + L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean + L2_Repl, desc="Replace address from L2"; + + L2_to_L1D0, desc="L1 fill from L2"; + L2_to_L1D1, desc="L1 fill from L2"; + L2_to_L1I, desc="L1 fill from L2"; + + // Probe Events + PrbInvData, desc="probe, return O or M data"; + PrbInv, desc="probe, no need for data"; + PrbShrData, desc="probe downgrade, return O or M data"; + + } // END EVENTS + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L1D0DataArrayRead, desc="Read the data array"; + L1D0DataArrayWrite, desc="Write the data array"; + L1D0TagArrayRead, desc="Read the data array"; + L1D0TagArrayWrite, desc="Write the data array"; + L1D1DataArrayRead, desc="Read the data array"; + L1D1DataArrayWrite, desc="Write the data array"; + L1D1TagArrayRead, desc="Read the data array"; + L1D1TagArrayWrite, desc="Write the data array"; + L1IDataArrayRead, desc="Read the data array"; + L1IDataArrayWrite, desc="Write the data array"; + L1ITagArrayRead, desc="Read the data array"; + L1ITagArrayWrite, desc="Write the data array"; + L2DataArrayRead, desc="Read the data array"; + L2DataArrayWrite, desc="Write the data array"; + L2TagArrayRead, desc="Read the data array"; + L2TagArrayWrite, desc="Write the data array"; + } + + + // BEGIN STRUCTURE DEFINITIONS + + + // Cache Entry + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<CorePair_TBE>", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // END STRUCTURE DEFINITIONS + + // BEGIN INTERNAL FUNCTIONS + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + bool addressInCore(Addr addr) { + return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr)); + } + + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address)); + return 
L2cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" { + if (cluster == 0) { + Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr)); + return L1D0_entry; + } else { + Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr)); + return L1D1_entry; + } + } + + Entry getICacheEntry(Addr addr), return_by_pointer="yes" { + Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr)); + return c_entry; + } + + bool presentOrAvail2(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + bool presentOrAvailI(Addr addr) { + return L1Icache.isTagPresent(addr) || L1Icache.cacheAvail(addr); + } + + bool presentOrAvailD0(Addr addr) { + return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr); + } + + bool presentOrAvailD1(Addr addr) { + return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return CorePair_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return CorePair_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(CorePair_State_to_permission(state)); + } + } + + MachineType testAndClearLocalHit(Entry cache_entry) { + assert(is_valid(cache_entry)); + if (cache_entry.FromL2) { + cache_entry.FromL2 := false; + return MachineType:L2Cache; + } else { + return MachineType:L1Cache; + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L1D0DataArrayRead) { + L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1D0DataArrayWrite) { + L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1D0TagArrayRead) { + L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1D0TagArrayWrite) { + L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L1D1DataArrayRead) { + L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } 
else if (request_type == RequestType:L1D1DataArrayWrite) { + L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1D1TagArrayRead) { + L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1D1TagArrayWrite) { + L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L1IDataArrayRead) { + L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1IDataArrayWrite) { + L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1ITagArrayRead) { + L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1ITagArrayWrite) { + L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L2DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L2DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L2TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L2TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L2DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L2DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L2TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L2TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D0DataArrayRead) { + return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D0DataArrayWrite) { + return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D0TagArrayRead) { + return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D0TagArrayWrite) { + return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D1DataArrayRead) { + return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D1DataArrayWrite) { + return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D1TagArrayRead) { + return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D1TagArrayWrite) { + return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1IDataArrayRead) { + return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1IDataArrayWrite) { + return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1ITagArrayRead) { + return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, 
addr); + } else if (request_type == RequestType:L1ITagArrayWrite) { + return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr); + + } else { + return true; + } + } + + // END INTERNAL FUNCTIONS + + // ** OUT_PORTS ** + + out_port(requestNetwork_out, CPURequestMsg, requestFromCore); + out_port(responseNetwork_out, ResponseMsg, responseFromCore); + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // ** IN_PORTS ** + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == TriggerType:L2_to_L1) { + if (in_msg.Dest == CacheId:L1I) { + trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Dest == CacheId:L1D0) { + trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Dest == CacheId:L1D1) { + trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe); + } else { + error("unexpected trigger dest"); + } + } + } + } + } + + + in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + // ResponseNetwork + in_port(responseToCore_in, ResponseMsg, responseToCore) { + if (responseToCore_in.isReady(clockEdge())) { + peek(responseToCore_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if (in_msg.State == CoherenceState:Modified) { + trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Shared) { + trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Nothing from the Unblock Network + + // Mandatory Queue + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + if (in_msg.Type == RubyRequestType:IFETCH) { + // FETCH ACCESS + + if (L1Icache.isTagPresent(in_msg.LineAddress)) { + if (mod(in_msg.contextId, 2) == 0) { + trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe); + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailI(in_msg.LineAddress)) { + if (mod(in_msg.contextId, 2) == 0) { + trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry, + 
tbe); + } else { + trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry, + tbe); + } + } else { + Addr victim := L1Icache.cacheProbe(in_msg.LineAddress); + trigger(Event:L1I_Repl, victim, + getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { // Not present or avail in L2 + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + // DATA ACCESS + if (mod(in_msg.contextId, 2) == 1) { + if (L1D1cache.isTagPresent(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + // Stores must write through, make sure L2 avail. + if (presentOrAvail2(in_msg.LineAddress)) { + trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailD1(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C1_Load_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } else { + trigger(Event:C1_Store_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } + } else { + Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L1D1_Repl, victim, + getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { // not present or avail in L2 + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } else { + Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0); + if (is_valid(L1D0cache_entry)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailD0(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C0_Load_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } else { + trigger(Event:C0_Store_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } + } else { + Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L1D0_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } + } + } + } + } + + + // ACTIONS + action(ii_invIcache, "ii", desc="invalidate iCache") { + if (L1Icache.isTagPresent(address)) { + L1Icache.deallocate(address); + } + } + + action(i0_invCluster, "i0", desc="invalidate cluster 0") { + if (L1D0cache.isTagPresent(address)) { + L1D0cache.deallocate(address); + } + } + + action(i1_invCluster, "i1", desc="invalidate cluster 1") { + if (L1D1cache.isTagPresent(address)) { + L1D1cache.deallocate(address); + } + } + + action(ib_invBothClusters, "ib", desc="invalidate both clusters") { + if (L1D0cache.isTagPresent(address)) { + L1D0cache.deallocate(address); + } + if (L1D1cache.isTagPresent(address)) { + L1D1cache.deallocate(address); + } + } + + action(i2_invL2, "i2", desc="invalidate L2") { + 
if(is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(mru_setMRU, "mru", desc="Update LRU state") { + L2cache.setMRU(address); + } + + action(mruD1_setD1cacheMRU, "mruD1", desc="Update LRU state") { + L1D1cache.setMRU(address); + } + + action(mruD0_setD0cacheMRU, "mruD0", desc="Update LRU state") { + L1D0cache.setMRU(address); + } + + action(mruI_setIcacheMRU, "mruI", desc="Update LRU state") { + L1Icache.setMRU(address); + } + + action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + DPRINTF(RubySlicc,"%s\n",out_msg.Destination); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(vd_victim, "vd", desc="Victimize M/O L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + } + } + + action(vc_victim, "vc", desc="Victimize E/S L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + } + } + + action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") { + if (L1D0cache.isTagPresent(address) == false) { + L1D0cache.allocateVoid(address, new Entry); + } + } + + action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") { + if (L1D1cache.isTagPresent(address) == false) { + L1D1cache.allocateVoid(address, new Entry); + } + } + + action(ai_allocateL1I, "ai", desc="Allocate L1I Block") { + if (L1Icache.isTagPresent(address) == false) { + L1Icache.allocateVoid(address, new Entry); + } + } + + action(a2_allocateL2, "a2", desc="Allocate L2 Block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE 
Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToCore_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(il0_loadDone, "il0", desc="Cluster 0 i load done") { + Entry entry := getICacheEntry(address); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(il1_loadDone, "il1", desc="Cluster 1 i load done") { + Entry entry := getICacheEntry(address); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(l0_loadDone, "l0", desc="Cluster 0 load done") { + Entry entry := getL1CacheEntry(address, 0); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(l1_loadDone, "l1", desc="Cluster 1 load done") { + Entry entry := getL1CacheEntry(address, 1); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(xl0_loadDone, "xl0", desc="Cluster 0 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n", address, l2entry.DataBlk); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xl1_loadDone, "xl1", desc="Cluster 1 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes 
through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(s0_storeDone, "s0", desc="Cluster 0 store done") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + entry.Dirty := true; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(s1_storeDone, "s1", desc="Cluster 1 store done") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(xs0_storeDone, "xs0", desc="Cluster 0 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(xs1_storeDone, "xs1", desc="Cluster 1 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + 
in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + sequencer.evictionCallback(address); + } + } + + action(forward_eviction_to_cpu1, "fec1", desc="sends eviction information to processor1") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + sequencer1.evictionCallback(address); + } + } + + action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1I; + } + } + + action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D0; + } + } + + action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D1; + } + } + + action(wi_writeIcache, "wi", desc="write data to icache (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + DPRINTF(ProtocolTrace, "CP writeD0: address %s, data: %s\n", address, in_msg.DataBlk); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := 
address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + assert(addressInCore(address) || is_valid(tbe)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + if (addressInCore(address)) { + out_msg.Hit := true; + } else { + out_msg.Hit := false; + } + out_msg.Dirty := false; // not sending back data, so def. 
not dirty + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + assert(tbe.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(l2m_profileMiss, "l2m", desc="l2m miss profile") { + ++L2cache.demand_misses; + } + + action(l10m_profileMiss, "l10m", desc="l10m miss profile") { + ++L1D0cache.demand_misses; + } + + action(l11m_profileMiss, "l11m", desc="l11m miss profile") { + ++L1D1cache.demand_misses; + } + + action(l1im_profileMiss, "l1lm", desc="l1im miss profile") { + ++L1Icache.demand_misses; + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xx_recycleResponseQueue, "xx", desc="recycle response queue") { + responseToCore_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + a2_allocateL2; + i1_invCluster; + 
ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead,L2TagArrayRead} { + // track misses, if implemented + // L2 miss as well + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // L2 miss as well + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, C0_Store_L1miss, I_M0) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + a2_allocateL2; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(I, C1_Store_L1miss, I_M1) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + fi_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + fi_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + mruD0_setD0cacheMRU; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + mruD1_setD1cacheMRU; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? + a0_allocateL1D; + l10m_profileMiss; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? 
+ l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L1ITagArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // THESE SHOULD NOT BE INSTANTANEOUS BUT OH WELL FOR NOW + transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, C1_Load_L1miss, E0_Es) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, C1_Store_L1miss, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + l11m_profileMiss; + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } 
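+  // Store path for blocks the core pair already owns (Es/E0/E1 above, Ms/M0/M1
+  // below): the requesting cluster's L1 entry is (re)allocated, the other
+  // cluster's copy is invalidated where one may exist, and sX_storeDone dirties
+  // both the L1 entry and the write-through L2 copy in the same cycle before the
+  // MRU updates, so no separate write-through latency is modeled (see note above).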
+ + transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite} { + l10m_profileMiss; + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + mru_setMRU; + p_popMandatoryQueue; + } + + transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) {L1D0TagArrayRead,L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue CtoD + l10m_profileMiss; + a0_allocateL1D; + mruD0_setD0cacheMRU; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l11m_profileMiss; + a1_allocateL1D; + mruD1_setD1cacheMRU; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2DataArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss + l1im_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead } { + l2m_profileMiss; // permissions miss + l1im_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead,L1D0TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead,L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} { + a0_allocateL1D; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + 
p_popMandatoryQueue; + } + + transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} { + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite} { + a1_allocateL1D; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + // end transitions from base + + // Begin simple hit transitions + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, + Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} { + // track hits, if implemented + l0_loadDone; + mruD0_setD0cacheMRU; + p_popMandatoryQueue; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, + Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} { + // track hits, if implemented + l1_loadDone; + mruD1_setD1cacheMRU; + p_popMandatoryQueue; + } + + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} { + // track hits, if implemented + il0_loadDone; + mruI_setIcacheMRU; + p_popMandatoryQueue; + } + + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayWrite} { + // track hits, if implemented + il1_loadDone; + mruI_setIcacheMRU; + p_popMandatoryQueue; + } + + // end simple hit transitions + + // Transitions from transient states + + // recycles + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1, + O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0, + O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, + IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F, + O_F0, O_F1, O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F, + E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C, + S_C}, {Ifetch0_L1miss, Ifetch1_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + 
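+  // While a line sits in one of these transient states, newly arriving requests,
+  // replacements, probes, and acks are not serviced immediately; the zz_/yy_/xx_
+  // actions recycle them on their input queues (after recycle_latency) until the
+  // in-flight fill or writeback resolves, avoiding still more transient states.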
transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1, + Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms, + M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F0, O_F0, + Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F, + M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0, O_M0, Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_Ms}, {C0_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1, + O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, + M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS, + PrbInvData, PrbInv, PrbShrData}) {} { + yy_recycleProbeQueue; // these should be resolved soon, but I didn't want to add more states; technically they could be resolved now (and the probes could too), but it doesn't seem necessary. + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} { + xx_recycleResponseQueue; // these should be resolved soon, but I didn't want to add more states; technically they could be resolved now (and the probes could too), but it doesn't seem necessary. 
+ } + + transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Load_L1miss, I_M0Ms) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Load_L1miss, I_M1Ms) {} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Store_L1miss, I_M0M1) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Store_L1miss, I_M1M0) {} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_E0S, C1_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_E1S, C0_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + p_popMandatoryQueue; + } + + transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F0, C1_Load_L1miss, S_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) { L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} { + i0_invCluster; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} { + i1_invCluster; + } + + transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} { + ii_invIcache; + } + + transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vc_victim; + ib_invBothClusters; + i2_invL2; + ii_invIcache; + } + + transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vd_victim; + i2_invL2; + ib_invBothClusters; // nothing will happen for D0 on M1, vice versa + } + + transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckS, S_C) {L1D0DataArrayWrite,L2DataArrayWrite} { + wi_writeIcache; + xi0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckS, S_C) 
{L1D1DataArrayWrite, L2DataArrayWrite} { + wi_writeIcache; + xi1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xs1_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + // THESE MO->M1 should not be instantaneous but oh well for now. + transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + uu_sendUnblock; + i0_invCluster; + s1_storeDone; + pr_popResponseQueue; + } + + transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xs1_storeDone; + uu_sendUnblock; + i1_invCluster; + s0_storeDone; + pr_popResponseQueue; + } + + // Above shoudl be more like this, which has some latency to xfer to L1 + transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} { + w0_writeDcache; + xs0_storeDone; + uu_sendUnblock; + f1_L2ToL1; + pr_popResponseQueue; + } + + transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite, L2DataArrayWrite} { + w1_writeDcache; + xs1_storeDone; + uu_sendUnblock; + f0_L2ToL1; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + mru_setMRU; + il0_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F1, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + mru_setMRU; + il1_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + 
pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_F, L2_to_L1D1, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F, L2_to_L1D1, Ms_F0) {L1IDataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(IF_E0S, L2_to_L1D0, I_E0S) 
{} { + pt_popTriggerQueue; + } + + transition(IF_E1S, L2_to_L1D1, I_E1S) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D0, IF1_ES) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D1, IF0_ES) {} { + pt_popTriggerQueue; + } + + transition(IF0_ES, L2_to_L1D0, I_ES) {} { + pt_popTriggerQueue; + } + + transition(IF1_ES, L2_to_L1D1, I_ES) {} { + pt_popTriggerQueue; + } + + transition(F_S0, L2_to_L1I, S0) {} { + pt_popTriggerQueue; + } + + transition(F_S1, L2_to_L1I, S1) {} { + pt_popTriggerQueue; + } + + transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + mru_setMRU; + xs0_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + mru_setMRU; + xs1_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; // FOO + nS_issueRdBlkS; + pr_popResponseQueue; + } + + transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; // FOO + nS_issueRdBlkS; + pr_popResponseQueue; + } + + // Writeback cancel "ack" + transition(I_C, NB_AckWB, I) {L2TagArrayWrite} { + ss_sendStaleNotification; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} { + ss_sendStaleNotification; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} { + ss_sendStaleNotification; + pr_popResponseQueue; + } + + transition(S_C, NB_AckWB, S) {L2TagArrayWrite} { + ss_sendStaleNotification; + pr_popResponseQueue; + } + + // Begin Probe Transitions + + transition({Ms, M0, M1, O}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + i2_invL2; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S, I}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; // only relevant for S + pp_popProbeQueue; + } + + transition(S_C, PrbInvData, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, PrbInvData, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; // nothing will happen in I + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(S_C, PrbInv, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O}, PrbShrData, O) {L2TagArrayRead, 
L2TagArrayWrite, L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S}, PrbShrData, S) {L2TagArrayRead, L2TagArrayWrite} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_C, PrbShrData) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({I, I_C}, PrbShrData) {L2TagArrayRead} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S}, {PrbInv, PrbInvData}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M0) + a0_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M1, I_E1S}, {PrbInv, PrbInvData}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M1) + a1_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbShrData}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + a0_allocateL1D; + a1_allocateL1D; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S, I_M1, I_E1S}, PrbShrData) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition(ES_I, PrbInvData, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) {} { + pdt_sendProbeResponseDataFromTBE; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, PrbShrData, ES_I) {} { + ph_sendProbeResponseHit; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_I, PrbShrData, MO_I) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_S0, PrbInvData, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(MO_S1, PrbInvData, S1_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition({MO_S0, MO_S1}, PrbShrData) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInv}, IF_E0S) {}{ + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F1, Es_F1, E1_F, E0_Es}, 
{PrbInvData, PrbInv}, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F, Es_F}, {PrbInvData, PrbInv}, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition(Si_F0, {PrbInvData, PrbInv}, F_S0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(Si_F1, {PrbInvData, PrbInv}, F_S1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition({Es_F0, E0_F, E1_Es}, PrbShrData, S_F0) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({Es_F1, E1_F, E0_Es}, PrbShrData, S_F1) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(Es_F, PrbShrData, S_F) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, PrbShrData) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_M0, PrbInvData, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M0, PrbInvData, I_M0) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M0, O_M0}, {PrbInv}, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(S_M1, PrbInvData, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M1, PrbInvData, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M1, O_M1}, {PrbInv}, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S0, S0_C}, {PrbInvData, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S1, S1_C}, {PrbInvData, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + 
ib_invBothClusters;
+    ii_invIcache;
+    i2_invL2;
+    ai_allocateL1I;
+    a2_allocateL2;
+    pp_popProbeQueue;
+  }
+
+  transition({S_M0, S_M1}, PrbShrData) {} {
+    ph_sendProbeResponseHit;
+    pp_popProbeQueue;
+  }
+
+  transition({O_M0, O_M1}, PrbShrData) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({S0, S1, S0_C, S1_C}, PrbShrData) {} {
+    pb_sendProbeResponseBackprobe;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInvData, IF_E0S) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pd_sendProbeResponseData;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInvData, IF_E1S) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pd_sendProbeResponseData;
+    ib_invBothClusters;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F, O_F}, PrbInvData, IF_ES) {L2DataArrayRead} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pd_sendProbeResponseData;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    i2_invL2;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F, O_F}, PrbInv, IF_ES) {} {
+    forward_eviction_to_cpu0;
+    forward_eviction_to_cpu1;
+    pi_sendProbeResponseInv;
+    ib_invBothClusters;
+    i2_invL2;
+    a0_allocateL1D;
+    a1_allocateL1D;
+    a2_allocateL2;
+    n_issueRdBlk;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F0, M0_F, M1_Ms}, PrbShrData, O_F0) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({Ms_F1, M1_F, M0_Ms}, PrbShrData, O_F1) {} {
+    // No action here: the probe is not popped, so it is re-examined after the
+    // line has moved to O_F1 and is then serviced by the {O_F0, O_F1, O_F}
+    // PrbShrData transition below, which sends the data response.
+  }
+
+  transition({Ms_F}, PrbShrData, O_F) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  transition({O_F0, O_F1, O_F}, PrbShrData) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+  // END TRANSITIONS
+}
+
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-L3cache.sm b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm
new file mode 100644
index 000000000..479cf4e78
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm
@@ -0,0 +1,1130 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3.
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:L3Cache, "L3") + : CacheMemory * L3cache; + WireBuffer * reqToDir; + WireBuffer * respToDir; + WireBuffer * l3UnblockToDir; + WireBuffer * reqToL3; + WireBuffer * probeToL3; + WireBuffer * respToL3; + Cycles l3_request_latency := 1; + Cycles l3_response_latency := 35; + + // To the general response network + MessageBuffer * responseFromL3, network="To", virtual_network="2", ordered="false", vnet_type="response"; + + // From the general response network + MessageBuffer * responseToL3, network="From", virtual_network="2", ordered="false", vnet_type="response"; + +{ + // EVENTS + enumeration(Event, desc="L3 Events") { + // Requests coming from the Cores + RdBlk, desc="CPU RdBlk event"; + RdBlkM, desc="CPU RdBlkM event"; + RdBlkS, desc="CPU RdBlkS event"; + CtoD, desc="Change to Dirty request"; + WrVicBlk, desc="L2 Victim (dirty)"; + WrVicBlkShared, desc="L2 Victim (dirty)"; + ClVicBlk, desc="L2 Victim (clean)"; + ClVicBlkShared, desc="L2 Victim (clean)"; + + CPUData, desc="WB data from CPU"; + CPUDataShared, desc="WB data from CPU, NBReqShared 1"; + StaleWB, desc="WB stale; no data"; + + L3_Repl, desc="L3 Replacement"; + + // Probes + PrbInvData, desc="Invalidating probe, return dirty data"; + PrbInv, desc="Invalidating probe, no need to return data"; + PrbShrData, desc="Downgrading probe, return data"; + + // Coming from Memory Controller + WBAck, desc="ack from memory"; + + CancelWB, desc="Cancel WB from L2"; + } + + // STATES + // Base States: + state_declaration(State, desc="L3 State", default="L3Cache_State_I") { + M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale + O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S + E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory) + S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. 
If no one in O, then == Memory
+    I, AccessPermission:Invalid, desc="Invalid";
+
+    I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data";
+    I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data";
+    I_E, AccessPermission:Busy, desc="Invalid, received ClVicBlk, sent Ack, waiting for Data";
+    I_S, AccessPermission:Busy, desc="Invalid, received ClVicBlk, sent Ack, waiting for Data";
+    S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+    S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E";
+    S_S, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to S";
+    E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+    E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    E_E, AccessPermission:Busy, desc="received ClVicBlk, sent Ack, waiting for Data, then go to E";
+    E_S, AccessPermission:Busy, desc="received ClVicBlkShared, sent Ack, waiting for Data, then go to S";
+    O_M, AccessPermission:Busy, desc="...";
+    O_O, AccessPermission:Busy, desc="...";
+    O_E, AccessPermission:Busy, desc="...";
+    O_S, AccessPermission:Busy, desc="...";
+    M_M, AccessPermission:Busy, desc="...";
+    M_O, AccessPermission:Busy, desc="...";
+    M_E, AccessPermission:Busy, desc="...";
+    M_S, AccessPermission:Busy, desc="...";
+    D_I, AccessPermission:Invalid, desc="drop WB data on the floor when receive";
+    MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem";
+    MO_I, AccessPermission:Busy, desc="M or O, received L3_Repl, waiting for WBAck from Mem";
+    I_I, AccessPermission:Busy, desc="I_MO received L3_Repl";
+    I_CD, AccessPermission:Busy, desc="I_I received WBAck, now just waiting for CPUData";
+    I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so nothing gets confused";
+  }
+
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    DataArrayRead, desc="Read the data array";
+    DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+  }
+
+  // STRUCTURES
+
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff from memory?)";
+    DataBlock DataBlk, desc="Data for the block";
+  }
+
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block";
+    bool Dirty, desc="Is the data dirty?";
+    bool Shared, desc="Victim hit by shared probe";
+    MachineID From, desc="Waiting for writeback from...";
+  }
+
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<L3Cache_TBE>", constructor="m_number_of_TBEs";
+
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE b);
+  void unset_tbe();
+  void wakeUpAllBuffers();
+  void wakeUpBuffers(Addr a);
+
+
+  // FUNCTION DEFINITIONS
+  Tick clockEdge();
+  Tick cyclesToTicks(Cycles c);
+
+  Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+    return static_cast(Entry, "pointer", L3cache.lookup(addr));
+  }
+
+  DataBlock getDataBlock(Addr addr),
return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L3cache.isTagPresent(addr) || L3cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return L3Cache_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return L3Cache_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(L3Cache_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + return true; + } + + + // OUT PORTS + out_port(requestNetwork_out, CPURequestMsg, reqToDir); + out_port(L3Resp_out, ResponseMsg, respToDir); + out_port(responseNetwork_out, ResponseMsg, responseFromL3); + out_port(unblockNetwork_out, UnblockMsg, l3UnblockToDir); + + // IN PORTS + in_port(NBResponse_in, ResponseMsg, respToL3) { + if (NBResponse_in.isReady(clockEdge())) { + peek(NBResponse_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Error on NBResponse Type"); + } + } + } + } + + // Response Network + in_port(responseNetwork_in, ResponseMsg, responseToL3) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUData) { + if (in_msg.NbReqShared) { + trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPUData, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Error on NBResponse Type"); + } + } + } + } + + // probe network + in_port(probeNetwork_in, NBProbeRequestMsg, probeToL3) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + 
trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + if (in_msg.ReturnData) { + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } else { + error("Don't think I should get any of these"); + } + } + } + } + } + + // Request Network + in_port(requestNetwork_in, CPURequestMsg, reqToL3) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + assert(in_msg.Destination.isElement(machineID)); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L3cache.cacheProbe(in_msg.addr); + trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L3cache.cacheProbe(in_msg.addr); + trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceRequestType:WrCancel) { + if (is_valid(tbe) && tbe.From == in_msg.Requestor) { + trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe); + } else { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + } + } + } + } + + // BEGIN ACTIONS + + action(i_invL3, "i", desc="invalidate L3 cache block") { + if (is_valid(cache_entry)) { + L3cache.deallocate(address); + } + unset_cache_entry(); + } + + action(rm_sendResponseM, "rm", desc="send Modified response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Modified; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(rs_sendResponseS, "rs", desc="send Shared response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + action(r_requestToMem, "r", desc="Miss in L3, pass on") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) 
{ + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (is_valid(cache_entry)) { + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := cache_entry.Dirty; + } + tbe.From := machineID; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(vd_vicDirty, "vd", desc="Victimize dirty L3 data") { + enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack, no data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pm_sendProbeResponseMiss, "pm", desc="send probe ack, no data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + assert(tbe.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.State := CoherenceState:NA; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") { + enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:WrCancel; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(a_allocateBlock, "a", desc="allocate L3 block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L3cache.allocate(address, new Entry)); + } + } + + action(d_writeData, "d", desc="write data to L3") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + cache_entry.Dirty := in_msg.Dirty; + } + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to L3: %s\n", in_msg); + } + } + + action(rd_copyDataFromRequest, "rd", desc="write data to L3") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := true; + } + } + + action(f_setFrom, "f", desc="set who WB is expected to come from") { + peek(requestNetwork_in, CPURequestMsg) { + tbe.From := in_msg.Requestor; + } + } + + action(rf_resetFrom, "rf", desc="reset From") { + tbe.From := machineID; + } + + action(wb_data, "wb", desc="write back data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, l3_request_latency) { + 
out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L3cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pn_popNBResponseQueue, "pn", desc="pop NB response queue") { + NBResponse_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "\z", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + + transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {TagArrayRead} { + r_requestToMem; + p_popRequestQueue; + } + + transition(O, RdBlk ) {TagArrayRead, DataArrayRead} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + transition(M, RdBlk, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition(S, RdBlk) {TagArrayRead, DataArrayRead} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + transition(E, RdBlk, S) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition({M, O}, RdBlkS, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition({E, S}, RdBlkS, S) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition(M, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + rm_sendResponseM; + i_invL3; + p_popRequestQueue; + } + + transition({O, S}, {RdBlkM, CtoD}) {TagArrayRead} { + r_requestToMem; // can't handle this, just forward + p_popRequestQueue; + } + + transition(E, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + rm_sendResponseM; + i_invL3; + p_popRequestQueue; + } + + transition({I}, WrVicBlk, I_M) {TagArrayRead, TagArrayWrite} { + a_allocateBlock; + t_allocateTBE; + f_setFrom; +// rd_copyDataFromRequest; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) {} { + zz_recycleRequestQueue; + } + + transition({I}, WrVicBlkShared, I_O) {TagArrayRead, TagArrayWrite} { + a_allocateBlock; + t_allocateTBE; + f_setFrom; +// rd_copyDataFromRequest; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, WrVicBlkShared, S_O) {TagArrayRead, TagArrayWrite} { +// rd_copyDataFromRequest; + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, WrVicBlk, S_M) {TagArrayRead, TagArrayWrite} { // should be technically not possible, but assume the data comes back with shared bit flipped +// rd_copyDataFromRequest; + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, WrVicBlk, E_M) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, WrVicBlkShared, E_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, WrVicBlk, O_M) {TagArrayRead, 
TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, WrVicBlkShared, O_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, WrVicBlk, M_M) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, WrVicBlkShared, M_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({I}, ClVicBlk, I_E) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({I}, ClVicBlkShared, I_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, ClVicBlk, S_E) {TagArrayRead, TagArrayWrite} { // technically impossible, assume data comes back with shared bit flipped + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, ClVicBlkShared, S_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, ClVicBlk, E_E) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, ClVicBlkShared, E_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, ClVicBlk, O_E) {TagArrayRead, TagArrayWrite} { // technically impossible, but assume data comes back with shared bit flipped + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, ClVicBlkShared, O_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, ClVicBlk, M_E) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, ClVicBlkShared, M_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {} { + r_requestToMem; + p_popRequestQueue; + } + + transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) {TagArrayWrite} { + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_M, CPUData, M) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUData, E) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(S_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. 
+ pr_popResponseQueue; + } + + transition(S_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_E, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_S, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition({D_I}, {CPUData, CPUDataShared}, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(MOD_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite} { + uu_sendUnblock; + rf_resetFrom; + pr_popResponseQueue; + } + + transition(I_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite, DataArrayRead} { + uu_sendUnblock; + wt_writeDataToTBE; + rf_resetFrom; + pr_popResponseQueue; + } + + transition(I_CD, {CPUData, CPUDataShared}, I) {DataArrayRead, TagArrayWrite} { + uu_sendUnblock; + wt_writeDataToTBE; + wb_data; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({M, O}, L3_Repl, MO_I) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + vd_vicDirty; + i_invL3; + } + + transition({E, S,}, L3_Repl, I) {TagArrayRead, TagArrayWrite} { + i_invL3; + } + + transition({I_M, I_O, S_M, S_O, E_M, E_O}, L3_Repl) {} { + zz_recycleRequestQueue; + } + + transition({O_M, O_O, O_E, O_S, M_M, M_O, M_E, M_S}, L3_Repl) {} { + zz_recycleRequestQueue; + } + + transition({I_E, I_S, S_E, S_S, E_E, E_S}, L3_Repl) {} { + zz_recycleRequestQueue; + } + + transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + pd_sendProbeResponseData; + i_invL3; + pp_popProbeQueue; + } + + transition({E, S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + i_invL3; // nothing will happen in I + pp_popProbeQueue; + } + + transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + i_invL3; // nothing will happen in I + pp_popProbeQueue; + } + + transition({M, O}, PrbShrData, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({E, S}, PrbShrData, S) {TagArrayRead, TagArrayWrite} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(I, PrbShrData) {TagArrayRead} { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) {TagArrayWrite, DataArrayRead} { + pdt_sendProbeResponseDataFromTBE; + mc_cancelMemWriteback; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) {TagArrayWrite} { + pi_sendProbeResponseInv; + mc_cancelMemWriteback; + pp_popProbeQueue; + } + + transition(MO_I, PrbShrData) {DataArrayRead} { + pdt_sendProbeResponseDataFromTBE; + pp_popProbeQueue; + } + + transition(I_C, {PrbInvData, PrbInv}) {} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(I_C, PrbShrData) {} { + pm_sendProbeResponseMiss; + 
pp_popProbeQueue; + } + + transition(I_I, {WBAck}, I_CD) {TagArrayWrite} { + pn_popNBResponseQueue; + } + + transition(MOD_I, WBAck, D_I) {DataArrayRead} { + wb_data; + pn_popNBResponseQueue; + } + + transition(MO_I, WBAck, I) {DataArrayRead, TagArrayWrite} { + wb_data; + dt_deallocateTBE; + pn_popNBResponseQueue; + } + + transition(I_C, {WBAck}, I) {TagArrayWrite} { + dt_deallocateTBE; + pn_popNBResponseQueue; + } + + transition({I_M, I_O, I_E, I_S}, CancelWB, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + i_invL3; + p_popRequestQueue; + } + + transition({S_S, S_O, S_M, S_E}, CancelWB, S) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition({E_M, E_O, E_E, E_S}, CancelWB, E) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition({O_M, O_O, O_E, O_S}, CancelWB, O) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition({M_M, M_O, M_E, M_S}, CancelWB, M) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition(D_I, CancelWB, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition(MOD_I, CancelWB, MO_I) {TagArrayWrite} { + uu_sendUnblock; + rf_resetFrom; + p_popRequestQueue; + } + + transition(I_I, CancelWB, I_C) {TagArrayWrite} { + uu_sendUnblock; + rf_resetFrom; + mc_cancelMemWriteback; + p_popRequestQueue; + } + + transition(I_CD, CancelWB, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + mc_cancelMemWriteback; + p_popRequestQueue; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm new file mode 100644 index 000000000..fd84447a2 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm @@ -0,0 +1,3009 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:CorePair, "CP-like Core Coherence") + : Sequencer * sequencer; + Sequencer * sequencer1; + CacheMemory * L1Icache; + CacheMemory * L1D0cache; + CacheMemory * L1D1cache; + CacheMemory * L2cache; + int regionBufferNum; + bool send_evictions := "False"; + Cycles issue_latency := 5; + Cycles l2_hit_latency := 18; + + // BEGIN Core Buffers + + // To the Network + MessageBuffer * requestFromCore, network="To", virtual_network="0", ordered="true", vnet_type="request"; + MessageBuffer * responseFromCore, network="To", virtual_network="2", ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="4", ordered="false", vnet_type="unblock"; + + // From the Network + MessageBuffer * probeToCore, network="From", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseToCore, network="From", virtual_network="2", ordered="false", vnet_type="response"; + + MessageBuffer * mandatoryQueue, ordered="false"; + MessageBuffer * triggerQueue, ordered="true"; + + // END Core Buffers + +{ + // BEGIN STATES + state_declaration(State, desc="Cache states", default="CorePair_State_I") { + + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership"; + E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership"; + Es, AccessPermission:Read_Write, desc="Exclusive in core"; + O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line"; + Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line"; + M0, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + M1, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + + // Transient States + I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well"; + I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well"; + I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well"; + I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well"; + I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters"; + + IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive"; + IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but expecting a L2_to_L1D1 trigger, just drop when receive"; + IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills"; + IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received"; + F_S1, AccessPermission:Busy, desc="same, but going to S1 when trigger received"; + + ES_I, AccessPermission:Read_Only, desc="L2 replacement, 
waiting for clean writeback ack"; + MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack"; + MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS"; + MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS"; + S_F0, AccessPermission:Read_Only, desc="Shared, filling L1"; + S_F1, AccessPermission:Read_Only, desc="Shared, filling L1"; + S_F, AccessPermission:Read_Only, desc="Shared, filling L1"; + O_F0, AccessPermission:Read_Only, desc="Owned, filling L1"; + O_F1, AccessPermission:Read_Only, desc="Owned, filling L1"; + O_F, AccessPermission:Read_Only, desc="Owned, filling L1"; + Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache"; + Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache"; + S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + O_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + O_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response"; + S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response"; + + Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling"; + Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling"; + Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling"; + E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling"; + E1_F, AccessPermission:Read_Write, desc="..."; + E0_Es, AccessPermission:Read_Write, desc="..."; + E1_Es, AccessPermission:Read_Write, desc="..."; + Ms_F0, AccessPermission:Read_Write, desc="..."; + Ms_F1, AccessPermission:Read_Write, desc="..."; + Ms_F, AccessPermission:Read_Write, desc="..."; + M0_F, AccessPermission:Read_Write, desc="..."; + M0_Ms, AccessPermission:Read_Write, desc="..."; + M1_F, AccessPermission:Read_Write, desc="..."; + M1_Ms, AccessPermission:Read_Write, desc="..."; + + I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback"; + S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck form NB for canceled WB"; + S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck form NB for canceled WB"; + S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck"; + + } // END STATES + + // BEGIN EVENTS + enumeration(Event, desc="CP Events") { + // CP Initiated events + C0_Load_L1miss, desc="Cluster 0 load, L1 missed"; + C0_Load_L1hit, desc="Cluster 0 load, L1 hit"; + C1_Load_L1miss, desc="Cluster 1 load L1 missed"; + C1_Load_L1hit, desc="Cluster 1 load L1 hit"; + Ifetch0_L1hit, desc="Instruction fetch, hit in the L1"; + Ifetch1_L1hit, desc="Instruction fetch, hit in the L1"; + Ifetch0_L1miss, desc="Instruction fetch, missed in the L1"; + Ifetch1_L1miss, desc="Instruction fetch, missed in the L1"; + C0_Store_L1miss, desc="Cluster 0 store missed in L1"; + C0_Store_L1hit, desc="Cluster 0 store hit in L1"; + C1_Store_L1miss, desc="Cluster 1 store missed in L1"; + C1_Store_L1hit, desc="Cluster 1 store hit in L1"; + // NB Initiated events + NB_AckS, desc="NB Ack to Core Request"; + NB_AckM, desc="NB Ack to Core Request"; + NB_AckE, desc="NB Ack to Core Request"; + + NB_AckWB, desc="NB Ack for writeback"; + + 
// Memory System initiated events + L1I_Repl, desc="Replace address from L1I"; // Presumed clean + L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean + L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean + L2_Repl, desc="Replace address from L2"; + + L2_to_L1D0, desc="L1 fill from L2"; + L2_to_L1D1, desc="L1 fill from L2"; + L2_to_L1I, desc="L1 fill from L2"; + + // Probe Events + PrbInvData, desc="probe, return O or M data"; + PrbInvDataDemand, desc="probe, return O or M data. Demand request"; + PrbInv, desc="probe, no need for data"; + PrbShrData, desc="probe downgrade, return O or M data"; + PrbShrDataDemand, desc="probe downgrade, return O or M data. Demand request"; + ForceRepl, desc="probe from r-buf. Act as though a repl"; + ForceDowngrade, desc="probe from r-buf. Act as though a repl"; + + } // END EVENTS + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L1D0DataArrayRead, desc="Read the data array"; + L1D0DataArrayWrite, desc="Write the data array"; + L1D0TagArrayRead, desc="Read the tag array"; + L1D0TagArrayWrite, desc="Write the tag array"; + L1D1DataArrayRead, desc="Read the data array"; + L1D1DataArrayWrite, desc="Write the data array"; + L1D1TagArrayRead, desc="Read the tag array"; + L1D1TagArrayWrite, desc="Write the tag array"; + L1IDataArrayRead, desc="Read the data array"; + L1IDataArrayWrite, desc="Write the data array"; + L1ITagArrayRead, desc="Read the tag array"; + L1ITagArrayWrite, desc="Write the tag array"; + L2DataArrayRead, desc="Read the data array"; + L2DataArrayWrite, desc="Write the data array"; + L2TagArrayRead, desc="Read the tag array"; + L2TagArrayWrite, desc="Write the tag array"; + } + + + // BEGIN STRUCTURE DEFINITIONS + + + // Cache Entry + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + bool AckNeeded, desc="True if need to ack r-dir"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<CorePair_TBE>", constructor="m_number_of_TBEs"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // END STRUCTURE DEFINITIONS + + // BEGIN INTERNAL FUNCTIONS + + MachineID getPeer(MachineID mach) { + return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum)); + } + + bool addressInCore(Addr addr) { + return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr)); + } + + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address)); + return L2cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
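+ // Prefer the in-flight TBE copy of the block; otherwise fall back to the L2 cache entry.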
TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" { + if (cluster == 0) { + Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr)); + return L1D0_entry; + } else { + Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr)); + return L1D1_entry; + } + } + + Entry getICacheEntry(Addr addr), return_by_pointer="yes" { + Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr)); + return c_entry; + } + + bool presentOrAvail2(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + bool presentOrAvailI(Addr addr) { + return L1Icache.isTagPresent(addr) || L1Icache.cacheAvail(addr); + } + + bool presentOrAvailD0(Addr addr) { + return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr); + } + + bool presentOrAvailD1(Addr addr) { + return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return CorePair_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return CorePair_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + bool isValid(Addr addr) { + AccessPermission perm := getAccessPermission(addr); + if (perm == AccessPermission:NotPresent || + perm == AccessPermission:Invalid || + perm == AccessPermission:Busy) { + return false; + } else { + return true; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(CorePair_State_to_permission(state)); + } + } + + MachineType testAndClearLocalHit(Entry cache_entry) { + assert(is_valid(cache_entry)); + if (cache_entry.FromL2) { + cache_entry.FromL2 := false; + return MachineType:L2Cache; + } else { + return MachineType:L1Cache; + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L1D0DataArrayRead) { + L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1D0DataArrayWrite) { + L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1D0TagArrayRead) { + L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1D0TagArrayWrite) { + 
L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L1D1DataArrayRead) { + L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1D1DataArrayWrite) { + L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1D1TagArrayRead) { + L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1D1TagArrayWrite) { + L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L1IDataArrayRead) { + L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1IDataArrayWrite) { + L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1ITagArrayRead) { + L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1ITagArrayWrite) { + L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L2DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L2DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L2TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L2TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L2DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L2DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L2TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L2TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D0DataArrayRead) { + return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D0DataArrayWrite) { + return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D0TagArrayRead) { + return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D0TagArrayWrite) { + return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D1DataArrayRead) { + return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D1DataArrayWrite) { + return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D1TagArrayRead) { + return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D1TagArrayWrite) { + return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1IDataArrayRead) { + return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1IDataArrayWrite) { + return 
L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1ITagArrayRead) { + return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1ITagArrayWrite) { + return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + return true; + } + } + + // END INTERNAL FUNCTIONS + + // ** OUT_PORTS ** + + out_port(requestNetwork_out, CPURequestMsg, requestFromCore); + out_port(responseNetwork_out, ResponseMsg, responseFromCore); + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // ** IN_PORTS ** + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == TriggerType:L2_to_L1) { + if (in_msg.Dest == CacheId:L1I) { + trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Dest == CacheId:L1D0) { + trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Dest == CacheId:L1D1) { + trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe); + } else { + error("unexpected trigger dest"); + } + } + } + } + } + + + in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.DemandRequest) { + trigger(Event:PrbInvDataDemand, in_msg.addr, cache_entry, tbe); + } else if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + if (in_msg.DemandRequest) { + trigger(Event:PrbShrDataDemand, in_msg.addr, cache_entry, tbe); + } else { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbRepl) { + trigger(Event:ForceRepl, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == ProbeRequestType:PrbRegDowngrade) { + trigger(Event:ForceDowngrade, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown probe request"); + } + } + } + } + + + // ResponseNetwork + in_port(responseToCore_in, ResponseMsg, responseToCore) { + if (responseToCore_in.isReady(clockEdge())) { + peek(responseToCore_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if (in_msg.State == CoherenceState:Modified) { + trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Shared) { + trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Nothing from the Unblock Network + + // Mandatory Queue + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) 
{ + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + if (in_msg.Type == RubyRequestType:IFETCH) { + // FETCH ACCESS + + if (L1Icache.isTagPresent(in_msg.LineAddress)) { + if (mod(in_msg.contextId, 2) == 0) { + trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe); + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailI(in_msg.LineAddress)) { + if (mod(in_msg.contextId, 2) == 0) { + trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry, + tbe); + } else { + trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry, + tbe); + } + } else { + Addr victim := L1Icache.cacheProbe(in_msg.LineAddress); + trigger(Event:L1I_Repl, victim, + getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { // Not present or avail in L2 + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(0) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + // DATA ACCESS + if (mod(in_msg.contextId, 2) == 1) { + if (L1D1cache.isTagPresent(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + // Stores must write through, make sure L2 avail. + if (presentOrAvail2(in_msg.LineAddress)) { + trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(1) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailD1(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C1_Load_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } else { + trigger(Event:C1_Store_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } + } else { + Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L1D1_Repl is %s\n", in_msg.LineAddress, victim); + trigger(Event:L1D1_Repl, victim, + getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { // not present or avail in L2 + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(2) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } else { + Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0); + if (is_valid(L1D0cache_entry)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(3) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailD0(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C0_Load_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } else { + trigger(Event:C0_Store_L1miss, 
in_msg.LineAddress, + cache_entry, tbe); + } + } else { + Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L1D0_Repl is %s\n", in_msg.LineAddress, victim); + trigger(Event:L1D0_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(4) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } + } + } + } + } + + + // ACTIONS + action(ii_invIcache, "ii", desc="invalidate iCache") { + if (L1Icache.isTagPresent(address)) { + L1Icache.deallocate(address); + } + } + + action(i0_invCluster, "i0", desc="invalidate cluster 0") { + if (L1D0cache.isTagPresent(address)) { + L1D0cache.deallocate(address); + } + } + + action(i1_invCluster, "i1", desc="invalidate cluster 1") { + if (L1D1cache.isTagPresent(address)) { + L1D1cache.deallocate(address); + } + } + + action(ib_invBothClusters, "ib", desc="invalidate both clusters") { + if (L1D0cache.isTagPresent(address)) { + L1D0cache.deallocate(address); + } + if (L1D1cache.isTagPresent(address)) { + L1D1cache.deallocate(address); + } + } + + action(i2_invL2, "i2", desc="invalidate L2") { + if(is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nMs_issueRdBlkMSinked, "nMs", desc="Issue RdBlkM with CtoDSinked") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.CtoDSinked := true; + } + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nSs_issueRdBlkSSinked, "nSs", desc="Issue RdBlkS with CtoDSinked") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.CtoDSinked := true; + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(vd_victim, "vd", desc="Victimize M/O L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; 
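+ // Only dirty (M/O) lines take this VicDirty path; clean E/S victims go through vc_victim/VicClean below.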
+ assert(cache_entry.Dirty); + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + } + } + + action(vc_victim, "vc", desc="Victimize E/S L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + } + } + + // Could send these two directly to dir if we made a new out network on channel 0 + action(vdf_victimForce, "vdf", desc="Victimize M/O L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Private := true; + } + } + + action(vcf_victimForce, "vcf", desc="Victimize E/S L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Private := true; + } + } + + action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") { + if (L1D0cache.isTagPresent(address) == false) { + L1D0cache.allocateVoid(address, new Entry); + } + } + + action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") { + if (L1D1cache.isTagPresent(address) == false) { + L1D1cache.allocateVoid(address, new Entry); + } + } + + action(ai_allocateL1I, "ai", desc="Allocate L1I Block") { + if (L1Icache.isTagPresent(address) == false) { + L1Icache.allocateVoid(address, new Entry); + } + } + + action(a2_allocateL2, "a2", desc="Allocate L2 Block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToCore_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe 
queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(il0_loadDone, "il0", desc="Cluster 0 i load done") { + Entry entry := getICacheEntry(address); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(il1_loadDone, "il1", desc="Cluster 1 i load done") { + Entry entry := getICacheEntry(address); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(l0_loadDone, "l0", desc="Cluster 0 load done") { + Entry entry := getL1CacheEntry(address, 0); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(l1_loadDone, "l1", desc="Cluster 1 load done") { + Entry entry := getL1CacheEntry(address, 1); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(xl0_loadDone, "xl0", desc="Cluster 0 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n", + address, l2entry.DataBlk); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + assert(is_valid(l2entry)); + sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xl1_loadDone, "xl1", desc="Cluster 1 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + assert(is_valid(l2entry)); + sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + assert(is_valid(l2entry)); + 
sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + assert(is_valid(l2entry)); + sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(s0_storeDone, "s0", desc="Cluster 0 store done") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + entry.Dirty := true; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(s1_storeDone, "s1", desc="Cluster 1 store done") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(xs0_storeDone, "xs0", desc="Cluster 0 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(xs1_storeDone, "xs1", desc="Cluster 1 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + sequencer.evictionCallback(address); + } + } + + action(forward_eviction_to_cpu1, "fec1", desc="sends eviction information to processor1") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + 
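+ // Mirrors forward_eviction_to_cpu0 above, but the notification goes to cluster 1's sequencer.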
sequencer1.evictionCallback(address); + } + } + + action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1I; + } + } + + action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D0; + } + } + + action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D1; + } + } + + action(wi_writeIcache, "wi", desc="write data to icache (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { 
+ out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + APPEND_TRANSITION_COMMENT("Setting Ms"); + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + assert(addressInCore(address) || is_valid(tbe)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + if (addressInCore(address)) { + out_msg.Hit := true; + } else { + out_msg.Hit := false; + } + out_msg.Dirty := false; // not sending back data, so def. not dirty + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + APPEND_TRANSITION_COMMENT("Setting Ms"); + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + assert(tbe.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + } + } + + action(ra_sendReplAck, "ra", desc="Send ack to r-buf that line is replaced if needed") { + if (is_invalid(tbe) || tbe.AckNeeded) { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:InvAck; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + APPEND_TRANSITION_COMMENT(" Sending ack to r-buf "); + } else { + APPEND_TRANSITION_COMMENT(" NOT Sending ack to r-buf "); + } + } + + action(m_markAckNeeded, "m", desc="Mark TBE to send ack when deallocated") { + assert(is_valid(tbe)); + tbe.AckNeeded := true; + } + + action(mc_cancelWB, "mc", desc="send writeback cancel to L3") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUCancelWB; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.wasValid := isValid(address); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sdv_sendDoneValid, "sdv", desc="Request finished, send done ack") { + enqueue(unblockNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(getPeer(machineID)); + out_msg.DoneAck := true; + out_msg.MessageSize := MessageSizeType:Unblock_Control; + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else if (is_valid(cache_entry)) { + out_msg.Dirty := cache_entry.Dirty; + } 
else { + out_msg.Dirty := false; + } + out_msg.validToInvalid := false; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sdi_sendDoneInvalid, "sdi", desc="Request finished, send done ack") { + enqueue(unblockNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(getPeer(machineID)); + out_msg.DoneAck := true; + out_msg.MessageSize := MessageSizeType:Unblock_Control; + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else if (is_valid(cache_entry)) { + out_msg.Dirty := cache_entry.Dirty; + } else { + out_msg.Dirty := false; + } + out_msg.validToInvalid := true; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(l10m_profileMiss, "l10m", desc="l10m miss profile") { + ++L1D0cache.demand_misses; + } + + action(l11m_profileMiss, "l11m", desc="l11m miss profile") { + ++L1D1cache.demand_misses; + } + + action(l1im_profileMiss, "l1lm", desc="l1im miss profile") { + ++L1Icache.demand_misses; + } + + action(l2m_profileMiss, "l2m", desc="l2m miss profile") { + ++L2cache.demand_misses; + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + l1im_profileMiss; + a2_allocateL2; + i1_invCluster; + ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // L2 miss as well + l10m_profileMiss; + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + l11m_profileMiss; + // track misses, if implemented + // L2 miss as well + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, C0_Store_L1miss, I_M0) {L1D0TagArrayRead,L2TagArrayRead} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + a2_allocateL2; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(I, C1_Store_L1miss, I_M1) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + 
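+ // The L2 already holds the line in S, so just schedule the delayed L2-to-L1I fill.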
fi_L2ToL1; + p_popMandatoryQueue; + } + + transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + fi_L2ToL1; + p_popMandatoryQueue; + } + + transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead}{ + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead,L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // THES SHOULD NOT BE INSTANTANEOUS BUT OH WELL FOR NOW + transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayWrite,L1D0TagArrayRead, L2TagArrayRead, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay + p_popMandatoryQueue; + } + + transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(E0, C1_Load_L1miss, E0_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead } { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + s0_storeDone; + p_popMandatoryQueue; + } + + transition(E0, C1_Store_L1miss, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + a1_allocateL1D; + l11m_profileMiss; + i0_invCluster; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + a1_allocateL1D; + 
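+ // Cluster 1 misses in L1D1 on its own exclusive line: count the miss and fill from L2.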
l11m_profileMiss; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + a0_allocateL1D; + l10m_profileMiss; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead,L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + l10m_profileMiss; + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + p_popMandatoryQueue; + } + + transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue CtoD + l10m_profileMiss; + a0_allocateL1D; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l11m_profileMiss; + a1_allocateL1D; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss + l1im_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead L2TagArrayRead } { + l2m_profileMiss; // permissions miss + l10m_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + p_popMandatoryQueue; + } + + transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + 
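+ // Load miss in L1D0 on a line cluster 0 already holds modified: count it and fill from L2.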
l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead,L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} { + a0_allocateL1D; + s0_storeDone; + p_popMandatoryQueue; + } + + transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + p_popMandatoryQueue; + } + + transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite} { + a1_allocateL1D; + s1_storeDone; + p_popMandatoryQueue; + } + + // end transitions from base + + // Begin simple hit transitions + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, + Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} { + // track hits, if implemented + l0_loadDone; + p_popMandatoryQueue; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, + Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} { + // track hits, if implemented + l1_loadDone; + p_popMandatoryQueue; + } + + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} { + // track hits, if implemented + il0_loadDone; + p_popMandatoryQueue; + } + + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayWrite} { + // track hits, if implemented + il1_loadDone; + p_popMandatoryQueue; + } + + // end simple hit transitions + + // Transitions from transient states + + // recycles + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1, + O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0, + O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, + IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F, + O_F0, O_F1, 
O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F, + E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C, + S_C}, {Ifetch0_L1miss, Ifetch1_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1, + Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms, + M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1 S_F0, O_F0, + Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F, + M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0, O_M0, Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_Ms}, {C0_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1, + O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, + M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS, + PrbInvData, PrbInvDataDemand, PrbInv, PrbShrData, PrbShrDataDemand}) {} { + zz_recycleMandatoryQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary. + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} { + zz_recycleMandatoryQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary. 
+ } + + transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Load_L1miss, I_M0Ms){ + l11m_profileMiss; + l2m_profileMiss; + a1_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Load_L1miss, I_M1Ms){ + l10m_profileMiss; + l2m_profileMiss; + a0_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Store_L1miss, I_M0M1) { + l11m_profileMiss; + l2m_profileMiss; + a1_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Store_L1miss, I_M1M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + l2m_profileMiss; + a0_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_E0S, C1_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_E1S, C0_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l10m_profileMiss; + l2m_profileMiss; + a0_allocateL1D; + p_popMandatoryQueue; + } + + transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F0, C1_Load_L1miss, S_F) { L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} { + i0_invCluster; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} { + i1_invCluster; + } + + transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} { + ii_invIcache; + } + + transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead,L1D0TagArrayRead, L1D1TagArrayRead, L1ITagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vc_victim; + ib_invBothClusters; + i2_invL2; + ii_invIcache; + } + + transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayRead, L1D1TagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vd_victim; + i2_invL2; + ib_invBothClusters; // nothing will happen for D0 on M1, vice versa + } + + transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi0_loadDone; + uu_sendUnblock; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckS, S_C) { L1IDataArrayWrite,L2DataArrayWrite} { + // does not need send done since the rdblks was "sinked" + wi_writeIcache; + 
xi0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckS, S_C) { L1D1DataArrayWrite,L2DataArrayWrite} { + wi_writeIcache; + xi1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M0, NB_AckM, M0) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xs1_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + // THESE MO->M1 should not be instantaneous but oh well for now. + transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + i0_invCluster; + s1_storeDone; + pr_popResponseQueue; + } + + transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xs1_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + i1_invCluster; + s0_storeDone; + pr_popResponseQueue; + } + + // Above shoudl be more like this, which has some latency to xfer to L1 + transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} { + w0_writeDcache; + xs0_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + f1_L2ToL1; + pr_popResponseQueue; + } + + transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite,L2DataArrayWrite} { + w1_writeDcache; + xs1_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + f0_L2ToL1; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xl1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w1_writeDcache; + xl1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + il0_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F1, L2_to_L1I, 
S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + il1_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_F, L2_to_L1D1, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F, L2_to_L1D1, Ms_F0) {L1IDataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(IF_E0S, L2_to_L1D0, I_E0S) {} { + pt_popTriggerQueue; + } + + 
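Every transition above follows the same table-driven pattern: a (current state, triggering event) pair names the tag/data arrays it will touch, runs a fixed action sequence, and leaves the block in its next state. As a rough C++ analogue of that dispatch (a minimal sketch only; CoreState, CoreEvent, do_transition and the action bodies are invented here and are not the code SLICC generates):

// Minimal sketch of a table-driven coherence transition, loosely modelled on
// the transition(S_F0, L2_to_L1D0, S) { c0_copyL2ToL1; l0_loadDone; ... }
// entries above.  All identifiers here are invented for illustration.
#include <cstdio>
#include <functional>
#include <map>
#include <vector>

enum class CoreState { S_F0, S_F1, S };          // tiny subset of the protocol states
enum class CoreEvent { L2_to_L1D0, L2_to_L1D1 }; // tiny subset of the events

struct Transition {
    CoreState next;                                // state after the actions run
    std::vector<std::function<void()>> actions;    // e.g. copy L2->L1, mark load done
};

// (state, event) -> transition, mirroring the SLICC transition table.
std::map<std::pair<CoreState, CoreEvent>, Transition> table = {
    {{CoreState::S_F0, CoreEvent::L2_to_L1D0},
     {CoreState::S, {[]{ std::puts("copy L2 block into L1D0"); },
                     []{ std::puts("signal load done to core 0"); }}}},
    {{CoreState::S_F1, CoreEvent::L2_to_L1D1},
     {CoreState::S, {[]{ std::puts("copy L2 block into L1D1"); },
                     []{ std::puts("signal load done to core 1"); }}}},
};

CoreState do_transition(CoreState s, CoreEvent e) {
    const Transition &t = table.at({s, e});   // an unknown pair would be a protocol error
    for (const auto &a : t.actions) a();      // run the action sequence in order
    return t.next;                            // block moves to its new state
}

int main() {
    CoreState s = do_transition(CoreState::S_F0, CoreEvent::L2_to_L1D0);
    return s == CoreState::S ? 0 : 1;
}

The real generated machine additionally charges the listed tag/data array accesses and pops the trigger queue; the sketch keeps only the state/action lookup.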
transition(IF_E1S, L2_to_L1D1, I_E1S) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D0, IF1_ES) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D1, IF0_ES) {} { + pt_popTriggerQueue; + } + + transition(IF0_ES, L2_to_L1D0, I_ES) {} { + pt_popTriggerQueue; + } + + transition(IF1_ES, L2_to_L1D1, I_ES) {} { + pt_popTriggerQueue; + } + + transition(F_S0, L2_to_L1I, S0) {} { + pt_popTriggerQueue; + } + + transition(F_S1, L2_to_L1I, S1) {} { + pt_popTriggerQueue; + } + + transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + xs0_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + xs1_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + ra_sendReplAck; + sdi_sendDoneInvalid; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + ra_sendReplAck; + sdi_sendDoneInvalid; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + sdv_sendDoneValid; + nS_issueRdBlkS; + d_deallocateTBE; // FOO + pr_popResponseQueue; + } + + transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + sdv_sendDoneValid; + nS_issueRdBlkS; + d_deallocateTBE; // FOO + pr_popResponseQueue; + } + + // Writeback cancel "ack" + transition(I_C, NB_AckWB, I) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdi_sendDoneInvalid; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + transition(S_C, NB_AckWB, S) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + // Begin Probe Transitions + + transition({Ms, M0, M1, O}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + i2_invL2; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S, I}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; // only relevant for S + pp_popProbeQueue; + } + + transition(S_C, {PrbInvData, PrbInvDataDemand}, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, {PrbInvData, PrbInvDataDemand}, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; // nothing will happen in I + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(S_C, PrbInv, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + 
i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O}, {PrbShrData, PrbShrDataDemand}, O) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S}, {PrbShrData, PrbShrDataDemand}, S) {L2TagArrayRead, L2TagArrayWrite} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_C, {PrbShrData, PrbShrDataDemand}) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({I, I_C}, {PrbShrData, PrbShrDataDemand}) {L2TagArrayRead} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M0) + a0_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M1, I_E1S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M1) + a1_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbInvDataDemand, PrbShrData, PrbShrDataDemand}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + a0_allocateL1D; + a1_allocateL1D; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S, I_M1, I_E1S}, {PrbShrData, PrbShrDataDemand}) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition(ES_I, {PrbInvData, PrbInvDataDemand}, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, {PrbInvData, PrbInvDataDemand}, I_C) {} { + pdt_sendProbeResponseDataFromTBE; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, {PrbShrData, PrbShrDataDemand}, ES_I) {} { + ph_sendProbeResponseHit; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_I, {PrbShrData, PrbShrDataDemand}, MO_I) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_S0, {PrbInvData, PrbInvDataDemand}, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + nS_issueRdBlkS; + d_deallocateTBE; + pp_popProbeQueue; + } + + transition(MO_S1, {PrbInvData, PrbInvDataDemand}, S1_C) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + nS_issueRdBlkS; + d_deallocateTBE; + pp_popProbeQueue; + } + + transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + nS_issueRdBlkS; + d_deallocateTBE; + pp_popProbeQueue; + } + + transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + nS_issueRdBlkS; + d_deallocateTBE; + pp_popProbeQueue; + } + + transition({MO_S0, 
MO_S1}, {PrbShrData, PrbShrDataDemand}) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E0S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F1, Es_F1, E1_F, E0_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F, Es_F}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition(Si_F0, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(Si_F1, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition({Es_F0, E0_F, E1_Es}, {PrbShrData, PrbShrDataDemand}, S_F0) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({Es_F1, E1_F, E0_Es}, {PrbShrData, PrbShrDataDemand}, S_F1) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(Es_F, {PrbShrData, PrbShrDataDemand}, S_F) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, {PrbShrData, PrbShrDataDemand}) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_M0, {PrbInvData, PrbInvDataDemand}, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M0, {PrbInvData, PrbInvDataDemand}, I_M0) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M0, O_M0}, {PrbInv}, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(S_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {} { + forward_eviction_to_cpu0; 
+ forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M1, O_M1}, {PrbInv}, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S0, S0_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S1, S1_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M0, S_M1}, {PrbShrData, PrbShrDataDemand}) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({O_M0, O_M1}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({S0, S1, S0_C, S1_C}, {PrbShrData, PrbShrDataDemand}) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, {PrbInvData, PrbInvDataDemand}, IF_E0S) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, {PrbInvData, PrbInvDataDemand}, IF_E1S) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, {PrbInvData, PrbInvDataDemand}, IF_ES) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, PrbInv, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms}, {PrbShrData, PrbShrDataDemand}, O_F0) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms}, {PrbShrData, PrbShrDataDemand}, O_F1) {} { + } + + transition({Ms_F}, {PrbShrData, PrbShrDataDemand}, O_F) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({O_F0, O_F1, O_F}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + // END TRANSITIONS +} + + diff --git 
a/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm new file mode 100644 index 000000000..52d87fb8b --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm @@ -0,0 +1,2038 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:Directory, "AMD_Base-like protocol") +: DirectoryMemory * directory; + CacheMemory * L3CacheMemory; + Cycles response_latency := 5; + Cycles response_latency_regionDir := 1; + Cycles l3_hit_latency := 30; + bool useL3OnWT := "False"; + Cycles to_memory_controller_latency := 1; + + // From the Cores + MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock"; + + // To the Cores + MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response"; + + // From region buffer + MessageBuffer * reqFromRegBuf, network="From", virtual_network="7", vnet_type="request"; + + // To Region directory + MessageBuffer * reqToRegDir, network="To", virtual_network="5", vnet_type="request"; + MessageBuffer * reqFromRegDir, network="From", virtual_network="5", vnet_type="request"; + MessageBuffer * unblockToRegDir, network="To", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * triggerQueue; + MessageBuffer * L3triggerQueue; + MessageBuffer * responseFromMemory; +{ + // STATES + state_declaration(State, desc="Directory states", default="Directory_State_U") { + U, AccessPermission:Backing_Store, desc="unblocked"; + BR, AccessPermission:Backing_Store, desc="got CPU read request, blocked while sent to L3"; + BW, AccessPermission:Backing_Store, desc="got CPU write request, blocked while sent to L3"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + // BL is Busy because it's possible for the data only to be in the network + // in the WB, L3 has sent it and gone on with its business in possibly I + // state. 
+ BI, AccessPermission:Backing_Store, desc="Blocked waiting for inv ack from core"; + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + + // These are needed for when a private requests was issued before an inv was received + // for writebacks + BS_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BP_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + // for reads + BS_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BP_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + } + + // Events + enumeration(Event, desc="Directory events") { + // CPU requests + RdBlkS, desc="..."; + RdBlkM, desc="..."; + RdBlk, desc="..."; + WriteThrough, desc="WriteThrough Message"; + Atomic, desc="Atomic Message"; + + RdBlkSP, desc="..."; + RdBlkMP, desc="..."; + RdBlkP, desc="..."; + VicDirtyP, desc="..."; + VicCleanP, desc="..."; + WriteThroughP, desc="WriteThrough Message"; + AtomicP, desc="Atomic Message"; + + // writebacks + VicDirty, desc="..."; + VicClean, desc="..."; + CPUData, desc="WB data from CPU"; + StaleWB, desc="WB response for a no longer valid request"; + + // probe responses + CPUPrbResp, desc="Probe Response Msg"; + LastCPUPrbResp, desc="Last Probe Response Msg"; + + ProbeAcksComplete, desc="Probe Acks Complete"; + + L3Hit, desc="Hit in L3 return data to core"; + + // Memory Controller + MemData, desc="Fetched data from memory arrives"; + WBAck, desc="Writeback Ack from memory arrives"; + + CoreUnblock, desc="Core received data, unblock"; + UnblockWriteThrough, desc="unblock, self triggered"; + + StaleVicDirty, desc="Core invalidated before VicDirty processed"; + StaleVicDirtyP, desc="Core invalidated before VicDirty processed"; + + // For region protocol + CPUReq, desc="Generic CPU request"; + Inv, desc="Region dir needs a block invalidated"; + Downgrade, desc="Region dir needs a block downgraded"; + + // For private accesses (bypassed reg-dir) + CPUReadP, desc="Initial req from core, sent to L3"; + CPUWriteP, desc="Initial req from core, sent to L3"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L3DataArrayRead, 
desc="Read the data array"; + L3DataArrayWrite, desc="Write the data array"; + L3TagArrayRead, desc="Read the data array"; + L3TagArrayWrite, desc="Write the data array"; + } + + // TYPES + + // DirectoryEntry + structure(Entry, desc="...", interface="AbstractEntry") { + State DirectoryState, desc="Directory state"; + DataBlock DataBlk, desc="data for the block"; + NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore"; + } + + structure(CacheEntry, desc="...", interface="AbstractCacheEntry") { + DataBlock DataBlk, desc="data for the block"; + MachineID LastSender, desc="Mach which this block came from"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + DataBlock DataBlkAux, desc="Auxiliary data for the block"; + bool Dirty, desc="Is the data dirty?"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID WTRequestor, desc="WT Requestor"; + bool Cached, desc="data hit in Cache"; + bool MemData, desc="Got MemData?",default="false"; + bool wtData, desc="Got write through data?",default="false"; + bool atomicData, desc="Got Atomic op?",default="false"; + Cycles InitialRequestTime, desc="..."; + Cycles ForwardRequestTime, desc="..."; + Cycles ProbeRequestStartTime, desc="..."; + bool DemandRequest, desc="for profiling"; + MachineID LastSender, desc="Mach which this block came from"; + bool L3Hit, default="false", desc="Was this an L3 hit?"; + bool TriggeredAcksComplete, default="false", desc="True if already triggered acks complete"; + WriteMask writeMask, desc="outstanding write through mask"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_tbe(TBE a); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" { + Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr)); + + if (is_valid(dir_entry)) { + //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk); + return dir_entry; + } + + dir_entry := static_cast(Entry, "pointer", + directory.allocate(addr, new Entry)); + return dir_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if (is_valid(tbe) && tbe.MemData) { + DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); + return tbe.DataBlk; + } + DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); + return getDirectoryEntry(addr).DataBlk; + } + + State getState(TBE tbe, CacheEntry entry, Addr addr) { + return getDirectoryEntry(addr).DirectoryState; + } + + State getStateFromAddr(Addr addr) { + return getDirectoryEntry(addr).DirectoryState; + } + + void setState(TBE tbe, CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).DirectoryState := state; + } + + AccessPermission getAccessPermission(Addr addr) { + // For this Directory, all permissions are just tracked in Directory, since + // it's not possible to have something in TBE but not Dir, just keep track + // of state all in one place. 
+ if(directory.isPresent(addr)) { + return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void setAccessPermission(CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state)); + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // ** OUT_PORTS ** + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); + out_port(responseNetwork_out, ResponseMsg, responseToCore); + + out_port(requestNetworkReg_out, CPURequestMsg, reqToRegDir); + out_port(regAckNetwork_out, UnblockMsg, unblockToRegDir); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue); + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=7) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe); + } else if (in_msg.Type == TriggerType:UnblockWriteThrough) { + trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=6) { + if (L3TriggerQueue_in.isReady(clockEdge())) { + peek(L3TriggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:L3Hit) { + trigger(Event:L3Hit, 
in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + // Unblock Network + in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=5) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + trigger(Event:CoreUnblock, in_msg.addr, entry, tbe); + } + } + } + + // Core response network + in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=4) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + DPRINTF(RubySlicc, "core responses %s\n", in_msg); + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + if (is_valid(tbe) && tbe.NumPendingAcks == 1 + && tbe.TriggeredAcksComplete == false) { + trigger(Event:LastCPUPrbResp, in_msg.addr, entry, tbe); + } else { + trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:CPUData) { + trigger(Event:CPUData, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, entry, tbe); + } else { + error("Unexpected response type"); + } + } + } + } + + // off-chip memory request/response is done + in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=3) { + if (memQueue_in.isReady(clockEdge())) { + peek(memQueue_in, MemoryMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == MemoryRequestType:MEMORY_READ) { + trigger(Event:MemData, in_msg.addr, entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) { + trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them. 
+ } else { + DPRINTF(RubySlicc, "%s\n", in_msg.Type); + error("Invalid message"); + } + } + } + } + + in_port(regBuf_in, CPURequestMsg, reqFromRegBuf, rank=2) { + if (regBuf_in.isReady(clockEdge())) { + peek(regBuf_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceRequestType:ForceInv) { + trigger(Event:Inv, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:ForceDowngrade) { + trigger(Event:Downgrade, in_msg.addr, entry, tbe); + } else { + error("Bad request from region buffer"); + } + } + } + } + + in_port(regDir_in, CPURequestMsg, reqFromRegDir, rank=1) { + if (regDir_in.isReady(clockEdge())) { + peek(regDir_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThrough, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + trigger(Event:VicDirty, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + trigger(Event:VicClean, in_msg.addr, entry, tbe); + } + } else { + error("Bad message type fwded from Region Dir"); + } + } + } + } + + in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Private) { + // Bypass the region dir + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlkP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkSP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkMP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:AtomicP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThroughP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirtyP for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicDirtyP, in_msg.addr, 
entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicCleanP for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicCleanP, in_msg.addr, entry, tbe); + } + } else { + error("Bad message type for private access"); + } + } else { + trigger(Event:CPUReq, in_msg.addr, entry, tbe); + } + } + } + } + + // Actions + action(s_sendResponseS, "s", desc="send Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := tbe.DemandRequest; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(es_sendResponseES, "es", desc="send Exclusive or Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached) { + out_msg.State := CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := tbe.DemandRequest; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(m_sendResponseM, "m", desc="send Modified response") { + if (tbe.wtData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } else { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := tbe.DemandRequest; + out_msg.L3Hit := tbe.L3Hit; + if (tbe.atomicData) { + 
out_msg.WTRequestor := tbe.WTRequestor; + } + DPRINTF(RubySlicc, "%s\n", out_msg); + } + if (tbe.atomicData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + } + + action(sb_sendResponseSBypass, "sb", desc="send Shared response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := false; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(esb_sendResponseESBypass, "esb", desc="send Exclusive or Shared response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached || in_msg.ForceShared) { + out_msg.State := CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := false; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(mbwt_sendResponseWriteThroughBypass, "mbwt", desc="send write through response") { + peek(requestNetwork_in, CPURequestMsg) { + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.DemandRequest := false; + } + } else { + assert(in_msg.Type == CoherenceRequestType:Atomic); + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := getDirectoryEntry(address).DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := in_msg.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + 
out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := false; + out_msg.L3Hit := tbe.L3Hit; + out_msg.WTRequestor := in_msg.WTRequestor; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + + action(mb_sendResponseMBypass, "mb", desc="send Modified response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := false; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(c_sendResponseCtoD, "c", desc="send CtoD Ack") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.DemandRequest := tbe.DemandRequest; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(cp_sendResponseCtoDP, "cp", desc="send CtoD Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.DemandRequest := false; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(regDir_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.DemandRequest := false; + } + } + } + + 
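+  // Note: the *P actions below mirror the region-dir versions above but peek the
+  // private request network (requestNetwork_in) instead of regDir_in, so the
+  // requestor and timestamps are taken from the bypassing private request.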
action(wp_sendResponseWBAckP, "wp", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.DemandRequest := false; + } + } + } + + action(wc_sendResponseWBAck, "wc", desc="send WB Ack for cancel") { + peek(responseNetwork_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Sender); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(ra_ackRegionDir, "ra", desc="Ack region dir") { + peek(regDir_in, CPURequestMsg) { + if (in_msg.NoAckNeeded == false) { + enqueue(responseNetwork_out, ResponseMsg, response_latency_regionDir) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DirReadyAck; + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + } + + action(l_queueMemRdReq, "lr", desc="Read data from memory") { + peek(regDir_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + tbe.DataBlk := entry.DataBlk; + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + DPRINTF(RubySlicc, "L3 data is %s\n", entry.DataBlk); + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(lrp_queueMemRdReqP, "lrp", desc="Read data from memory") { + peek(requestNetwork_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + tbe.DataBlk := entry.DataBlk; + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + DPRINTF(RubySlicc, "L3 data is %s\n", entry.DataBlk); + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(dcr_probeInvCoreData, "dcr", desc="probe inv cores, return data") { + peek(regBuf_in, CPURequestMsg) { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination := in_msg.Sharers; + tbe.NumPendingAcks := tbe.NumPendingAcks + in_msg.Sharers.count(); + DPRINTF(RubySlicc, "%s\n", out_msg); + APPEND_TRANSITION_COMMENT(" dcr: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + 
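+  // Like dcr above, ddr probes only the sharers tracked by the region buffer
+  // (in_msg.Sharers); the sc/ic actions further below instead broadcast to every
+  // CorePair, TCP, and SQC and then remove the requestor from the destination set.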
action(ddr_probeDownCoreData, "ddr", desc="probe downgrade cores, return data") {
+    peek(regBuf_in, CPURequestMsg) {
+      enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := ProbeRequestType:PrbDowngrade;
+        out_msg.ReturnData := true;
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.Destination := in_msg.Sharers;
+        tbe.NumPendingAcks := tbe.NumPendingAcks + in_msg.Sharers.count();
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+        APPEND_TRANSITION_COMMENT(" ddr: Acks remaining: ");
+        APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+        tbe.ProbeRequestStartTime := curCycle();
+      }
+    }
+  }
+
+  action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
+    peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+      enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := ProbeRequestType:PrbDowngrade;
+        out_msg.ReturnData := true;
+        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+        tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:CorePair) - 1;
+        out_msg.Destination.broadcast(MachineType:TCP);
+        tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:TCP);
+        out_msg.Destination.broadcast(MachineType:SQC);
+        tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:SQC);
+        out_msg.Destination.remove(in_msg.Requestor);
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+        APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
+        APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+        tbe.ProbeRequestStartTime := curCycle();
+      }
+    }
+  }
+
+  action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") {
+    peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + tbe.NumPendingAcks := tbe.NumPendingAcks +machineCount(MachineType:CorePair) - 1; + out_msg.Destination.broadcast(MachineType:TCP); + tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:TCP); + out_msg.Destination.broadcast(MachineType:SQC); + tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:SQC); + out_msg.Destination.remove(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(d_writeDataToMemory, "d", desc="Write data to memory") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing Data: %s to address %s\n", in_msg.DataBlk, + in_msg.addr); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(regDir_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask); + tbe.Dirty := false; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := in_msg.InitialRequestTime; + tbe.ForwardRequestTime := curCycle(); + tbe.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + tbe.DemandRequest := in_msg.DemandRequest; + } + } + + action(tp_allocateTBEP, "tp", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(requestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask); + tbe.Dirty := false; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := in_msg.InitialRequestTime; + tbe.ForwardRequestTime := curCycle(); + tbe.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + tbe.DemandRequest := false; + } + } + + action(sa_setAcks, 
"sa", desc="setAcks") { + peek(regDir_in, CPURequestMsg) { + tbe.NumPendingAcks := in_msg.Acks; + APPEND_TRANSITION_COMMENT(" waiting for acks "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + } + + action(tr_allocateTBE, "tr", desc="allocate TBE Entry for Region inv") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.NumPendingAcks := 0; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(wdp_writeBackDataPrivate, "wdp", desc="Write back data if needed") { + peek(requestNetwork_in, CPURequestMsg) { + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlkAux := getDirectoryEntry(address).DataBlk; + tbe.DataBlkAux.copyPartial(in_msg.DataBlk,in_msg.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlkAux; + } else{ + assert(in_msg.Type == CoherenceRequestType:Atomic); + tbe.DataBlkAux.atomicPartial(getDirectoryEntry(address).DataBlk,in_msg.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlkAux; + } + } + } + + action(wd_writeBackData, "wd", desc="Write back data if needed") { + if (tbe.wtData) { + DataBlock tmp := getDirectoryEntry(address).DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.atomicData) { + tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.Dirty == true) { + APPEND_TRANSITION_COMMENT(" Wrote data back "); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + } + + action(wdi_writeBackDataInv, "wdi", desc="Write back inv data if needed") { + // Kind of opposite from above...? + if (tbe.Dirty == true) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + APPEND_TRANSITION_COMMENT("Writing dirty data to dir"); + DPRINTF(RubySlicc, "Data %s: %s\n", address, tbe.DataBlk); + } else { + APPEND_TRANSITION_COMMENT("NOT!!! Writing dirty data to dir"); + } + } + + action(wdt_writeBackDataInvNoTBE, "wdt", desc="Write back inv data if needed no TBE") { + // Kind of opposite from above...? + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty == true) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + APPEND_TRANSITION_COMMENT("Writing dirty data to dir"); + DPRINTF(RubySlicc, "Data %s: %s\n", address, in_msg.DataBlk); + } else { + APPEND_TRANSITION_COMMENT("NOT!!! 
Writing dirty data to dir"); + } + } + } + + action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") { + peek(memQueue_in, MemoryMsg) { + if (tbe.Dirty == false) { + tbe.DataBlk := getDirectoryEntry(address).DataBlk; + } + tbe.MemData := true; + } + } + + action(ml_writeL3DataToTBE, "ml", desc="write L3 data to TBE") { + assert(tbe.Dirty == false); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + tbe.DataBlk := entry.DataBlk; + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + } + + action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender); + DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk); + if (tbe.wtData) { + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + } else if (tbe.Dirty) { + if(tbe.atomicData == false && tbe.wtData == false) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + } else { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.LastSender := in_msg.Sender; + } + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(yc_writeCPUDataToTBE, "yc", desc="write CPU Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender); + DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk); + if (tbe.Dirty) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := false; + tbe.LastSender := in_msg.Sender; + } + } + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + if (tbe.NumPendingAcks > 0) { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } else { + APPEND_TRANSITION_COMMENT(" Double ack! 
"); + } + assert(tbe.NumPendingAcks >= 0); + APPEND_TRANSITION_COMMENT(" Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(o_checkForCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + tbe.TriggeredAcksComplete := true; + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(ont_checkForCompletionNoTrigger, "ont", desc="check for ack completion, no trigger") { + if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) { + tbe.TriggeredAcksComplete := true; + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(rvp_removeVicDirtyIgnore, "rvp", desc="Remove ignored core") { + peek(requestNetwork_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") { + peek(regDir_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(r_sendRequestToRegionDir, "r", desc="send request to Region Directory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetworkReg_out, CPURequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "out dest: %s\n", map_Address_to_RegionDir(address)); + } + } + } + + action(ai_ackInvalidate, "ai", desc="Ack to let the reg-dir know that the inv is ordered") { + peek(regBuf_in, CPURequestMsg) { + enqueue(regAckNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg); + } + } + } + + action(aic_ackInvalidate, "aic", desc="Ack to let the reg-dir know that the inv is ordered") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.NoAckNeeded == false) { + enqueue(regAckNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + if (machineIDToMachineType(in_msg.Sender) == MachineType:CorePair) { + out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(0))); + } else { + out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(1))); + } + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg); + out_msg.wasValid := in_msg.isValid; + } + } + } + } + + action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { + peek(responseNetwork_in, ResponseMsg) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + 
assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } + } + } + + action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") { + if ((tbe.wtData || tbe.atomicData) && useL3OnWT) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(ali_allocateL3Block, "ali", desc="allocate the L3 block on ForceInv") { + if (tbe.Dirty == true) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(ali_allocateL3BlockNoTBE, "alt", desc="allocate the L3 block on ForceInv no TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" ali wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" ali wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } + } + } + } + + action(dl_deallocateL3, "dl", desc="deallocate the L3 block") { + L3CacheMemory.deallocate(address); + } + + action(p_popRequestQueue, "p", desc="pop 
request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(prd_popRegionQueue, "prd", desc="pop request queue") { + regDir_in.dequeue(clockEdge()); + } + + action(prb_popRegionBufQueue, "prb", desc="pop request queue") { + regBuf_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pm_popMemQueue, "pm", desc="pop mem queue") { + memQueue_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") { + L3TriggerQueue_in.dequeue(clockEdge()); + } + + action(pu_popUnblockQueue, "pu", desc="pop unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(yy_recycleResponseQueue, "yy", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(ww_stallAndWaitRegRequestQueue, "ww", desc="recycle region dir request queue") { + stall_and_wait(regDir_in, address); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(requestNetwork_in, address); + } + + action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") { + wakeUpBuffers(address); + } + + action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(z_stall, "z", desc="...") { + } + + // TRANSITIONS + + // transitions from U + + transition({BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Inv, Downgrade}) { + ww_stallAndWaitRegRequestQueue; + } + + transition(U, Inv, BI){L3TagArrayRead} { + tr_allocateTBE; + dcr_probeInvCoreData; // only need to invalidate sharers + ai_ackInvalidate; + prb_popRegionBufQueue; + } + + transition(U, Downgrade, BI){L3TagArrayRead} { + tr_allocateTBE; + ddr_probeDownCoreData; // only need to invalidate sharers + ai_ackInvalidate; + prb_popRegionBufQueue; + } + + // The next 2 transistions are needed in the event that an invalidation + // is waiting for its ack from the core, but the event makes it through + // the region directory before the acks. 
This wouldn't be needed if + // we waited to ack the region dir until the directory got all the acks + transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, WriteThrough, Atomic}) { + ww_stallAndWaitRegRequestQueue; + } + + transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {RdBlkSP, RdBlkMP, RdBlkP}) { + st_stallAndWaitRequest; + } + + transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {WriteThroughP,AtomicP}) { + st_stallAndWaitRequest; + } + + transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, WriteThrough, BM_PM){L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, {RdBlkM,Atomic}, BM_PM){L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, RdBlk, B_PM){L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, {RdBlkSP}, BS_M) {L3TagArrayRead} { + tp_allocateTBEP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, WriteThroughP, BM_M) {L3TagArrayRead} { + tp_allocateTBEP; + wp_sendResponseWBAckP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, {RdBlkMP,AtomicP}, BM_M) {L3TagArrayRead} { + tp_allocateTBEP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, RdBlkP, B_M) {L3TagArrayRead} { + tp_allocateTBEP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, VicDirtyP, BL) {L3TagArrayRead} { + tp_allocateTBEP; + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(U, VicCleanP, BL) {L3TagArrayRead} { + tp_allocateTBEP; + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BM_Pm, RdBlkSP, BM_Pm_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(BS_Pm, RdBlkSP, BS_Pm_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(B_Pm, RdBlkSP, B_Pm_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(BP, RdBlkSP, BP_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(BM_Pm, RdBlkMP, BM_Pm_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(BS_Pm, RdBlkMP, BS_Pm_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(B_Pm, RdBlkMP, B_Pm_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(BP, RdBlkMP, BP_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(BM_Pm, {WriteThroughP,AtomicP}, BM_Pm_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(BS_Pm, {WriteThroughP,AtomicP}, BS_Pm_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(B_Pm, {WriteThroughP,AtomicP}, B_Pm_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + 
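+  // The WriteThroughP/AtomicP bypass transitions (BS_Pm/BM_Pm/B_Pm above and BP
+  // below) update the directory copy in place via wdp_writeBackDataPrivate before
+  // the write-through/atomic response is sent.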
transition(BP, {WriteThroughP,AtomicP}, BP_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(BM_Pm, RdBlkP, BM_Pm_B) {L3DataArrayWrite} { + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(BS_Pm, RdBlkP, BS_Pm_B) {L3DataArrayWrite} { + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(B_Pm, RdBlkP, B_Pm_B) {L3DataArrayWrite}{ + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(BP, RdBlkP, BP_B) {L3DataArrayWrite}{ + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(BM_Pm_B, CoreUnblock, BM_Pm) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(BS_Pm_B, CoreUnblock, BS_Pm) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(B_Pm_B, CoreUnblock, B_Pm) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(BP_B, CoreUnblock, BP) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(BM_Pm_B, UnblockWriteThrough, BM_Pm) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_Pm_B, UnblockWriteThrough, BS_Pm) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(B_Pm_B, UnblockWriteThrough, B_Pm) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BP_B, UnblockWriteThrough, BP) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BM_Pm, VicDirtyP, BM_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BS_Pm, VicDirtyP, BS_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(B_Pm, VicDirtyP, B_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BP, VicDirtyP, BP_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BM_Pm, VicCleanP, BM_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BS_Pm, VicCleanP, BS_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(B_Pm, VicCleanP, B_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BP, VicCleanP, BP_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BM_Pm_BL, CPUData, BM_Pm) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(BS_Pm_BL, CPUData, BS_Pm) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(B_Pm_BL, CPUData, B_Pm) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(BP_BL, CPUData, BP) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition({BR, BW, BL}, {VicDirtyP, VicCleanP}) { + st_stallAndWaitRequest; + } + + transition({BR, BW, BL}, {VicDirty, VicClean}) { + ww_stallAndWaitRegRequestQueue; + } + + transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + dt_deallocateTBE; + d_writeDataToMemory; + al_allocateL3Block; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(BL, StaleWB, U) {L3TagArrayWrite} { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pr_popResponseQueue; + } + + transition({BI, B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirty, VicClean}) { + ww_stallAndWaitRegRequestQueue; + } + + transition({BI, B, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirtyP, VicCleanP}) { + st_stallAndWaitRequest; + 
} + + transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, WBAck) { + pm_popMemQueue; + } + + transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirtyP) { + rvp_removeVicDirtyIgnore; + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirty) { + rv_removeVicDirtyIgnore; + w_sendResponseWBAck; + prd_popRegionQueue; + } + + transition(U, VicDirty, BL) {L3TagArrayRead} { + t_allocateTBE; + ra_ackRegionDir; + w_sendResponseWBAck; + prd_popRegionQueue; + } + + transition(U, VicClean, BL) {L3TagArrayRead} { + t_allocateTBE; + ra_ackRegionDir; + w_sendResponseWBAck; + prd_popRegionQueue; + } + + transition({B, BR}, CoreUnblock, U) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition({B, BR}, UnblockWriteThrough, U) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_M, MemData, B) {L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BS_PM, MemData, BS_Pm) {} { + mt_writeMemDataToTBE; + wa_wakeUpDependents; + pm_popMemQueue; + } + + transition(BM_PM, MemData, BM_Pm){} { + mt_writeMemDataToTBE; + wa_wakeUpDependents; + pm_popMemQueue; + } + + transition(B_PM, MemData, B_Pm){} { + mt_writeMemDataToTBE; + wa_wakeUpDependents; + pm_popMemQueue; + } + + transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BM_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(B_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BS_PM, L3Hit, BS_Pm) { + wa_wakeUpDependents; + ptl_popTriggerQueue; + } + + transition(BM_PM, L3Hit, BM_Pm) { + wa_wakeUpDependents; + ptl_popTriggerQueue; + } + + transition(B_PM, L3Hit, B_Pm) { + wa_wakeUpDependents; + ptl_popTriggerQueue; + } + + transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP, BI}, CPUPrbResp) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition({B, B_M, BS_M, BM_M}, {CPUPrbResp, LastCPUPrbResp}) { + z_stall; + } + + transition({BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {CPUPrbResp, LastCPUPrbResp}) { + // recycling because PrbResponse and data come on the same network + yy_recycleResponseQueue; + } + + transition(U, {CPUPrbResp, LastCPUPrbResp}) {L3TagArrayRead, L3DataArrayWrite} { + 
aic_ackInvalidate; + wdt_writeBackDataInvNoTBE; + ali_allocateL3BlockNoTBE; + pr_popResponseQueue; + } + + transition(BL, {CPUPrbResp, LastCPUPrbResp}) {} { + aic_ackInvalidate; + y_writeProbeDataToTBE; + wdi_writeBackDataInv; + ali_allocateL3Block; + pr_popResponseQueue; + } + + transition(BS_PM, LastCPUPrbResp, BS_M) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition(BS_PM, ProbeAcksComplete, BS_M) {} { + pt_popTriggerQueue; + } + + transition(BM_PM, LastCPUPrbResp, BM_M) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition(BM_PM, ProbeAcksComplete, BM_M) {} { + pt_popTriggerQueue; + } + + transition(B_PM, LastCPUPrbResp, B_M) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition(B_PM, ProbeAcksComplete, B_M){} { + pt_popTriggerQueue; + } + + transition(BS_Pm, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BM_Pm, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_Pm, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BP, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3TagArrayWrite} { + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BI, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + wa_wakeUpDependents; + wdi_writeBackDataInv; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BI, ProbeAcksComplete, U) {L3TagArrayWrite, L3DataArrayWrite}{ + wa_wakeUpDependents; + wdi_writeBackDataInv; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm new file mode 100644 
index 000000000..823933e57 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +enumeration(CoherenceRequestType, desc="Coherence Request Types") { + // CPU Request Types ONLY + RdBlk, desc="Read Blk"; + RdBlkM, desc="Read Blk Modified"; + RdBlkS, desc="Read Blk Shared"; + VicClean, desc="L2 clean eviction"; + VicDirty, desc="L2 dirty eviction"; + + WrCancel, desc="want to cancel WB to Memory"; // should this be here? 
+
+  WBApproval,      desc="WB Approval";
+
+  // Messages between Dir and R-Dir
+  ForceInv,        desc="Send invalidate to the block";
+  ForceDowngrade,  desc="Send downgrade to the block";
+  Unblock,         desc="Used to let the dir know a message has been sunk";
+
+  // Messages between R-Dir and R-Buffer
+  PrivateNotify,   desc="Let region buffer know it has private access";
+  SharedNotify,    desc="Let region buffer know it has shared access";
+  WbNotify,        desc="Let region buffer know it saw its wb request";
+  Downgrade,       desc="Force the region buffer to downgrade to shared";
+  // Response to R-Dir (probably should be on a different network, but
+  // I need it to be ordered with respect to requests)
+  InvAck,          desc="Let the R-Dir know when the inv has occurred";
+
+  PrivateRequest,  desc="R-buf wants the region in private";
+  UpgradeRequest,  desc="R-buf wants the region in private";
+  SharedRequest,   desc="R-buf wants the region in shared (could respond with private)";
+  CleanWbRequest,  desc="R-buf wants to deallocate clean region";
+
+  NA,              desc="So we don't get segfaults";
+}
+
+enumeration(ProbeRequestType, desc="Probe Request Types") {
+  PrbDowngrade,    desc="Probe for Status"; // EtoS, MtoO, StoS
+  PrbInv,          desc="Probe to Invalidate";
+
+  // For regions
+  PrbRepl,         desc="Force the cache to do a replacement";
+  PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS
+}
+
+
+enumeration(CoherenceResponseType, desc="Coherence Response Types") {
+  NBSysResp,       desc="Northbridge response to CPU Rd request";
+  NBSysWBAck,      desc="Northbridge response ok to WB";
+  TDSysResp,       desc="TCCdirectory response to CPU Rd request";
+  TDSysWBAck,      desc="TCCdirectory response ok to WB";
+  TDSysWBNack,     desc="TCCdirectory response ok to drop";
+  CPUPrbResp,      desc="CPU Probe Response";
+  CPUData,         desc="CPU Data";
+  StaleNotif,      desc="Notification of Stale WBAck, No data to writeback";
+  CPUCancelWB,     desc="want to cancel WB to Memory";
+  MemData,         desc="Data from Memory";
+
+  // for regions
+  PrivateAck,      desc="Ack that r-buf received private notify";
+  RegionWbAck,     desc="Writeback Ack that r-buf completed deallocation";
+  DirReadyAck,     desc="Directory (mem ctrl)<->region dir handshake";
+}
+
+enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") {
+  Modified,  desc="Modified";
+  Owned,     desc="Owned state";
+  Exclusive, desc="Exclusive";
+  Shared,    desc="Shared";
+  NA,        desc="NA";
+}
+
+structure(CPURequestMsg, desc="...", interface="Message") {
+  Addr addr,                 desc="Physical address for this request";
+  Addr DemandAddress,        desc="Physical block address for this request";
+  CoherenceRequestType Type, desc="Type of request";
+  DataBlock DataBlk,         desc="data for the cache line"; // only for WB
+  bool Dirty,                desc="whether WB data is dirty"; // only for WB
+  MachineID Requestor,       desc="Node who initiated the request";
+  NetDest Destination,       desc="Multicast destination mask";
+  bool Shared,               desc="For CPU_WrVicBlk, vic is O not M. 
For CPU_ClVicBlk, vic is S"; + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + NetDest Sharers, desc="Caches that may have a valid copy of the data"; + bool ForceShared, desc="R-dir knows it is shared, pass on so it sends an S copy, not E"; + bool Private, default="false", desc="Requestor already has private permissions, no need for dir check"; + bool CtoDSinked, default="false", desc="This is true if the CtoD previously sent must have been sunk"; + + bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack"; + int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive"; + CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; + + bool functionalRead(Packet *pkt) { + // Only PUTX messages contains the data block + if (Type == CoherenceRequestType:VicDirty) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(NBProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="probe signal"; + bool ReturnData, desc="Indicates CPU should return data"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer"; + Addr DemandAddress, desc="Demand block address for a region request"; + MachineID Requestor, desc="Requestor id for 3-hop requests"; + bool NoAckNeeded, default="false", desc="For short circuting acks"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} + +structure(TDProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="TD_PrbNxtState signal"; + bool ReturnData, desc="Indicates CPU should return data"; + bool localCtoD, desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + MachineID Sender, desc="Node who sent the data"; + bool currentOwner, default="false", desc="Is the sender the current owner"; + bool DoneAck, default="false", desc="Is this a done ack?"; + bool Dirty, default="false", desc="Was block dirty when evicted"; + bool wasValid, default="false", desc="Was block valid when evicted"; + bool valid, default="false", desc="Is block valid"; + bool validToInvalid, default="false", desc="Was block valid when evicted"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those 
messages that contain the block + return false; + } +} + +// Response Messages seemed to be easily munged into one type +structure(ResponseMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe"; + MachineID Sender, desc="Node who sent the data"; + NetDest Destination, desc="Node to whom the data is sent"; + // Begin Used Only By CPU Response + DataBlock DataBlk, desc="data for the cache line"; + bool Hit, desc="probe hit valid line"; + bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + bool Ntsl, desc="indicates probed lin will be invalid after probe"; + bool UntransferredOwner, desc="pending confirmation of ownership change"; + // End Used Only By CPU Response + + // Begin NB Response Only + CoherenceState State, default=CoherenceState_NA, desc="What returned data from NB should be in"; + bool CtoD, desc="was the originator a CtoD?"; + // End NB Response Only + + bool NbReqShared, desc="modification of Shared field from initial request, e.g. hit by shared probe"; + + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + bool L3Hit, default="false", desc="Did memory or L3 supply the data?"; + MachineID OriginalResponder, desc="Mach which wrote the data to the L3"; + + bool NotCached, default="false", desc="True when the Region buffer has already evicted the line"; + + bool NoAckNeeded, default="false", desc="For short circuting acks"; + bool isValid, default="false", desc="Is acked block valid"; + + bool functionalRead(Packet *pkt) { + // Only PUTX messages contains the data block + if (Type == CoherenceResponseType:CPUData || + Type == CoherenceResponseType:MemData) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(UnblockMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + NetDest Destination, desc="Destination (always directory)"; + MessageSizeType MessageSize, desc="size category of the message"; +} + +enumeration(TriggerType, desc="Trigger Type") { + L2_to_L1, desc="L2 to L1 fill"; + AcksComplete, desc="NB received all needed Acks"; + + // For regions + InvNext, desc="Invalidate the next block"; + PrivateAck, desc="Loopback ack for machines with no Region Buffer"; + AllOutstanding, desc="All outstanding requests have finished"; + L3Hit, desc="L3 hit in dir"; + + // For region directory once the directory is blocked + InvRegion, desc="Invalidate region"; + DowngradeRegion, desc="downgrade region"; +} + +enumeration(CacheId, desc="Which Cache in the Core") { + L1I, desc="L1 I-cache"; + L1D0, desc="L1 D-cache cluster 0"; + L1D1, desc="L1 D-cache cluster 1"; + NA, desc="Default"; +} + +structure(TriggerMsg, desc="...", interface="Message") { + Addr addr, desc="Address"; + TriggerType Type, desc="Type of trigger"; + CacheId Dest, default="CacheId_NA", 
desc="Cache to invalidate"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm new file mode 100644 index 000000000..89f7d6fcb --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm @@ -0,0 +1,1368 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Jason Power + */ + +machine(MachineType:RegionBuffer, "Region Buffer for AMD_Base-like protocol") +: CacheMemory *cacheMemory; // stores only region addresses. 
Must set block size same as below + bool isOnCPU; + int blocksPerRegion := 64; // 4k regions + Cycles toDirLatency := 5; // Latency to fwd requests to directory + Cycles toRegionDirLatency := 5; // Latency for requests and acks to directory + Cycles nextEvictLatency := 1; // latency added between each block while evicting region + bool noTCCdir := "False"; + int TCC_select_num_bits := 1; + + // From the Cores + MessageBuffer * requestFromCore, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCore, network="From", virtual_network="2", vnet_type="response"; + + // Requests to the cores or directory + MessageBuffer * requestToNetwork, network="To", virtual_network="0", vnet_type="request"; + + // From Region-Dir + MessageBuffer * notifyFromRegionDir, network="From", virtual_network="7", vnet_type="request"; + MessageBuffer * probeFromRegionDir, network="From", virtual_network="8", vnet_type="request"; + + // From the directory + MessageBuffer * unblockFromDir, network="From", virtual_network="4", vnet_type="unblock"; + + // To the region-Dir + MessageBuffer * responseToRegDir, network="To", virtual_network="2", vnet_type="response"; + + MessageBuffer * triggerQueue; +{ + + // States + state_declaration(State, desc="Region states", default="RegionBuffer_State_NP") { + NP, AccessPermission:Invalid, desc="Not present in region directory"; + P, AccessPermission:Invalid, desc="Region is private to the cache"; + S, AccessPermission:Invalid, desc="Region is possibly shared with others"; + + NP_PS, AccessPermission:Invalid, desc="Intermediate state waiting for notify from r-dir"; + S_P, AccessPermission:Invalid, desc="Intermediate state while upgrading region"; + + P_NP, AccessPermission:Invalid, desc="Intermediate state while evicting all lines in region"; + P_S, AccessPermission:Invalid, desc="Intermediate state while downgrading all lines in region"; + + S_NP_PS, AccessPermission:Invalid, desc="Got an inv in S_P, waiting for all inv acks, then going to since the write is already out there NP_PS"; + P_NP_NP, AccessPermission:Invalid, desc="Evicting region on repl, then got an inv. 
Need to re-evict"; + + P_NP_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + P_S_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + S_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + S_NP_PS_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + + SS_P, AccessPermission:Invalid, desc="Waiting for CPU write that we know is there"; + + P_NP_W, AccessPermission:Invalid, desc="Waiting for writeback ack"; + + NP_W, AccessPermission:Invalid, desc="Got a done ack before request, waiting for that victim"; + } + + enumeration(Event, desc="Region directory events") { + CPURead, desc="Access from CPU core"; + CPUWrite, desc="Access from CPU core"; + CPUWriteback, desc="Writeback request from CPU core"; + + ReplRegion, desc="Start a replace on a region"; + + PrivateNotify, desc="Update entry to private state"; + SharedNotify, desc="Update entry to shared state"; + WbNotify, desc="Writeback notification received"; + InvRegion, desc="Start invalidating a region"; + DowngradeRegion,desc="Start invalidating a region"; + + InvAck, desc="Ack from core"; + + DoneAck, desc="Ack from core that request has finished"; + AllOutstanding, desc="All outstanding requests have now finished"; + + Evict, desc="Loopback to evict each block"; + LastAck_PrbResp, desc="Done eviciting all the blocks, got the last ack from core, now respond to region dir"; + LastAck_CleanWb, desc="Done eviciting all the blocks, got the last ack from core, now start clean writeback (note the dir has already been updated)"; + + StallAccess, desc="Wait for the done ack on the address before proceeding"; + StallDoneAck, desc="Wait for the access on the address before proceeding"; + + StaleRequest, desc="Got a stale victim from the cache, fwd it without incrementing outstanding"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + structure(BoolVec, external="yes") { + bool at(int); + void resize(int); + void clear(); + int size(); + } + + structure(Entry, desc="Region entry", interface="AbstractCacheEntry") { + Addr addr, desc="Base address of this region"; + State RegionState, desc="Region state"; + DataBlock DataBlk, desc="Data for the block (always empty in region buffer)"; + BoolVec ValidBlocks, desc="A vector to keep track of valid blocks"; + int NumValidBlocks, desc="Number of trues in ValidBlocks to avoid iterating"; + BoolVec UsedBlocks, desc="A vector to keep track of blocks ever valid"; + bool dirty, desc="Dirty as best known by the region buffer"; + // This is needed so we don't ack an invalidate until all requests are ordered + int NumOutstandingReqs, desc="Total outstanding private/shared requests"; + BoolVec OutstandingReqs, desc="Blocks that have outstanding private/shared requests"; + bool MustDowngrade, desc="Set when we got a downgrade before the shd or pvt permissions"; + Cycles ProbeRequestTime, default="Cycles(0)", desc="Time region dir started the probe"; + Cycles InitialRequestTime, default="Cycles(0)", desc="Time message was sent to region dir"; + bool MsgSentToDir, desc="True if the current request required a message to the dir"; + bool clearOnDone, default="false", desc="clear valid bit when request completes"; + Addr clearOnDoneAddr, desc="clear valid bit when request completes"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + //int NumValidBlocks, 
desc="Number of blocks valid so we don't have to count a BoolVec"; + BoolVec ValidBlocks, desc="A vector to keep track of valid blocks"; + bool AllAcksReceived, desc="Got all necessary acks from dir"; + bool DoneEvicting, desc="Done iterating through blocks checking for valids"; + BoolVec AcksReceived, desc="Received acks for theses blocks\n"; + bool SendAck, desc="If true, send an ack to the r-dir at end of inv"; + ProbeRequestType MsgType, desc="Type of message to send while 'evicting' "; + int NumOutstandingReqs, desc="Total outstanding private/shared requests"; + BoolVec OutstandingReqs, desc="Blocks that have outstanding private/shared requests"; + MachineID Requestor, desc="Requestor for three hop transactions"; + bool DemandRequest, default="false", desc="Associated with a demand request"; + Addr DemandAddress, desc="Address for the demand request"; + bool DoneAckReceived, default="false", desc="True if the done ack arrived before the message"; + Addr DoneAckAddr, desc="Address of the done ack received early"; + int OutstandingThreshold, desc="Number of outstanding requests to trigger AllOutstanding on"; + + ProbeRequestType NewMsgType, desc="Type of message to send while 'evicting' "; + MachineID NewRequestor, desc="Requestor for three hop transactions"; + bool NewDemandRequest, default="false", desc="Associated with a demand request"; + Addr NewDemandAddress, desc="Address for the demand request"; + bool dirty, desc="dirty"; + bool AllOutstandingTriggered, default="false", desc="bit for only one all outstanding"; + int OutstandingAcks, default="0", desc="number of acks to wait for"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + // Stores only region addresses + TBETable TBEs, template="<RegionBuffer_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + int blockBits, default="RubySystem::getBlockSizeBits()"; + int blockBytes, default="RubySystem::getBlockSizeBytes()"; + int regionBits, default="log2(m_blocksPerRegion)"; + + // Functions + + int getRegionOffset(Addr addr) { + if (blocksPerRegion > 1) { + Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1); + int ret := addressToInt(offset); + assert(ret < blocksPerRegion); + return ret; + } else { + return 0; + } + } + + Addr getRegionBase(Addr addr) { + return maskLowOrderBits(addr, blockBits+regionBits); + } + + Addr getNextBlock(Addr addr) { + Addr a := addr; + return makeNextStrideAddress(a, 1); + } + + MachineID getPeer(MachineID mach, Addr address) { + if (isOnCPU) { + return createMachineID(MachineType:CorePair, intToID(0)); + } else if (noTCCdir) { + return mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + } else { + return createMachineID(MachineType:TCCdir, intToID(0)); + } + } + + bool isOutstanding(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe) && tbe.OutstandingReqs.size() > 0) { + DPRINTF(RubySlicc, " outstanding tbe reqs %s %s %d %d\n", + tbe.OutstandingReqs, addr, getRegionOffset(addr), + tbe.OutstandingReqs.at(getRegionOffset(addr))); + return tbe.OutstandingReqs.at(getRegionOffset(addr)); + } else if (is_valid(cache_entry)) { + DPRINTF(RubySlicc, " 
outstanding cache reqs %s %s %d %d\n", + cache_entry.OutstandingReqs, addr, getRegionOffset(addr), + cache_entry.OutstandingReqs.at(getRegionOffset(addr))); + return cache_entry.OutstandingReqs.at(getRegionOffset(addr)); + } else { + return false; + } + } + + bool isOnGPU() { + if (isOnCPU) { + return false; + } + return true; + } + + bool isRead(CoherenceRequestType type) { + return (type == CoherenceRequestType:RdBlk || type == CoherenceRequestType:RdBlkS || + type == CoherenceRequestType:VicClean); + } + + bool presentOrAvail(Addr addr) { + return cacheMemory.isTagPresent(getRegionBase(addr)) || cacheMemory.cacheAvail(getRegionBase(addr)); + } + + // Returns a region entry! + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr))); + } + + TBE getTBE(Addr addr), return_by_pointer="yes" { + return TBEs.lookup(getRegionBase(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(getRegionBase(addr)).DataBlk; + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.RegionState; + } + return State:NP; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + if (is_valid(cache_entry)) { + cache_entry.RegionState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := getTBE(addr); + if(is_valid(tbe)) { + return RegionBuffer_State_to_permission(tbe.TBEState); + } + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return RegionBuffer_State_to_permission(cache_entry.RegionState); + } + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + functionalMemoryRead(pkt); + } + + int functionalWrite(Addr addr, Packet *pkt) { + if (functionalMemoryWrite(pkt)) { + return 1; + } else { + return 0; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(RegionBuffer_State_to_permission(state)); + } + } + + void recordRequestType(RequestType stat, Addr addr) { + if (stat == RequestType:TagArrayRead) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (stat == RequestType:TagArrayWrite) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:TagArrayRead) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + // Overloaded outgoing request nework for both probes to cores and reqeusts + // to the directory. + // Fix Me: These forwarded requests need to be on a separate virtual channel + // to avoid deadlock! 
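The region buffer above tracks coherence at region rather than block granularity: getRegionOffset() extracts which block inside its region an address names, and getRegionBase() masks an address down to the first byte of its region. As a rough standalone illustration (not part of this patch), the C++ sketch below reproduces that bit arithmetic assuming the default blocksPerRegion of 64 and a 64-byte Ruby block size, i.e. 4 KB regions; the regionBase/regionOffset names and the main() driver are hypothetical.

#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Assumed defaults: 64-byte blocks (blockBits = 6) and 64 blocks per
// region (regionBits = 6), giving 4 KB regions.
static const int blockBits = 6;
static const int regionBits = 6;
static const int blocksPerRegion = 1 << regionBits;

// Analogue of getRegionBase(): clear the low blockBits + regionBits bits.
static uint64_t regionBase(uint64_t addr)
{
    return addr & ~((uint64_t(1) << (blockBits + regionBits)) - 1);
}

// Analogue of getRegionOffset(): select bits [blockBits, blockBits+regionBits-1].
static int regionOffset(uint64_t addr)
{
    return int((addr >> blockBits) & (blocksPerRegion - 1));
}

int main()
{
    uint64_t addr = 0x12345;  // arbitrary physical address
    assert(regionOffset(addr) < blocksPerRegion);
    std::printf("addr 0x%" PRIx64 " -> region base 0x%" PRIx64 ", block %d of %d\n",
                addr, regionBase(addr), regionOffset(addr), blocksPerRegion);
    return 0;
}

In the SLICC above the same result comes from maskLowOrderBits() and bitSelect(), with regionBits derived from log2(m_blocksPerRegion).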
+ out_port(requestNetwork_out, CPURequestMsg, requestToNetwork); + out_port(probeNetwork_out, NBProbeRequestMsg, requestToNetwork); + + out_port(responseNetwork_out, ResponseMsg, responseToRegDir); + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=4) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := getTBE(in_msg.addr); + DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr)); + assert(is_valid(tbe)); + if (in_msg.Type == TriggerType:AcksComplete) { + if (tbe.SendAck) { + trigger(Event:LastAck_PrbResp, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:LastAck_CleanWb, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == TriggerType:AllOutstanding) { + trigger(Event:AllOutstanding, in_msg.addr, cache_entry, tbe); + } else { + assert(in_msg.Type == TriggerType:InvNext); + trigger(Event:Evict, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(unblockNetwork_in, UnblockMsg, unblockFromDir, rank=3) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.DoneAck) { + if (isOutstanding(tbe, cache_entry, in_msg.addr)) { + trigger(Event:DoneAck, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:StallDoneAck, in_msg.addr, cache_entry, tbe); + } + } else { + assert(is_valid(tbe)); + trigger(Event:InvAck, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromRegionDir, rank=2) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + assert(getRegionBase(in_msg.addr) == in_msg.addr); + if (in_msg.Type == ProbeRequestType:PrbInv) { + trigger(Event:InvRegion, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + trigger(Event:DowngradeRegion, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown probe message\n"); + } + } + } + } + + in_port(notifyNetwork_in, CPURequestMsg, notifyFromRegionDir, rank=1) { + if (notifyNetwork_in.isReady(clockEdge())) { + peek(notifyNetwork_in, CPURequestMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + //Fix Me...add back in: assert(is_valid(cache_entry)); + if (in_msg.Type == CoherenceRequestType:WbNotify) { + trigger(Event:WbNotify, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:SharedNotify) { + trigger(Event:SharedNotify, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:PrivateNotify) { + trigger(Event:PrivateNotify, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown notify message\n"); + } + } + } + } + + // In from cores + // NOTE: We get the cache / TBE entry based on the region address, + // but pass the block address to the actions + in_port(requestNetwork_in, CPURequestMsg, requestFromCore, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (is_valid(tbe) && tbe.DoneAckReceived && tbe.DoneAckAddr == in_msg.addr) { + DPRINTF(RubySlicc, "Stale/Stall request %s\n", in_msg.Type); + if (in_msg.Type == CoherenceRequestType:VicDirty || in_msg.Type == CoherenceRequestType:VicClean ) + { + 
trigger(Event:StaleRequest, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe); + } + } else if (isOutstanding(tbe, cache_entry, in_msg.addr)) { + DPRINTF(RubySlicc, "Stall outstanding request %s\n", in_msg.Type); + trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe); + } else { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Type == CoherenceRequestType:RdBlkM ) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough ) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic ) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else { + if (in_msg.Type == CoherenceRequestType:VicDirty || + in_msg.Type == CoherenceRequestType:VicClean) { + trigger(Event:CPUWriteback, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPURead, in_msg.addr, cache_entry, tbe); + } + } + } else { + Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr)); + TBE victim_tbe := getTBE(victim); + Entry victim_entry := getCacheEntry(victim); + DPRINTF(RubySlicc, "Replacing region %s for %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr)); + trigger(Event:ReplRegion, victim, victim_entry, victim_tbe); + } + } + } + } + } + + // Actions + action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; + out_msg.Type := in_msg.Type; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := in_msg.Requestor; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := true; + out_msg.InitialRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + if (getState(tbe, cache_entry, address) == State:S) { + out_msg.ForceShared := true; + } + DPRINTF(RubySlicc, "Fwd: %s\n", out_msg); + //assert(getState(tbe, cache_entry, address) == State:P || getState(tbe, cache_entry, address) == State:S); + if (getState(tbe, cache_entry, address) == State:NP_W) { + APPEND_TRANSITION_COMMENT(" fwding stale request: "); + APPEND_TRANSITION_COMMENT(out_msg.Type); + } + } + } + } + + action(u_updateRegionEntry, "u", desc="Update the entry for profiling") { + peek(requestNetwork_in, CPURequestMsg) { + if (is_valid(cache_entry)) { + if (in_msg.CtoDSinked == false) { + APPEND_TRANSITION_COMMENT(" incr outstanding "); + cache_entry.NumOutstandingReqs := 1 + cache_entry.NumOutstandingReqs; + assert(cache_entry.OutstandingReqs.at(getRegionOffset(address)) == false); + cache_entry.OutstandingReqs.at(getRegionOffset(address)) := true; + assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs)); + } else { + APPEND_TRANSITION_COMMENT(" NOT incr outstanding "); + assert(in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:RdBlkS); + } + APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs); + if (in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:Atomic || + in_msg.Type == CoherenceRequestType:WriteThrough ) + { + cache_entry.dirty := true; + } + if (in_msg.Type == CoherenceRequestType:VicDirty || + in_msg.Type == CoherenceRequestType:VicClean) { + DPRINTF(RubySlicc, "Got %s for addr %s\n", in_msg.Type, 
address); + //assert(cache_entry.ValidBlocks.at(getRegionOffset(address))); + // can in fact be inv if core got an inv after a vicclean before it got here + if (cache_entry.ValidBlocks.at(getRegionOffset(address))) { + cache_entry.clearOnDone := true; + cache_entry.clearOnDoneAddr := address; + //cache_entry.ValidBlocks.at(getRegionOffset(address)) := false; + //cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1; + } + } else { + if (cache_entry.ValidBlocks.at(getRegionOffset(address)) == false) { + cache_entry.NumValidBlocks := cache_entry.NumValidBlocks + 1; + } + DPRINTF(RubySlicc, "before valid addr %s bits %s\n", + in_msg.Type, address, cache_entry.ValidBlocks); + cache_entry.ValidBlocks.at(getRegionOffset(address)) := true; + DPRINTF(RubySlicc, "after valid addr %s bits %s\n", + in_msg.Type, address, cache_entry.ValidBlocks); + cache_entry.UsedBlocks.at(getRegionOffset(address)) := true; + } + assert(cache_entry.NumValidBlocks <= blocksPerRegion); + assert(cache_entry.NumValidBlocks >= 0); + APPEND_TRANSITION_COMMENT(" valid blocks "); + APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks); + } else { + error("This shouldn't happen anymore I think"); + //tbe.ValidBlocks.at(getRegionOffest(address)) := true; + assert(getState(tbe, cache_entry, address) == State:P_NP); + } + } + } + + action(uw_updatePossibleWriteback, "uw", desc="writeback request complete") { + peek(unblockNetwork_in, UnblockMsg) { + if (is_valid(cache_entry) && in_msg.validToInvalid && + cache_entry.clearOnDone && cache_entry.clearOnDoneAddr == address) { + DPRINTF(RubySlicc, "I have no idea what is going on here\n"); + cache_entry.ValidBlocks.at(getRegionOffset(address)) := false; + cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1; + cache_entry.clearOnDone := false; + } + } + } + + + action(rp_requestPrivate, "rp", desc="Send private request r-dir") { + peek(requestNetwork_in, CPURequestMsg) { + // No need to send acks on replacements + assert(is_invalid(tbe)); + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := address; // use the actual address so the demand request can be fulfilled + out_msg.DemandAddress := address; + out_msg.Type := CoherenceRequestType:PrivateRequest; + out_msg.OriginalType := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.InitialRequestTime := curCycle(); + // will this always be ok? probably not for multisocket + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + DPRINTF(RubySlicc, "Private request %s\n", out_msg); + } + cache_entry.ProbeRequestTime := curCycle(); + cache_entry.MsgSentToDir := true; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(ru_requestUpgrade, "ru", desc="Send upgrade request r-dir") { + peek(requestNetwork_in, CPURequestMsg) { + // No need to send acks on replacements + assert(is_invalid(tbe)); + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := address; // use the actual address so the demand request can be fulfilled + out_msg.Type := CoherenceRequestType:UpgradeRequest; + out_msg.OriginalType := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.InitialRequestTime := curCycle(); + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + cache_entry.ProbeRequestTime := curCycle(); + cache_entry.MsgSentToDir := true; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(rw_requestWriteback, "rq", desc="Send writeback request") { + // No need to send acks on replacements + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); // use the actual address so the demand request can be fulfilled + out_msg.Type := CoherenceRequestType:CleanWbRequest; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Dirty := tbe.dirty; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(rs_requestShared, "rs", desc="Send shared request r-dir") { + peek(requestNetwork_in, CPURequestMsg) { + // No need to send acks on replacements + assert(is_invalid(tbe)); + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := address; // use the actual address so the demand request can be fulfilled + out_msg.Type := CoherenceRequestType:SharedRequest; + out_msg.OriginalType := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.InitialRequestTime := curCycle(); + // will this always be ok? probably not for multisocket + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + cache_entry.ProbeRequestTime := curCycle(); + cache_entry.MsgSentToDir := true; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(ai_ackRegionInv, "ai", desc="Send ack to r-dir on region inv if tbe says so") { + // No need to send acks on replacements + assert(is_valid(tbe)); + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ad_ackDircetory, "ad", desc="send probe response to directory") { + if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) { //VIPER tcc doesnt understand PrbShrData + assert(tbe.DemandRequest); //So, let RegionBuffer take care of sending back ack + enqueue(responseNetwork_out, ResponseMsg, toDirLatency) { + out_msg.addr := tbe.DemandAddress; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := getPeer(machineID,address); + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.NoAckNeeded := true; + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(aie_ackRegionExclusiveInv, "aie", desc="Send ack to r-dir on region inv if tbe says so") { + // No need to send acks on replacements + assert(is_valid(tbe)); + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.NotCached := true; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := tbe.dirty; + } + } + + action(ain_ackRegionInvNow, "ain", desc="Send ack to r-dir on region inv") { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(aine_ackRegionInvExlusiveNow, "aine", desc="Send ack to r-dir on region inv with exlusive permission") { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.NotCached := true; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ap_ackPrivateNotify, "ap", desc="Send ack to r-dir on private notify") { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:PrivateAck; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(aw_ackWbNotify, "aw", desc="Send ack to r-dir on writeback notify") { + peek(notifyNetwork_in, CPURequestMsg) { + if (in_msg.NoAckNeeded == false) { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:RegionWbAck; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? 
probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + } + } + + action(e_evictCurrent, "e", desc="Evict this block in the region") { + // send force invalidate message to directory to invalidate this block + // must invalidate all blocks since region buffer could have privitized it + if (tbe.ValidBlocks.at(getRegionOffset(address)) && + (tbe.DemandRequest == false || tbe.DemandAddress != address)) { + DPRINTF(RubySlicc, "trying to evict address %s (base: %s, offset: %d)\n", address, getRegionBase(address), getRegionOffset(address)); + DPRINTF(RubySlicc, "tbe valid blocks %s\n", tbe.ValidBlocks); + + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := tbe.MsgType; + out_msg.ReturnData := true; + if (address == tbe.DemandAddress) { + out_msg.DemandRequest := true; + } + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(getPeer(machineID,address)); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + APPEND_TRANSITION_COMMENT(" current "); + APPEND_TRANSITION_COMMENT(tbe.ValidBlocks.at(getRegionOffset(address))); + tbe.AllAcksReceived := false; + } else { + DPRINTF(RubySlicc, "Not evicting demand %s\n", address); + } + } + + action(ed_evictDemand, "ed", desc="Evict the demand request if it's valid") { + if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) { + tbe.OutstandingAcks := 0; + tbe.AllAcksReceived := true; + tbe.DoneEvicting := true; + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := getRegionBase(address); + } + } else if (tbe.DemandRequest) { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := tbe.DemandAddress; + out_msg.Type := tbe.MsgType; + out_msg.ReturnData := true; + out_msg.DemandRequest := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(getPeer(machineID,address)); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.AllAcksReceived := false; + } + if (tbe.ValidBlocks.at(getRegionOffset(tbe.DemandAddress)) == false) { + tbe.OutstandingAcks := tbe.OutstandingAcks + 1; + } + APPEND_TRANSITION_COMMENT("Evicting demand "); + APPEND_TRANSITION_COMMENT(tbe.DemandAddress); + } + APPEND_TRANSITION_COMMENT("waiting acks "); + APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks); + } + + action(adp_AckDemandProbe, "fp", desc="forward demand probe even if we know that the core is invalid") { + peek(probeNetwork_in, NBProbeRequestMsg) { + if (in_msg.DemandRequest) { + enqueue(responseNetwork_out, ResponseMsg, toDirLatency) { + out_msg.addr := in_msg.DemandAddress; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := getPeer(machineID,address); + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.NoAckNeeded := true; + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + } + + action(en_enqueueNextEvict, "en", desc="Queue evict the next block in the region") { + // increment in_msg.addr by blockSize bytes and enqueue on triggerPort + // Only enqueue if the next address doesn't overrun the region bound + if (getRegionBase(getNextBlock(address)) == getRegionBase(address)) { + enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) { + out_msg.Type := TriggerType:InvNext; + out_msg.addr := getNextBlock(address); + } + } else { + tbe.DoneEvicting := true; + DPRINTF(RubySlicc, "Done evicing region %s\n", getRegionBase(address)); + DPRINTF(RubySlicc, "Waiting for %s acks\n", tbe.OutstandingAcks); + if (tbe.AllAcksReceived == true) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := getRegionBase(address); + } + } + } + } + + action(ef_enqueueFirstEvict, "ef", desc="Queue the first block in the region to be evicted") { + if (tbe.DoneEvicting == false) { + enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) { + out_msg.Type := TriggerType:InvNext; + out_msg.addr := getRegionBase(address); + } + } + } + + action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") { + DPRINTF(RubySlicc, "received ack for %s reg: %s vec: %s pos: %d\n", + address, getRegionBase(address), tbe.ValidBlocks, getRegionOffset(address)); + peek(unblockNetwork_in, UnblockMsg) { + // + // Note the tbe ValidBlock vec will be a conservative list of the + // valid blocks since the cache entry ValidBlock vec is set on the + // request + // + if (in_msg.wasValid) { + assert(tbe.ValidBlocks.at(getRegionOffset(address))); + } + } + tbe.OutstandingAcks := tbe.OutstandingAcks - 1; + tbe.AcksReceived.at(getRegionOffset(address)) := true; + assert(tbe.OutstandingAcks >= 0); + if (tbe.OutstandingAcks == 0) { + tbe.AllAcksReceived := true; + if (tbe.DoneEvicting) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := getRegionBase(address); + } + } + } + + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" Acks left receive "); + APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks); + } + + action(do_decrementOutstanding, "do", desc="Decrement outstanding requests") { + APPEND_TRANSITION_COMMENT(" decr outstanding "); + if (is_valid(cache_entry)) { + cache_entry.NumOutstandingReqs := cache_entry.NumOutstandingReqs - 1; + assert(cache_entry.OutstandingReqs.at(getRegionOffset(address))); + cache_entry.OutstandingReqs.at(getRegionOffset(address)) := false; + assert(cache_entry.NumOutstandingReqs >= 0); + assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs)); + APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs); + } + if (is_valid(tbe)) { + tbe.NumOutstandingReqs := tbe.NumOutstandingReqs - 1; + assert(tbe.OutstandingReqs.at(getRegionOffset(address))); + tbe.OutstandingReqs.at(getRegionOffset(address)) := false; + assert(tbe.NumOutstandingReqs >= 0); + assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs)); + APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs); + } + } + + action(co_checkOutstanding, "co", desc="check if there are no more outstanding requests") { + assert(is_valid(tbe)); + if 
((tbe.NumOutstandingReqs <= tbe.OutstandingThreshold) && + (tbe.AllOutstandingTriggered == false)) { + APPEND_TRANSITION_COMMENT(" no more outstanding: "); + APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs); + APPEND_TRANSITION_COMMENT(tbe.OutstandingThreshold); + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AllOutstanding; + if (tbe.DemandRequest) { + out_msg.addr := tbe.DemandAddress; + } else { + out_msg.addr := getRegionBase(address); + } + DPRINTF(RubySlicc, "co enqueuing %s\n", out_msg); + tbe.AllOutstandingTriggered := true; + } + } else { + APPEND_TRANSITION_COMMENT(" still more outstanding "); + } + } + + action(ro_resetAllOutstanding, "ro", desc="Reset all outstanding") { + tbe.AllOutstandingTriggered := false; + } + + action(so_setOutstandingCheckOne, "so", desc="Check outstanding is waiting for 1, not 0") { + // Need this for S_P because one request is outstanding between here and r-dir + tbe.OutstandingThreshold := 1; + } + + action(a_allocateRegionEntry, "a", desc="Allocate a new entry") { + set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry)); + cache_entry.ValidBlocks.clear(); + cache_entry.ValidBlocks.resize(blocksPerRegion); + cache_entry.UsedBlocks.clear(); + cache_entry.UsedBlocks.resize(blocksPerRegion); + cache_entry.dirty := false; + cache_entry.NumOutstandingReqs := 0; + cache_entry.OutstandingReqs.clear(); + cache_entry.OutstandingReqs.resize(blocksPerRegion); + } + + action(d_deallocateRegionEntry, "d", desc="Deallocate region entry") { + cacheMemory.deallocate(getRegionBase(address)); + unset_cache_entry(); + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(getRegionBase(address)); + set_tbe(getTBE(address)); + tbe.OutstandingAcks := 0; + tbe.AllAcksReceived := true; // starts true since the region could be empty + tbe.DoneEvicting := false; + tbe.AcksReceived.clear(); + tbe.AcksReceived.resize(blocksPerRegion); + tbe.SendAck := false; + tbe.OutstandingThreshold := 0; + if (is_valid(cache_entry)) { + tbe.NumOutstandingReqs := cache_entry.NumOutstandingReqs; + tbe.OutstandingReqs := cache_entry.OutstandingReqs; + assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs)); + tbe.dirty := cache_entry.dirty; + tbe.ValidBlocks := cache_entry.ValidBlocks; + tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks); + APPEND_TRANSITION_COMMENT(" tbe valid blocks "); + APPEND_TRANSITION_COMMENT(tbe.ValidBlocks); + APPEND_TRANSITION_COMMENT(" cache valid blocks "); + APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks); + } else { + tbe.dirty := false; + } + } + + action(m_markSendAck, "m", desc="Mark TBE that we need to ack at end") { + assert(is_valid(tbe)); + tbe.SendAck := true; + } + + action(db_markDirtyBit, "db", desc="Mark TBE dirty bit") { + peek(unblockNetwork_in, UnblockMsg) { + if (is_valid(tbe)) { + tbe.dirty := tbe.dirty || in_msg.Dirty; + } + } + } + + action(dr_markDoneAckReceived, "dr", desc="Mark TBE that a done ack has been received") { + assert(is_valid(tbe)); + tbe.DoneAckReceived := true; + tbe.DoneAckAddr := address; + APPEND_TRANSITION_COMMENT(" marking done ack on TBE "); + } + + action(se_setTBE, "se", desc="Set msg type to evict") { + peek(probeNetwork_in, NBProbeRequestMsg) { + tbe.MsgType := in_msg.Type; + tbe.Requestor := in_msg.Requestor; + tbe.DemandAddress := in_msg.DemandAddress; + tbe.DemandRequest := in_msg.DemandRequest; + } + } + + action(sne_setNewTBE, "sne", desc="Set msg type to evict") { + peek(probeNetwork_in, 
NBProbeRequestMsg) { + tbe.NewMsgType := in_msg.Type; + tbe.NewRequestor := in_msg.Requestor; + tbe.NewDemandAddress := in_msg.DemandAddress; + tbe.NewDemandRequest := in_msg.DemandRequest; + } + } + + action(soe_setOldTBE, "soe", desc="Set msg type to evict") { + tbe.MsgType := tbe.NewMsgType; + tbe.Requestor := tbe.NewRequestor; + tbe.DemandAddress := tbe.NewDemandAddress; + tbe.DemandRequest := tbe.NewDemandRequest; + tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks); + tbe.AllAcksReceived := true; // starts true since the region could be empty + tbe.DoneEvicting := false; + tbe.AcksReceived.clear(); + tbe.AcksReceived.resize(blocksPerRegion); + tbe.SendAck := false; + } + + action(ser_setTBE, "ser", desc="Set msg type to evict repl") { + tbe.MsgType := ProbeRequestType:PrbInv; + } + + action(md_setMustDowngrade, "md", desc="When permissions finally get here, must be shared") { + assert(is_valid(cache_entry)); + cache_entry.MustDowngrade := true; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(getRegionBase(address)); + unset_tbe(); + } + + action(p_popRequestQueue, "p", desc="Pop the request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pl_popUnblockQueue, "pl", desc="Pop the unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(pn_popNotifyQueue, "pn", desc="Pop the notify queue") { + notifyNetwork_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="Pop the probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") { + DPRINTF(RubySlicc, "Trigger Before Contents: %s\n", triggerQueue_in); + triggerQueue_in.dequeue(clockEdge()); + DPRINTF(RubySlicc, "Trigger After Contents: %s\n", triggerQueue_in); + } + + // Must always use wake all, since non-region address wait on region addresses + action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(zz_stallAndWaitRequestQueue, "\z", desc="recycle request queue") { + Addr regAddr := getRegionBase(address); + DPRINTF(RubySlicc, "Stalling address %s\n", regAddr); + stall_and_wait(requestNetwork_in, regAddr); + } + + action(yy_stallAndWaitProbeQueue, "\y", desc="stall probe queue") { + Addr regAddr := getRegionBase(address); + stall_and_wait(probeNetwork_in, regAddr); + } + + action(yyy_recycleProbeQueue, "\yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zzz_recycleRequestQueue, "\zz", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(www_recycleUnblockNetwork, "\ww", desc="recycle unblock queue") { + unblockNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(z_stall, "z", desc="stall request queue") { + // fake state + } + + action(mru_setMRU, "mru", desc="set MRU") { + cacheMemory.setMRU(address, cache_entry.NumValidBlocks); + } + + // Transitions + + transition({NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, P_NP_W, P_NP_NP, NP_W}, {CPURead, CPUWriteback, CPUWrite}) {} { + zz_stallAndWaitRequestQueue; + } + + transition(SS_P, {CPURead, CPUWriteback}) { + zz_stallAndWaitRequestQueue; + } + + transition({NP, S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P, NP_W, P_NP_NP}, StallAccess) {} { + zz_stallAndWaitRequestQueue; + } + + transition({S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, 
P_S_O, S_O, SS_P, P_NP_W, P_NP_NP, NP_W}, StallDoneAck) { + www_recycleUnblockNetwork; + } + + transition(NP, StallDoneAck, NP_W) { + t_allocateTBE; + db_markDirtyBit; + dr_markDoneAckReceived; + pl_popUnblockQueue; + } + + transition(NP_W, StaleRequest, NP) { + f_fwdReqToDir; + dt_deallocateTBE; + wa_wakeUpAllDependents; + p_popRequestQueue; + } + + transition(P_NP_O, DowngradeRegion) {} { + z_stall; // should stall and wait + } + + transition({NP_PS, S_NP_PS, S_P, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P}, ReplRegion) {} { + zz_stallAndWaitRequestQueue; // can't let things get out of order! + } + + transition({P_NP_O, S_O, SS_P}, InvRegion) {} { + yyy_recycleProbeQueue; // can't be z_stall because there could be a RdBlkM in the requestQueue which has the sinked flag which is blocking the inv + } + + transition(P_NP, {InvRegion, DowngradeRegion}, P_NP_NP) {} { + sne_setNewTBE; + pp_popProbeQueue; + } + + transition(S_P, DowngradeRegion) {} { + adp_AckDemandProbe; + ain_ackRegionInvNow; + pp_popProbeQueue; + } + + transition(P_NP_W, InvRegion) { + adp_AckDemandProbe; + ain_ackRegionInvNow; + pp_popProbeQueue; + } + + transition(P_NP_W, DowngradeRegion) { + adp_AckDemandProbe; + aine_ackRegionInvExlusiveNow; + pp_popProbeQueue; + } + + transition({P, S}, {CPURead, CPUWriteback}) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + f_fwdReqToDir; + u_updateRegionEntry; + p_popRequestQueue; + } + + transition(P, CPUWrite) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + f_fwdReqToDir; + u_updateRegionEntry; + p_popRequestQueue; + } + + transition(S, CPUWrite, S_O) {TagArrayRead} { + mru_setMRU; + t_allocateTBE; + co_checkOutstanding; + zz_stallAndWaitRequestQueue; + } + + transition(S_O, AllOutstanding, SS_P) { + wa_wakeUpAllDependents; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition(SS_P, CPUWrite, S_P) { + mru_setMRU; + dt_deallocateTBE; + ru_requestUpgrade; + u_updateRegionEntry; + p_popRequestQueue; + } + + transition(NP, {CPURead, CPUWriteback}, NP_PS) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + rs_requestShared; + u_updateRegionEntry; + p_popRequestQueue;//zz_stallAndWaitRequestQueue; + } + + transition(NP, CPUWrite, NP_PS) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + rp_requestPrivate; + u_updateRegionEntry; + p_popRequestQueue;//zz_stallAndWaitRequestQueue; + } + + transition(NP_PS, PrivateNotify, P) {} { + ap_ackPrivateNotify; + wa_wakeUpAllDependents; + pn_popNotifyQueue; + } + + transition(S_P, PrivateNotify, P) {} { + ap_ackPrivateNotify; + wa_wakeUpAllDependents; + pn_popNotifyQueue; + } + + transition(NP_PS, SharedNotify, S) {} { + ap_ackPrivateNotify; + wa_wakeUpAllDependents; + pn_popNotifyQueue; + } + + transition(P_NP_W, WbNotify, NP) {} { + aw_ackWbNotify; + wa_wakeUpAllDependents; + dt_deallocateTBE; + pn_popNotifyQueue; + } + + transition({P, S}, ReplRegion, P_NP_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + ser_setTBE; + d_deallocateRegionEntry; + co_checkOutstanding; + } + + transition({P, S}, InvRegion, P_NP_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + se_setTBE; + m_markSendAck; + d_deallocateRegionEntry; + co_checkOutstanding; + pp_popProbeQueue; + } + + transition(P_NP_O, AllOutstanding, P_NP) {} { + ed_evictDemand; + ef_enqueueFirstEvict; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition(S_P, InvRegion, S_NP_PS_O) {TagArrayRead} { + t_allocateTBE; + se_setTBE; + m_markSendAck; + so_setOutstandingCheckOne; + co_checkOutstanding; + pp_popProbeQueue; + } + + transition(S_NP_PS_O, 
AllOutstanding, S_NP_PS) { + ed_evictDemand; + ef_enqueueFirstEvict; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition(P, DowngradeRegion, P_S_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + se_setTBE; + m_markSendAck; + co_checkOutstanding; + pp_popProbeQueue; + } + + transition(P_S_O, AllOutstanding, P_S) {} { + ed_evictDemand; + ef_enqueueFirstEvict; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition({P, S}, DoneAck) {TagArrayWrite} { + do_decrementOutstanding; + wa_wakeUpAllDependents; + db_markDirtyBit; + uw_updatePossibleWriteback; + pl_popUnblockQueue; + } + + transition({S_P, NP_PS, S_NP_PS}, DoneAck) {TagArrayWrite} { + www_recycleUnblockNetwork; + } + + transition({P_NP_O, S_NP_PS_O, P_S_O, S_O}, DoneAck) {} { + do_decrementOutstanding; + co_checkOutstanding; + db_markDirtyBit; + uw_updatePossibleWriteback; + pl_popUnblockQueue; + } + + transition({P_NP, P_S, S_NP_PS, P_NP_NP}, Evict) {} { + e_evictCurrent; + en_enqueueNextEvict; + pt_popTriggerQueue; + } + + transition({P_NP, P_S, S_NP_PS, P_NP_NP}, InvAck) {} { + ra_receiveAck; + db_markDirtyBit; + pl_popUnblockQueue; + } + + transition(P_NP, LastAck_CleanWb, P_NP_W) {} { + rw_requestWriteback; + pt_popTriggerQueue; + } + + transition(P_NP_NP, LastAck_CleanWb, P_NP) {} { + soe_setOldTBE; + m_markSendAck; + ed_evictDemand; + ef_enqueueFirstEvict; + pt_popTriggerQueue; + } + + transition(P_NP, LastAck_PrbResp, NP) {} { + aie_ackRegionExclusiveInv; + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + + transition(S_NP_PS, LastAck_PrbResp, NP_PS) {} { + aie_ackRegionExclusiveInv; + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + + transition(P_S, LastAck_PrbResp, S) {} { + ai_ackRegionInv; + ad_ackDircetory; + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + +} + diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm new file mode 100644 index 000000000..b392311c5 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm @@ -0,0 +1,1187 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Jason Power + */ + +machine(MachineType:RegionDir, "Region Directory for AMD_Base-like protocol") +: CacheMemory *cacheMemory; // stores only region addresses. Must set block size same as below + NodeID cpuRegionBufferNum; + NodeID gpuRegionBufferNum; + int blocksPerRegion := 64; // 4k regions + Cycles toDirLatency := 10; // Latency to fwd requests and send invs to directory + bool always_migrate := "False"; + bool sym_migrate := "False"; + bool asym_migrate := "False"; + bool noTCCdir := "False"; + int TCC_select_num_bits := 1; + + // To the directory + MessageBuffer * requestToDir, network="To", virtual_network="5", vnet_type="request"; + + // To the region buffers + MessageBuffer * notifyToRBuffer, network="To", virtual_network="7", vnet_type="request"; + MessageBuffer * probeToRBuffer, network="To", virtual_network="8", vnet_type="request"; + + // From the region buffers + MessageBuffer * responseFromRBuffer, network="From", virtual_network="2", vnet_type="response"; + MessageBuffer * requestFromRegBuf, network="From", virtual_network="0", vnet_type="request"; + + MessageBuffer * triggerQueue; +{ + + // States + state_declaration(State, desc="Region states", default="RegionDir_State_NP") { + NP, AccessPermission:Invalid, desc="Not present in region directory"; + P, AccessPermission:Invalid, desc="Region is private to owner"; + S, AccessPermission:Invalid, desc="Region is shared between CPU and GPU"; + + P_NP, AccessPermission:Invalid, desc="Evicting the region"; + NP_P, AccessPermission:Invalid, desc="Must wait for ack from R-buf"; + NP_S, AccessPermission:Invalid, desc="Must wait for ack from R-buf"; + P_P, AccessPermission:Invalid, desc="Waiting for ack from R-buf"; + S_S, AccessPermission:Invalid, desc="Waiting for ack from R-buf"; + P_S, AccessPermission:Invalid, desc="Downgrading the region"; + S_P, AccessPermission:Invalid, desc="Upgrading the region"; + P_AS, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks"; + S_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks"; + P_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks"; + + SP_NP_W, AccessPermission:Invalid, desc="Last sharer writing back, waiting for ack"; + S_W, AccessPermission:Invalid, desc="Sharer writing back, waiting for ack"; + + P_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack"; + P_AS_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack"; + S_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack"; + } + + enumeration(Event, desc="Region directory events") { + SendInv, desc="Send inv message to any machine that has a region buffer"; + SendUpgrade, desc="Send upgrade message to any machine that has a region buffer"; + SendDowngrade, desc="Send downgrade message to any machine that has a region buffer"; + + Evict, desc="Evict this region"; + + UpgradeRequest, desc="Request from r-buf for an upgrade"; + SharedRequest, desc="Request from r-buf for 
read"; + PrivateRequest, desc="Request from r-buf for write"; + + InvAckCore, desc="Ack from region buffer to order the invalidate"; + InvAckCoreNoShare, desc="Ack from region buffer to order the invalidate, and it does not have the region"; + CPUPrivateAck, desc="Ack from region buffer to order private notification"; + + LastAck, desc="Done eviciting all the blocks"; + + StaleCleanWbRequest, desc="stale clean writeback reqeust"; + StaleCleanWbRequestNoShare, desc="stale clean wb req from a cache which should be removed from sharers"; + CleanWbRequest, desc="clean writeback reqeust, multiple sharers"; + CleanWbRequest_LastSharer, desc="clean writeback reqeust, last sharer"; + WritebackAck, desc="Writeback Ack from region buffer"; + DirReadyAck, desc="Directory is ready, waiting Ack from region buffer"; + + TriggerInv, desc="trigger invalidate message"; + TriggerDowngrade, desc="trigger downgrade message"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + structure(BoolVec, external="yes") { + bool at(int); + void resize(int); + void clear(); + } + + structure(Entry, desc="Region entry", interface="AbstractCacheEntry") { + Addr addr, desc="Base address of this region"; + NetDest Sharers, desc="Set of machines that are sharing, but not owners"; + State RegionState, desc="Region state"; + DataBlock DataBlk, desc="Data for the block (always empty in region dir)"; + MachineID Owner, desc="Machine which owns all blocks in this region"; + Cycles ProbeStart, desc="Time when the first probe request was issued"; + bool LastWriten, default="false", desc="The last time someone accessed this region, it wrote it"; + bool LastWritenByCpu, default="false", desc="The last time the CPU accessed this region, it wrote it"; + bool LastWritenByGpu, default="false", desc="The last time the GPU accessed this region, it wrote it"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + MachineID Owner, desc="Machine which owns all blocks in this region"; + NetDest Sharers, desc="Set of machines to send evicts"; + int NumValidBlocks, desc="Number of blocks valid so we don't have to count a BoolVec"; + bool AllAcksReceived, desc="Got all necessary acks from dir"; + CoherenceRequestType MsgType, desc="Msg type for the evicts could be inv or dwngrd"; + Cycles ProbeRequestTime, default="Cycles(0)", desc="Start of probe request"; + Cycles InitialRequestTime, default="Cycles(0)", desc="To forward back on out msg"; + Addr DemandAddress, desc="Demand address from original request"; + uint64_t probe_id, desc="probe id for lifetime profiling"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + // Stores only region addresses + TBETable TBEs, template="<RegionDir_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + int blockBits, default="RubySystem::getBlockSizeBits()"; + int blockBytes, default="RubySystem::getBlockSizeBytes()"; + int regionBits, 
default="log2(m_blocksPerRegion)"; + + // Functions + + MachineID getCoreMachine(MachineID rBuf, Addr address) { + if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) { + return createMachineID(MachineType:CorePair, intToID(0)); + } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) { + if (noTCCdir) { + return mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + } else { + return createMachineID(MachineType:TCCdir, intToID(0)); + } + } else { + error("Unexpected region buffer number"); + } + } + + bool isCpuMachine(MachineID rBuf) { + if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) { + return true; + } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) { + return false; + } else { + error("Unexpected region buffer number"); + } + } + + bool symMigrate(Entry cache_entry) { + return cache_entry.LastWriten; + } + + bool asymMigrate(Entry cache_entry, MachineID requestor) { + if (isCpuMachine(requestor)) { + return cache_entry.LastWritenByCpu; + } else { + return cache_entry.LastWritenByGpu; + } + } + + int getRegionOffset(Addr addr) { + if (blocksPerRegion > 1) { + Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1); + int ret := addressToInt(offset); + assert(ret < blocksPerRegion); + return ret; + } else { + return 0; + } + } + + Addr getRegionBase(Addr addr) { + return maskLowOrderBits(addr, blockBits+regionBits); + } + + Addr getNextBlock(Addr addr) { + Addr a := addr; + makeNextStrideAddress(a, 1); + return a; + } + + bool presentOrAvail(Addr addr) { + DPRINTF(RubySlicc, "Present? %s, avail? %s\n", cacheMemory.isTagPresent(getRegionBase(addr)), cacheMemory.cacheAvail(getRegionBase(addr))); + return cacheMemory.isTagPresent(getRegionBase(addr)) || cacheMemory.cacheAvail(getRegionBase(addr)); + } + + // Returns a region entry! 
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr))); + } + + TBE getTBE(Addr addr), return_by_pointer="yes" { + return TBEs.lookup(getRegionBase(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(getRegionBase(addr)).DataBlk; + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.RegionState; + } + return State:NP; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + if (is_valid(cache_entry)) { + cache_entry.RegionState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := getTBE(addr); + if(is_valid(tbe)) { + return RegionDir_State_to_permission(tbe.TBEState); + } + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return RegionDir_State_to_permission(cache_entry.RegionState); + } + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(RegionDir_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + functionalMemoryRead(pkt); + } + + int functionalWrite(Addr addr, Packet *pkt) { + if (functionalMemoryWrite(pkt)) { + return 1; + } else { + return 0; + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + cacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + cacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + out_port(requestNetwork_out, CPURequestMsg, requestToDir); + out_port(notifyNetwork_out, CPURequestMsg, notifyToRBuffer); + out_port(probeNetwork_out, NBProbeRequestMsg, probeToRBuffer); + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=2) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + assert(in_msg.addr == getRegionBase(in_msg.addr)); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := getTBE(in_msg.addr); + DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + assert(is_valid(tbe)); + trigger(Event:LastAck, in_msg.addr, cache_entry, tbe); + 
} else if (in_msg.Type == TriggerType:InvRegion) { + assert(is_valid(tbe)); + trigger(Event:TriggerInv, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == TriggerType:DowngradeRegion) { + assert(is_valid(tbe)); + trigger(Event:TriggerDowngrade, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown trigger message"); + } + } + } + } + + in_port(responseNetwork_in, ResponseMsg, responseFromRBuffer, rank=1) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + assert(in_msg.addr == getRegionBase(in_msg.addr)); + assert(is_valid(tbe)); + if (in_msg.NotCached) { + trigger(Event:InvAckCoreNoShare, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:InvAckCore, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:PrivateAck) { + assert(in_msg.addr == getRegionBase(in_msg.addr)); + assert(is_valid(cache_entry)); + //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender)); + trigger(Event:CPUPrivateAck, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:RegionWbAck) { + //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender) == false); + assert(in_msg.addr == getRegionBase(in_msg.addr)); + trigger(Event:WritebackAck, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:DirReadyAck) { + assert(is_valid(tbe)); + trigger(Event:DirReadyAck, getRegionBase(in_msg.addr), cache_entry, tbe); + } else { + error("Invalid response type"); + } + } + } + } + + // In from cores + // NOTE: We get the cache / TBE entry based on the region address, + // but pass the block address to the actions + in_port(requestNetwork_in, CPURequestMsg, requestFromRegBuf, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + //assert(in_msg.addr == getRegionBase(in_msg.addr)); + Addr address := getRegionBase(in_msg.addr); + DPRINTF(RubySlicc, "Got %s, base %s\n", in_msg.addr, address); + if (presentOrAvail(address)) { + TBE tbe := getTBE(address); + Entry cache_entry := getCacheEntry(address); + if (in_msg.Type == CoherenceRequestType:PrivateRequest) { + if (is_valid(cache_entry) && (cache_entry.Owner != in_msg.Requestor || + getState(tbe, cache_entry, address) == State:S)) { + trigger(Event:SendInv, address, cache_entry, tbe); + } else { + trigger(Event:PrivateRequest, address, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:SharedRequest) { + if (is_invalid(cache_entry)) { + // If no one has ever requested this region give private permissions + trigger(Event:PrivateRequest, address, cache_entry, tbe); + } else { + if (always_migrate || + (sym_migrate && symMigrate(cache_entry)) || + (asym_migrate && asymMigrate(cache_entry, in_msg.Requestor))) { + if (cache_entry.Sharers.count() == 1 && + cache_entry.Sharers.isElement(in_msg.Requestor)) { + trigger(Event:UpgradeRequest, address, cache_entry, tbe); + } else { + trigger(Event:SendInv, address, cache_entry, tbe); + } + } else { // don't migrate + if(cache_entry.Sharers.isElement(in_msg.Requestor) || + getState(tbe, cache_entry, address) == State:S) { + trigger(Event:SharedRequest, address, cache_entry, tbe); + } else { + trigger(Event:SendDowngrade, address, cache_entry, tbe); + } + } + } + } else if (in_msg.Type == CoherenceRequestType:UpgradeRequest) { + if 
(is_invalid(cache_entry)) { + trigger(Event:PrivateRequest, address, cache_entry, tbe); + } else if (cache_entry.Sharers.count() == 1 && cache_entry.Sharers.isElement(in_msg.Requestor)) { + trigger(Event:UpgradeRequest, address, cache_entry, tbe); + } else { + trigger(Event:SendUpgrade, address, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:CleanWbRequest) { + if (is_invalid(cache_entry) || cache_entry.Sharers.isElement(in_msg.Requestor) == false) { + trigger(Event:StaleCleanWbRequest, address, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "wb address %s(%s) owner %s sharers %s requestor %s %d %d\n", in_msg.addr, getRegionBase(in_msg.addr), cache_entry.Owner, cache_entry.Sharers, in_msg.Requestor, cache_entry.Sharers.isElement(in_msg.Requestor), cache_entry.Sharers.count()); + if (cache_entry.Sharers.isElement(in_msg.Requestor) && cache_entry.Sharers.count() == 1) { + DPRINTF(RubySlicc, "last wb\n"); + trigger(Event:CleanWbRequest_LastSharer, address, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "clean wb\n"); + trigger(Event:CleanWbRequest, address, cache_entry, tbe); + } + } + } else { + error("unknown region dir request type"); + } + } else { + Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr)); + TBE victim_tbe := getTBE(victim); + Entry victim_entry := getCacheEntry(victim); + DPRINTF(RubySlicc, "Evicting address %s for new region at address %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr)); + assert(is_valid(victim_entry)); + trigger(Event:Evict, victim, victim_entry, victim_tbe); + } + } + } + } + + // Actions + + action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := true; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) { + out_msg.Acks := cache_entry.Sharers.count(); + } else { + out_msg.Acks := 0; + } + } + } + } + + action(f_fwdReqToDirShared, "fs", desc="Forward CPU request to directory (shared)") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. 
"address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := true; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + out_msg.ForceShared := true; + if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) { + out_msg.Acks := cache_entry.Sharers.count(); + } else { + out_msg.Acks := 0; + } + } + } + } + + action(f_fwdReqToDirWithAck, "fa", desc="Forward CPU request to directory with ack request") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := false; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + if (is_valid(cache_entry)) { + out_msg.Acks := cache_entry.Sharers.count(); + // Don't need an ack from the requestor! + if (cache_entry.Sharers.isElement(in_msg.Requestor)) { + out_msg.Acks := out_msg.Acks - 1; + } + } else { + out_msg.Acks := 0; + } + } + } + } + + action(f_fwdReqToDirWithAckShared, "fas", desc="Forward CPU request to directory with ack request") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := false; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + out_msg.ForceShared := true; + if (is_valid(cache_entry)) { + out_msg.Acks := cache_entry.Sharers.count(); + // Don't need an ack from the requestor! 
+ if (cache_entry.Sharers.isElement(in_msg.Requestor)) { + out_msg.Acks := out_msg.Acks - 1; + } + } else { + out_msg.Acks := 0; + } + } + } + } + + action(a_allocateRegionEntry, "a", desc="Allocate a new entry") { + set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry)); + peek(requestNetwork_in, CPURequestMsg) { + APPEND_TRANSITION_COMMENT(in_msg.Requestor); + } + } + + action(d_deallocateRegionEntry, "d", desc="Deallocate region entry") { + cacheMemory.deallocate(getRegionBase(address)); + unset_cache_entry(); + } + + action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") { + //assert(tbe.ValidBlocks.at(getRegionOffset(address))); + DPRINTF(RubySlicc, "received ack for %s reg: %s\n", address, getRegionBase(address)); + tbe.NumValidBlocks := tbe.NumValidBlocks - 1; + assert(tbe.NumValidBlocks >= 0); + if (tbe.NumValidBlocks == 0) { + tbe.AllAcksReceived := true; + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := address; + } + } + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" Acks left receive "); + APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks); + } + + action(ca_checkAcks, "ca", desc="Check to see if we need more acks") { + if (tbe.NumValidBlocks == 0) { + tbe.AllAcksReceived := true; + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := address; + } + } + } + + action(ti_triggerInv, "ti", desc="") { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:InvRegion; + out_msg.addr := address; + } + } + + action(td_triggerDowngrade, "td", desc="") { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:DowngradeRegion; + out_msg.addr := address; + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(getRegionBase(address)); + set_tbe(getTBE(address)); + if (is_valid(cache_entry)) { + tbe.Owner := cache_entry.Owner; + tbe.Sharers := cache_entry.Sharers; + tbe.AllAcksReceived := true; // assume no acks are required + } + tbe.ProbeRequestTime := curCycle(); + peek(requestNetwork_in, CPURequestMsg) { + tbe.InitialRequestTime := in_msg.InitialRequestTime; + tbe.DemandAddress := in_msg.addr; + } + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" Acks left "); + APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks); + APPEND_TRANSITION_COMMENT(" Owner, "); + APPEND_TRANSITION_COMMENT(tbe.Owner); + APPEND_TRANSITION_COMMENT(" sharers, "); + APPEND_TRANSITION_COMMENT(tbe.Sharers); + } + + action(ss_setSharers, "ss", desc="Add requestor to sharers") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.add(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(cache_entry.Sharers); + } + } + + action(rs_removeSharer, "rs", desc="Remove requestor to sharers") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(" removing "); + APPEND_TRANSITION_COMMENT(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(" sharers "); + APPEND_TRANSITION_COMMENT(cache_entry.Sharers); + } + } + + action(rsr_removeSharerResponse, "rsr", desc="Remove requestor to sharers") { + peek(responseNetwork_in, ResponseMsg) { + cache_entry.Sharers.remove(in_msg.Sender); + APPEND_TRANSITION_COMMENT(cache_entry.Sharers); + } + } + + action(cs_clearSharers, "cs", desc="Add requestor to sharers") { + cache_entry.Sharers.clear(); + } + + action(so_setOwner, "so", desc="Set 
the owner to the requestor") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.Owner := in_msg.Requestor; + APPEND_TRANSITION_COMMENT(" Owner now: "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + } + + action(rr_removeRequestorFromTBE, "rr", desc="Remove requestor from TBE sharers") { + peek(requestNetwork_in, CPURequestMsg) { + tbe.Sharers.remove(in_msg.Requestor); + } + } + + action(ur_updateDirtyStatusOnRequest, "ur", desc="Update dirty status on demand request") { + peek(requestNetwork_in, CPURequestMsg) { + if (is_valid(cache_entry)) { + if ((in_msg.Type == CoherenceRequestType:SharedRequest) && + (cache_entry.Sharers.isElement(in_msg.Requestor) == false)) { + cache_entry.LastWriten := false; + if (isCpuMachine(in_msg.Requestor)) { + cache_entry.LastWritenByCpu := false; + } else { + cache_entry.LastWritenByGpu := false; + } + } else if ((in_msg.Type == CoherenceRequestType:PrivateRequest) || + (in_msg.Type == CoherenceRequestType:UpgradeRequest)) { + cache_entry.LastWriten := true; + if (isCpuMachine(in_msg.Requestor)) { + cache_entry.LastWritenByCpu := true; + } else { + cache_entry.LastWritenByGpu := true; + } + } + } + } + } + + action(ud_updateDirtyStatusWithWb, "ud", desc="Update dirty status on writeback") { + peek(requestNetwork_in, CPURequestMsg) { + if (is_valid(cache_entry) && in_msg.Dirty) { + cache_entry.LastWriten := true; + if (isCpuMachine(in_msg.Requestor)) { + cache_entry.LastWritenByCpu := true; + } else { + cache_entry.LastWritenByGpu := true; + } + } + } + } + + action(sns_setNumAcksSharers, "sns", desc="Set number of acks to one per shared region buffer") { + assert(is_valid(tbe)); + assert(is_valid(cache_entry)); + tbe.NumValidBlocks := tbe.Sharers.count(); + } + + action(sno_setNumAcksOne, "sno", desc="Set number of acks to one per shared region buffer") { + assert(is_valid(tbe)); + assert(is_valid(cache_entry)); + tbe.NumValidBlocks := 1; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" reg: "); + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + unset_tbe(); + } + + action(wb_sendWbNotice, "wb", desc="Send notice to cache that writeback is acknowledged") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:WbNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + } + } + } + + action(wbn_sendWbNoticeNoAck, "wbn", desc="Send notice to cache that writeback is acknowledged (no ack needed)") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:WbNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.NoAckNeeded := true; + } + } + } + + action(b_sendPrivateNotice, "b", desc="Send notice to private cache that it has private access") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:PrivateNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + 
out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + } + } + } + + action(bs_sendSharedNotice, "bs", desc="Send notice to private cache that it has private access") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:SharedNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + } + } + } + + action(c_sendSharedNoticeToOrigReq, "c", desc="Send notice to private cache that it has shared access") { + assert(is_valid(tbe)); + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:SharedNotify; + out_msg.Destination.add(tbe.Owner); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(sp_sendPrivateNoticeToOrigReq, "sp", desc="Send notice to private cache that it has private access") { + assert(is_valid(tbe)); + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:PrivateNotify; + out_msg.Destination.add(tbe.Owner); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(i_RegionInvNotify, "i", desc="Send notice to private cache that it no longer has private access") { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.DemandAddress := tbe.DemandAddress; + //out_msg.Requestor := tbe.Requestor; + out_msg.Requestor := machineID; + out_msg.Type := ProbeRequestType:PrbInv; + //Fix me: assert(tbe.Sharers.count() > 0); + out_msg.DemandRequest := true; + out_msg.Destination := tbe.Sharers; + out_msg.MessageSize := MessageSizeType:Request_Control; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(i0_RegionInvNotifyDemand0, "i0", desc="Send notice to private cache that it no longer has private access") { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + // Demand address should default to 0 -> out_msg.DemandAddress := 0; + out_msg.Requestor := machineID; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.Destination := tbe.Sharers; + out_msg.MessageSize := MessageSizeType:Request_Control; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(rd_RegionDowngrade, "rd", desc="Send notice to private cache that it only has shared access") { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.DemandAddress := tbe.DemandAddress; + out_msg.Requestor := machineID; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.DemandRequest := true; + out_msg.Destination := tbe.Sharers; + out_msg.MessageSize := MessageSizeType:Request_Control; + APPEND_TRANSITION_COMMENT("dest: "); + 
APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(p_popRequestQueue, "p", desc="Pop the request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop the response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(s_stallAndWaitRequest, "s", desc="Stall and wait on the region address") { + Addr regAddr := getRegionBase(address); + stall_and_wait(requestNetwork_in, regAddr); + } + + action(w_wakeUpRegionDependents, "w", desc="Wake up any requests waiting for this region") { + wakeUpBuffers(getRegionBase(address)); + } + + action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(zz_recycleRequestQueue, "\z", desc="...") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(z_stall, "z", desc="stall request queue") { + // fake state + } + + action(mru_setMRU, "mru", desc="set MRU") { + cacheMemory.setMRU(address); + } + + // Transistions + + transition({NP_P, P_P, NP_S, S_S, S_P, P_S, P_NP, S_AP, P_AS, P_AP, SP_NP_W, S_W, P_AP_W, P_AS_W, S_AP_W}, {PrivateRequest, SharedRequest, UpgradeRequest, SendInv, SendUpgrade, SendDowngrade, CleanWbRequest, CleanWbRequest_LastSharer, StaleCleanWbRequest}) { + s_stallAndWaitRequest + } + + transition({NP_P, P_P, NP_S, S_S, S_P, S_W, P_S, P_NP, S_AP, P_AS, P_AP, P_AP_W, P_AS_W, S_AP_W}, Evict) { + zz_recycleRequestQueue; + } + + transition(NP, {PrivateRequest, SendUpgrade}, NP_P) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDir; + b_sendPrivateNotice; + so_setOwner; + ss_setSharers; + t_allocateTBE; + p_popRequestQueue; + } + + transition(P, {PrivateRequest, UpgradeRequest}, P_P) {TagArrayRead} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDir; + b_sendPrivateNotice; + t_allocateTBE; + p_popRequestQueue; + } + + transition({NP_P, P_P}, CPUPrivateAck, P) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition({NP, P, S}, StaleCleanWbRequest) {TagArrayRead, TagArrayWrite} { + wbn_sendWbNoticeNoAck; + ud_updateDirtyStatusWithWb; + p_popRequestQueue; + } + + transition(NP, SharedRequest, NP_S) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirShared; + bs_sendSharedNotice; + so_setOwner; + ss_setSharers; + t_allocateTBE; + p_popRequestQueue; + } + + // Could probably do this in parallel with other shared requests + transition(S, SharedRequest, S_S) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirShared; + bs_sendSharedNotice; + ss_setSharers; + t_allocateTBE; + p_popRequestQueue; + } + + transition({P, S}, CleanWbRequest_LastSharer, SP_NP_W) {TagArrayRead, TagArrayWrite} { + ud_updateDirtyStatusWithWb; + wb_sendWbNotice; + rs_removeSharer; + t_allocateTBE; + d_deallocateRegionEntry; + p_popRequestQueue; + } + + transition(S, CleanWbRequest, S_W) {TagArrayRead, TagArrayWrite} { + ud_updateDirtyStatusWithWb; + wb_sendWbNotice; + rs_removeSharer; + t_allocateTBE; + p_popRequestQueue; + } + + transition(SP_NP_W, WritebackAck, NP) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition(S_W, WritebackAck, S) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition({NP_S, S_S}, CPUPrivateAck, 
S) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition(S, UpgradeRequest, S_P) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDir; + b_sendPrivateNotice; + so_setOwner; + t_allocateTBE; + p_popRequestQueue; + } + + transition(S_P, CPUPrivateAck, P) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition(P, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + //i_RegionInvNotify; + p_popRequestQueue; + } + + transition({P_AP_W, S_AP_W}, DirReadyAck) { + ti_triggerInv; + pr_popResponseQueue; + } + + transition(P_AS_W, DirReadyAck) { + td_triggerDowngrade; + pr_popResponseQueue; + } + + transition(P_AS_W, TriggerDowngrade, P_AS) { + rd_RegionDowngrade; + pt_popTriggerQueue; + } + + transition(P_AP_W, TriggerInv, P_AP) { + i_RegionInvNotify; + pt_popTriggerQueue; + } + + transition(S_AP_W, TriggerInv, S_AP) { + i_RegionInvNotify; + pt_popTriggerQueue; + } + + transition(P, SendUpgrade, P_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + p_popRequestQueue; + } + + transition(P, Evict, P_NP) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + sns_setNumAcksSharers; + i0_RegionInvNotifyDemand0; + d_deallocateRegionEntry; + } + + transition(S, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + p_popRequestQueue; + } + + transition(S, Evict, P_NP) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + sns_setNumAcksSharers; + i0_RegionInvNotifyDemand0; + d_deallocateRegionEntry; + } + + transition(P_NP, LastAck, NP) { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + + transition(S, SendUpgrade, S_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + p_popRequestQueue; + } + + transition(S_AP, LastAck, S_P) { + sp_sendPrivateNoticeToOrigReq; + pt_popTriggerQueue; + } + + transition(P_AP, LastAck, P_P) { + sp_sendPrivateNoticeToOrigReq; + pt_popTriggerQueue; + } + + transition(P, SendDowngrade, P_AS_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAckShared; + so_setOwner; + t_allocateTBE; + sns_setNumAcksSharers; + ss_setSharers; //why do we set the sharers before sending the downgrade? Are we sending a downgrade to the requestor? 
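+ // The downgrade probe itself is deferred until DirReadyAck arrives; see the P_AS_W transitions above.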
+ p_popRequestQueue; + } + + transition(P_AS, LastAck, P_S) { + c_sendSharedNoticeToOrigReq; + pt_popTriggerQueue; + } + + transition(P_S, CPUPrivateAck, S) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition({P_NP, P_AS, S_AP, P_AP}, InvAckCore) {} { + ra_receiveAck; + pr_popResponseQueue; + } + + transition({P_NP, S_AP, P_AP}, InvAckCoreNoShare) {} { + ra_receiveAck; + pr_popResponseQueue; + } + + transition(P_AS, InvAckCoreNoShare) {} { + ra_receiveAck; + rsr_removeSharerResponse; + pr_popResponseQueue; + } + +} + + diff --git a/src/mem/protocol/MOESI_AMD_Base-dir.sm b/src/mem/protocol/MOESI_AMD_Base-dir.sm new file mode 100644 index 000000000..52cefda66 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-dir.sm @@ -0,0 +1,1137 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:Directory, "AMD Baseline protocol") +: DirectoryMemory * directory; + CacheMemory * L3CacheMemory; + Cycles response_latency := 5; + Cycles l3_hit_latency := 50; + bool noTCCdir := "False"; + bool CPUonly := "False"; + int TCC_select_num_bits; + bool useL3OnWT := "False"; + Cycles to_memory_controller_latency := 1; + + // From the Cores + MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response"; + + MessageBuffer * triggerQueue; + MessageBuffer * L3triggerQueue; + MessageBuffer * responseFromMemory; +{ + // STATES + state_declaration(State, desc="Directory states", default="Directory_State_U") { + U, AccessPermission:Backing_Store, desc="unblocked"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + // BL is Busy because it's possible for the data only to be in the network + // in the WB, L3 has sent it and gone on with its business in possibly I + // state. + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + } + + // Events + enumeration(Event, desc="Directory events") { + // CPU requests + RdBlkS, desc="..."; + RdBlkM, desc="..."; + RdBlk, desc="..."; + CtoD, desc="..."; + WriteThrough, desc="WriteThrough Message"; + Atomic, desc="Atomic Message"; + + // writebacks + VicDirty, desc="..."; + VicClean, desc="..."; + CPUData, desc="WB data from CPU"; + StaleWB, desc="Notification that WB has been superceded by a probe"; + + // probe responses + CPUPrbResp, desc="Probe Response Msg"; + + ProbeAcksComplete, desc="Probe Acks Complete"; + + L3Hit, desc="Hit in L3 return data to core"; + + // Memory Controller + MemData, desc="Fetched data from memory arrives"; + WBAck, desc="Writeback Ack from memory arrives"; + + CoreUnblock, desc="Core received data, unblock"; + UnblockWriteThrough, desc="Unblock because of writethrough request finishing"; + + StaleVicDirty, desc="Core invalidated before VicDirty processed"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L3DataArrayRead, desc="Read the data array"; + L3DataArrayWrite, desc="Write the data array"; + L3TagArrayRead, desc="Read the data array"; + L3TagArrayWrite, desc="Write the data array"; + } + + // TYPES + + // DirectoryEntry + structure(Entry, desc="...", 
interface="AbstractEntry") { + State DirectoryState, desc="Directory state"; + DataBlock DataBlk, desc="data for the block"; + NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore"; + } + + structure(CacheEntry, desc="...", interface="AbstractCacheEntry") { + DataBlock DataBlk, desc="data for the block"; + MachineID LastSender, desc="Mach which this block came from"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID WTRequestor, desc="WT Requestor"; + bool Cached, desc="data hit in Cache"; + bool MemData, desc="Got MemData?",default="false"; + bool wtData, desc="Got write through data?",default="false"; + bool atomicData, desc="Got Atomic op?",default="false"; + Cycles InitialRequestTime, desc="..."; + Cycles ForwardRequestTime, desc="..."; + Cycles ProbeRequestStartTime, desc="..."; + MachineID LastSender, desc="Mach which this block came from"; + bool L3Hit, default="false", desc="Was this an L3 hit?"; + uint64_t probe_id, desc="probe id for lifetime profiling"; + WriteMask writeMask, desc="outstanding write through mask"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs"; + + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_tbe(TBE a); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" { + Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr)); + + if (is_valid(dir_entry)) { + return dir_entry; + } + + dir_entry := static_cast(Entry, "pointer", + directory.allocate(addr, new Entry)); + return dir_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if (is_valid(tbe) && tbe.MemData) { + DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); + return tbe.DataBlk; + } + DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); + return getDirectoryEntry(addr).DataBlk; + } + + State getState(TBE tbe, CacheEntry entry, Addr addr) { + return getDirectoryEntry(addr).DirectoryState; + } + + void setState(TBE tbe, CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).DirectoryState := state; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + // For this Directory, all permissions are just tracked in Directory, since + // it's not possible to have something in TBE but not Dir, just keep track + // of state all in one place. 
+ if (directory.isPresent(addr)) { + return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state)); + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // ** OUT_PORTS ** + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); + out_port(responseNetwork_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue); + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=5) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe); + }else if (in_msg.Type == TriggerType:UnblockWriteThrough) { + trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) { + if (L3TriggerQueue_in.isReady(clockEdge())) { + peek(L3TriggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:L3Hit) { + trigger(Event:L3Hit, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + // Unblock Network + in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + trigger(Event:CoreUnblock, in_msg.addr, entry, tbe); + } + } + } + + // Core response network + in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := 
TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:CPUData) { + trigger(Event:CPUData, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, entry, tbe); + } else { + error("Unexpected response type"); + } + } + } + } + + // off-chip memory request/response is done + in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) { + if (memQueue_in.isReady(clockEdge())) { + peek(memQueue_in, MemoryMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == MemoryRequestType:MEMORY_READ) { + trigger(Event:MemData, in_msg.addr, entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) { + trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them. + } else { + DPRINTF(RubySlicc, "%s\n", in_msg.Type); + error("Invalid message"); + } + } + } + } + + in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThrough, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicDirty, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicClean, in_msg.addr, entry, tbe); + } + } else { + error("Bad request message type"); + } + } + } + } + + // Actions + action(s_sendResponseS, "s", desc="send Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + 
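+ // the timing fields below are echoed back from the TBE, presumably so the requester can profile end-to-end latency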
out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(es_sendResponseES, "es", desc="send Exclusive or Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached) { + out_msg.State := CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(m_sendResponseM, "m", desc="send Modified response") { + if (tbe.wtData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + }else{ + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + if(tbe.atomicData){ + out_msg.WTRequestor := tbe.WTRequestor; + } + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + if (tbe.atomicData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + } + + action(c_sendResponseCtoD, "c", desc="send CtoD Ack") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := 
MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + } + } + } + + action(l_queueMemWBReq, "lq", desc="Write WB data to memory") { + peek(responseNetwork_in, ResponseMsg) { + queueMemoryWrite(machineID, address, to_memory_controller_latency, + in_msg.DataBlk); + } + } + + action(l_queueMemRdReq, "lr", desc="Read data from memory") { + peek(requestNetwork_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + if (tbe.Dirty == false) { + tbe.DataBlk := entry.DataBlk; + } + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + + // add relevant TCC node to list. This replaces all TCPs and SQCs + if (((in_msg.Type == CoherenceRequestType:WriteThrough || + in_msg.Type == CoherenceRequestType:Atomic) && + in_msg.NoWriteConflict) || + CPUonly) { + } else if (noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + out_msg.Destination.add(mapAddressToRange(address, + MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", out_msg); + APPEND_TRANSITION_COMMENT(" dc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + // add relevant TCC node to the list. 
This replaces all TCPs and SQCs + if (noTCCdir || CPUonly) { + //Don't need to notify TCC about reads + } else { + out_msg.Destination.add(mapAddressToRange(address, + MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + } + if (noTCCdir && !CPUonly) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + + // add relevant TCC node to the list. This replaces all TCPs and SQCs + if (noTCCdir && !CPUonly) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + if (!noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address, + MachineType:TCCdir, + TCC_select_low_bit, + TCC_select_num_bits)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(d_writeDataToMemory, "d", desc="Write data to memory") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + if (tbe.Dirty == false) { + // have to update the TBE, too, because of how this + // directory deals with functional writes + tbe.DataBlk := in_msg.DataBlk; + } + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(requestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); + tbe.Dirty := true; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := 
in_msg.InitialRequestTime; + } + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + TBEs.deallocate(address); + unset_tbe(); + } + + action(wd_writeBackData, "wd", desc="Write back data if needed") { + if (tbe.wtData) { + getDirectoryEntry(address).DataBlk.copyPartial(tbe.DataBlk, tbe.writeMask); + } else if (tbe.atomicData) { + tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + } + + action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") { + peek(memQueue_in, MemoryMsg) { + if (tbe.wtData == true) { + // do nothing + } else if (tbe.Dirty == false) { + tbe.DataBlk := getDirectoryEntry(address).DataBlk; + } + tbe.MemData := true; + } + } + + action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + if (tbe.wtData) { + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + tbe.writeMask.fillMask(); + } else if (tbe.Dirty) { + if(tbe.atomicData == false && tbe.wtData == false) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + } else { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.LastSender := in_msg.Sender; + } + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" setting bit to sink VicDirty "); + } + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + APPEND_TRANSITION_COMMENT(" Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(o_checkForCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") { + peek(requestNetwork_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { + peek(responseNetwork_in, ResponseMsg) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote 
data to L3 "); + entry.DataBlk := in_msg.DataBlk; + + entry.LastSender := in_msg.Sender; + } + } + } + + action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") { + if ((tbe.wtData || tbe.atomicData) && useL3OnWT) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(sf_setForwardReqTime, "sf", desc="...") { + tbe.ForwardRequestTime := curCycle(); + } + + action(dl_deallocateL3, "dl", desc="deallocate the L3 block") { + L3CacheMemory.deallocate(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pm_popMemQueue, "pm", desc="pop mem queue") { + memQueue_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") { + L3TriggerQueue_in.dequeue(clockEdge()); + } + + action(pu_popUnblockQueue, "pu", desc="pop unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "zz", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(yy_recycleResponseQueue, "yy", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(requestNetwork_in, address); + } + + action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") { + wakeUpBuffers(address); + } + + action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(z_stall, "z", desc="...") { + } + + // TRANSITIONS + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { + st_stallAndWaitRequest; + } + + // It may be possible to save multiple invalidations here! 
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) { + st_stallAndWaitRequest; + } + + + // transitions from U + transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sc_probeShrCoreData; + p_popRequestQueue; + } + + transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite} { + t_allocateTBE; + w_sendResponseWBAck; + l_queueMemRdReq; + dc_probeInvCoreData; + p_popRequestQueue; + } + + transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite} { + t_allocateTBE; + l_queueMemRdReq; + dc_probeInvCoreData; + p_popRequestQueue; + } + + transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + dc_probeInvCoreData; + p_popRequestQueue; + } + + transition(U, RdBlk, B_PM) {L3TagArrayRead}{ + t_allocateTBE; + l_queueMemRdReq; + sc_probeShrCoreData; + p_popRequestQueue; + } + + transition(U, CtoD, BP) {L3TagArrayRead} { + t_allocateTBE; + ic_probeInvCore; + p_popRequestQueue; + } + + transition(U, VicDirty, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(U, VicClean, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(BL, {VicDirty, VicClean}) { + zz_recycleRequestQueue; + } + + transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + d_writeDataToMemory; + al_allocateL3Block; + wa_wakeUpDependents; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BL, StaleWB, U) {L3TagArrayWrite} { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pr_popResponseQueue; + } + + transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { + z_stall; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, WBAck) { + pm_popMemQueue; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) { + rv_removeVicDirtyIgnore; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({B}, CoreUnblock, U) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(B, UnblockWriteThrough, U) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_PM, MemData, BS_Pm) {} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BM_PM, MemData, BM_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(B_PM, MemData, B_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BS_PM, L3Hit, BS_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BM_PM, L3Hit, BM_Pm) {} { + ptl_popTriggerQueue; + } + + transition(B_PM, L3Hit, B_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} 
{ + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) { + y_writeProbeDataToTBE; + x_decrementAcks; + o_checkForCompletion; + pr_popResponseQueue; + } + + transition(BS_PM, ProbeAcksComplete, BS_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BM_PM, ProbeAcksComplete, BM_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(B_PM, ProbeAcksComplete, B_M){} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } +} diff --git a/src/mem/protocol/MOESI_AMD_Base-msg.sm b/src/mem/protocol/MOESI_AMD_Base-msg.sm new file mode 100644 index 000000000..ff8842369 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-msg.sm @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + + +enumeration(CoherenceRequestType, desc="Coherence Request Types") { + // CPU Request Types ONLY + RdBlk, desc="Read Blk"; + RdBlkM, desc="Read Blk Modified"; + RdBlkS, desc="Read Blk Shared"; + CtoD, desc="Change To Dirty"; + VicClean, desc="L2 clean eviction"; + VicDirty, desc="L2 dirty eviction"; + Atomic, desc="Upper level atomic"; + AtomicWriteBack, desc="Upper level atomic"; + WriteThrough, desc="Ordered WriteThrough w/Data"; + WriteThroughFifo, desc="WriteThrough with no data"; + WriteThroughDummy, desc="WriteThrough with no data for atomic operation"; + WriteFlush, desc="Release Flush"; + + WrCancel, desc="want to cancel WB to Memory"; // should this be here? + + WBApproval, desc="WB Approval"; + + // Messages between Dir and R-Dir + ForceInv, desc="Send invalide to the block"; + ForceDowngrade, desc="Send downgrade to the block"; + Unblock, desc="Used to let the dir know a message has been sunk"; + + // Messages between R-Dir and R-Buffer + PrivateNotify, desc="Let region buffer know it has private access"; + SharedNotify, desc="Let region buffer know it has shared access"; + WbNotify, desc="Let region buffer know it saw its wb request"; + Downgrade, desc="Force the region buffer to downgrade to shared"; + // Response to R-Dir (probably should be on a different network, but + // I need it to be ordered with respect to requests) + InvAck, desc="Let the R-Dir know when the inv has occured"; + + PrivateRequest, desc="R-buf wants the region in private"; + UpgradeRequest, desc="R-buf wants the region in private"; + SharedRequest, desc="R-buf wants the region in shared (could respond with private)"; + CleanWbRequest, desc="R-buf wants to deallocate clean region"; + + NA, desc="So we don't get segfaults"; +} + +enumeration(ProbeRequestType, desc="Probe Request Types") { + PrbDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS + PrbInv, desc="Probe to Invalidate"; + + // For regions + PrbRepl, desc="Force the cache to do a replacement"; + PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS + PrbAtomic, desc="Forwarded Atomic Operation"; +} + + +enumeration(CoherenceResponseType, desc="Coherence Response Types") { + NBSysResp, desc="Northbridge response to CPU Rd request"; + NBSysWBAck, desc="Northbridge response ok to WB"; + TDSysResp, desc="TCCdirectory response to CPU Rd request"; + TDSysWBAck, desc="TCCdirectory response ok to WB"; + TDSysWBNack, desc="TCCdirectory response ok to drop"; + CPUPrbResp, desc="CPU Probe Response"; + CPUData, desc="CPU Data"; + StaleNotif, desc="Notification of Stale WBAck, No data to writeback"; + CPUCancelWB, desc="want to cancel WB to Memory"; + MemData, desc="Data from Memory"; + + // for regions + PrivateAck, desc="Ack that r-buf received private notify"; + RegionWbAck, desc="Writeback Ack that r-buf completed deallocation"; + DirReadyAck, desc="Directory (mem ctrl)<->region dir handshake"; +} + +enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") { + Modified, desc="Modified"; + Owned, desc="Owned state"; + Exclusive, desc="Exclusive"; + Shared, desc="Shared"; + NA, desc="NA"; +} + +structure(CPURequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + Addr DemandAddress, desc="Physical block address for this request"; + CoherenceRequestType Type, desc="Type of request"; + DataBlock DataBlk, desc="data for the cache line"; // only for WB + bool Dirty, desc="whether WB data is dirty"; // only for WB + MachineID Requestor, 
desc="Node who initiated the request"; + NetDest Destination, desc="Multicast destination mask"; + bool Shared, desc="For CPU_WrVicBlk, vic is O not M. For CPU_ClVicBlk, vic is S"; + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + NetDest Sharers, desc="Caches that may have a valid copy of the data"; + bool ForceShared, desc="R-dir knows it is shared, pass on so it sends an S copy, not E"; + bool Private, default="false", desc="Requestor already has private permissions, no need for dir check"; + bool CtoDSinked, default="false", desc="This is true if the CtoD previously sent must have been sunk"; + + bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack"; + int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive"; + CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; + WriteMask writeMask, desc="Write Through Data"; + MachineID WTRequestor, desc="Node who initiated the write through"; + HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope"; + int wfid, default="0", desc="wavefront id"; + bool NoWriteConflict, default="true", desc="write collided with CAB entry"; + int ProgramCounter, desc="PC that accesses to this block"; + + bool functionalRead(Packet *pkt) { + // Only PUTX messages contains the data block + if (Type == CoherenceRequestType:VicDirty) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(NBProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="NB_PrbNxtState signal"; + bool ReturnData, desc="Indicates CPU should return data"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer"; + Addr DemandAddress, desc="Demand block address for a region request"; + MachineID Requestor, desc="Requestor id for 3-hop requests"; + bool NoAckNeeded, default="false", desc="For short circuting acks"; + int ProgramCounter, desc="PC that accesses to this block"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} + +structure(TDProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="TD_PrbNxtState signal"; + bool ReturnData, desc="Indicates CPU should return data"; + bool localCtoD, desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + int Phase, desc="Synchronization Phase"; + int wfid, desc="wavefront id for Release"; + 
MachineID Requestor, desc="Node who initiated the request"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } +} + +// Response Messages seemed to be easily munged into one type +structure(ResponseMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe"; + MachineID Sender, desc="Node who sent the data"; + NetDest Destination, desc="Node to whom the data is sent"; + // Begin Used Only By CPU Response + DataBlock DataBlk, desc="data for the cache line"; + bool Hit, desc="probe hit valid line"; + bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + bool Ntsl, desc="indicates probed lin will be invalid after probe"; + bool UntransferredOwner, desc="pending confirmation of ownership change"; + // End Used Only By CPU Response + + // Begin NB Response Only + CoherenceState State, default=CoherenceState_NA, desc="What returned data from NB should be in"; + bool CtoD, desc="was the originator a CtoD?"; + // End NB Response Only + + // Normally if a block gets hit by a probe while waiting to be written back, + // you flip the NbReqShared signal (part of the CPURequest signal group). + // But since this is in packets and I don't want to send a separate packet, + // let's just send this signal back with the data instead + bool NbReqShared, desc="modification of Shared field from initial request, e.g. hit by shared probe"; + + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + bool L3Hit, default="false", desc="Did memory or L3 supply the data?"; + MachineID OriginalResponder, desc="Mach which wrote the data to the L3"; + MachineID WTRequestor, desc="Node who started the writethrough"; + + bool NotCached, default="false", desc="True when the Region buffer has already evicted the line"; + + bool NoAckNeeded, default="false", desc="For short circuting acks"; + bool isValid, default="false", desc="Is acked block valid"; + int wfid, default="0", desc="wavefront id"; + int Phase, desc="Synchronization Phase"; + + int ProgramCounter, desc="PC that issues this request"; + bool mispred, desc="tell TCP if the block should not be bypassed"; + + + bool functionalRead(Packet *pkt) { + // Only PUTX messages contains the data block + if (Type == CoherenceResponseType:CPUData || + Type == CoherenceResponseType:MemData) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(UnblockMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + NetDest Destination, desc="Destination (always directory)"; + MessageSizeType MessageSize, desc="size category of the message"; + MachineID Sender, desc="Node who sent the data"; + bool currentOwner, 
default="false", desc="Is the sender the current owner"; + bool DoneAck, default="false", desc="Is this a done ack?"; + bool Dirty, default="false", desc="Was block dirty when evicted"; + bool wasValid, default="false", desc="Was block valid when evicted"; + bool valid, default="false", desc="Is block valid"; + bool validToInvalid, default="false", desc="Was block valid when evicted"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } +} + +enumeration(TriggerType, desc="Trigger Type") { + L2_to_L1, desc="L2 to L1 fill"; + AcksComplete, desc="NB received all needed Acks"; + + // For regions + InvNext, desc="Invalidate the next block"; + PrivateAck, desc="Loopback ack for machines with no Region Buffer"; + AllOutstanding, desc="All outstanding requests have finished"; + L3Hit, desc="L3 hit in dir"; + + // For region directory once the directory is blocked + InvRegion, desc="Invalidate region"; + DowngradeRegion, desc="downgrade region"; + //For writethrough + UnblockWriteThrough, desc="unblock"; + WriteData, desc="Write to full cacheblock data"; + WriteDone, desc="Sequencer says that write is done"; + AtomicDone, desc="Atomic is done"; +} + +enumeration(CacheId, desc="Which Cache in the Core") { + L1I, desc="L1 I-cache"; + L1D0, desc="L1 D-cache cluster 0"; + L1D1, desc="L1 D-cache cluster 1"; + NA, desc="Default"; +} + +structure(TriggerMsg, desc="...", interface="Message") { + Addr addr, desc="Address"; + TriggerType Type, desc="Type of trigger"; + CacheId Dest, default="CacheId_NA", desc="Cache to invalidate"; + int ProgramCounter, desc="PC that accesses to this block"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} + +enumeration(FifoType, desc="Fifo Type") { + WriteDummy, desc="Dummy Write for atomic operation"; + WriteThrough, desc="simple writethrough request"; + WriteFlush, desc="synchronization message"; +} + +structure(FifoMsg, desc="...", interface="Message") { + Addr addr, desc="Address"; + FifoType Type, desc="WriteThrough/WriteFlush"; + int wfid, default="0",desc="wavefront id"; + MachineID Requestor, desc="Flush Requestor"; + MachineID oRequestor, desc="original Flush Requestor"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm new file mode 100644 index 000000000..f545c2fa7 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm @@ -0,0 +1,1408 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu, Sooraj Puthoor + */ + +/* + * This file is based on MOESI_AMD_Base.sm + * Differences with AMD base protocol + * -- Uses a probe filter memory to track sharers. + * -- The probe filter can be inclusive or non-inclusive + * -- Only two sharers tracked. Sharers are a) GPU or/and b) CPU + * -- If sharer information available, the sharer is probed + * -- If sharer information not available, probes are broadcasted + */ + +machine(MachineType:Directory, "AMD Baseline protocol") +: DirectoryMemory * directory; + CacheMemory * L3CacheMemory; + CacheMemory * ProbeFilterMemory; + Cycles response_latency := 5; + Cycles l3_hit_latency := 50; + bool noTCCdir := "False"; + bool CAB_TCC := "False"; + int TCC_select_num_bits:=1; + bool useL3OnWT := "False"; + bool inclusiveDir := "True"; + Cycles to_memory_controller_latency := 1; + + // From the Cores + MessageBuffer * requestFromCores, network="From", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseFromCores, network="From", virtual_network="2", ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCores, network="From", virtual_network="4", ordered="false", vnet_type="unblock"; + + MessageBuffer * probeToCore, network="To", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="2", ordered="false", vnet_type="response"; + + MessageBuffer * triggerQueue, ordered="true"; + MessageBuffer * L3triggerQueue, ordered="true"; + MessageBuffer * responseFromMemory; +{ + // STATES + state_declaration(State, desc="Directory states", default="Directory_State_U") { + U, AccessPermission:Backing_Store, desc="unblocked"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + // BL is Busy because it is busy waiting for the data + // which is possibly in the network. 
The cache which evicted the data + // might have moved to some other state after doing the eviction + // BS==> Received a read request; has not requested ownership + // B==> Received a read request; has requested ownership + // BM==> Received a modification request + B_P, AccessPermission:Backing_Store, desc="Back invalidation, waiting for probes"; + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + } + + // Events + enumeration(Event, desc="Directory events") { + // CPU requests + RdBlkS, desc="..."; + RdBlkM, desc="..."; + RdBlk, desc="..."; + CtoD, desc="..."; + WriteThrough, desc="WriteThrough Message"; + Atomic, desc="Atomic Message"; + + // writebacks + VicDirty, desc="..."; + VicClean, desc="..."; + CPUData, desc="WB data from CPU"; + StaleWB, desc="Notification that WB has been superceded by a probe"; + + // probe responses + CPUPrbResp, desc="Probe Response Msg"; + + ProbeAcksComplete, desc="Probe Acks Complete"; + + L3Hit, desc="Hit in L3 return data to core"; + + // Replacement + PF_Repl, desc="Replace address from probe filter"; + + // Memory Controller + MemData, desc="Fetched data from memory arrives"; + WBAck, desc="Writeback Ack from memory arrives"; + + CoreUnblock, desc="Core received data, unblock"; + UnblockWriteThrough, desc="Unblock because of writethrough request finishing"; + + StaleVicDirty, desc="Core invalidated before VicDirty processed"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L3DataArrayRead, desc="Read the data array"; + L3DataArrayWrite, desc="Write the data array"; + L3TagArrayRead, desc="Read the data array"; + L3TagArrayWrite, desc="Write the data array"; + + PFTagArrayRead, desc="Read the data array"; + PFTagArrayWrite, desc="Write the data array"; + } + + // TYPES + + enumeration(ProbeFilterState, desc="") { + T, desc="Tracked"; + NT, desc="Not tracked"; + B, desc="Blocked, This entry is being replaced"; + } + + // DirectoryEntry + structure(Entry, desc="...", interface="AbstractEntry") { + State DirectoryState, desc="Directory state"; + DataBlock DataBlk, desc="data for the block"; + NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore"; + } + + structure(CacheEntry, desc="...", interface="AbstractCacheEntry") { + DataBlock DataBlk, desc="data for the block"; + MachineID LastSender, desc="Mach which this block came from"; + ProbeFilterState pfState, desc="ProbeFilter state",default="Directory_ProbeFilterState_NT"; + bool isOnCPU, desc="Block valid in the CPU complex",default="false"; + bool isOnGPU, desc="Block valid in the GPU complex",default="false"; + } + + structure(TBE, desc="...") { + State TBEState, 
desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID WTRequestor, desc="WT Requestor"; + bool Cached, desc="data hit in Cache"; + bool MemData, desc="Got MemData?",default="false"; + bool wtData, desc="Got write through data?",default="false"; + bool atomicData, desc="Got Atomic op?",default="false"; + Cycles InitialRequestTime, desc="..."; + Cycles ForwardRequestTime, desc="..."; + Cycles ProbeRequestStartTime, desc="..."; + MachineID LastSender, desc="Mach which this block came from"; + bool L3Hit, default="false", desc="Was this an L3 hit?"; + uint64_t probe_id, desc="probe id for lifetime profiling"; + WriteMask writeMask, desc="outstanding write through mask"; + Addr demandAddress, desc="Address of demand request which caused probe filter eviction"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs"; + + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_tbe(TBE a); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" { + Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr)); + + if (is_valid(dir_entry)) { + //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk); + return dir_entry; + } + + dir_entry := static_cast(Entry, "pointer", + directory.allocate(addr, new Entry)); + return dir_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if (is_valid(tbe) && tbe.MemData) { + DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); + return tbe.DataBlk; + } + DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); + return getDirectoryEntry(addr).DataBlk; + } + + State getState(TBE tbe, CacheEntry entry, Addr addr) { + CacheEntry probeFilterEntry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(addr)); + if (inclusiveDir) { + if (is_valid(probeFilterEntry) && probeFilterEntry.pfState == ProbeFilterState:B) { + return State:B_P; + } + } + return getDirectoryEntry(addr).DirectoryState; + } + + void setState(TBE tbe, CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).DirectoryState := state; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + // For this Directory, all permissions are just tracked in Directory, since + // it's not possible to have something in TBE but not Dir, just keep track + // of state all in one place. 
+ if (directory.isPresent(addr)) { + return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state)); + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:PFTagArrayRead) { + ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:PFTagArrayWrite) { + ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:PFTagArrayRead) { + return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:PFTagArrayWrite) { + return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + bool isNotPresentProbeFilter(Addr address) { + if (ProbeFilterMemory.isTagPresent(address) || + ProbeFilterMemory.cacheAvail(address)) { + return false; + } + return true; + } + + bool isGPUSharer(Addr address) { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if (entry.pfState == ProbeFilterState:NT) { + return true; + } else if (entry.isOnGPU){ + return true; + } + return false; + } + + bool isCPUSharer(Addr address) { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if (entry.pfState == ProbeFilterState:NT) { + return true; + } else if (entry.isOnCPU){ + return true; + } + return false; + } + + + // ** OUT_PORTS ** + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); + out_port(responseNetwork_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue); + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=5) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, 
"pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe); + }else if (in_msg.Type == TriggerType:UnblockWriteThrough) { + trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) { + if (L3TriggerQueue_in.isReady(clockEdge())) { + peek(L3TriggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:L3Hit) { + trigger(Event:L3Hit, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + // Unblock Network + in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + trigger(Event:CoreUnblock, in_msg.addr, entry, tbe); + } + } + } + + // Core response network + in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:CPUData) { + trigger(Event:CPUData, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, entry, tbe); + } else { + error("Unexpected response type"); + } + } + } + } + + // off-chip memory request/response is done + in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) { + if (memQueue_in.isReady(clockEdge())) { + peek(memQueue_in, MemoryMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == MemoryRequestType:MEMORY_READ) { + trigger(Event:MemData, in_msg.addr, entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) { + trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them. 
+ } else { + DPRINTF(RubySlicc, "%s\n", in_msg.Type); + error("Invalid message"); + } + } + } + } + + in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (inclusiveDir && isNotPresentProbeFilter(in_msg.addr)) { + Addr victim := ProbeFilterMemory.cacheProbe(in_msg.addr); + tbe := TBEs.lookup(victim); + entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(victim)); + trigger(Event:PF_Repl, victim, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThrough, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicDirty, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicClean, in_msg.addr, entry, tbe); + } + } else { + error("Bad request message type"); + } + } + } + } + + // Actions + action(s_sendResponseS, "s", desc="send Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(es_sendResponseES, "es", desc="send Exclusive or Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached) { + 
out_msg.State := CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + // write-through and atomics do not send an unblock ack back to the + // directory. Hence, directory has to generate a self unblocking + // message. Additionally, write through's does not require data + // in its response. Hence, write through is treated seperately from + // write-back and atomics + action(m_sendResponseM, "m", desc="send Modified response") { + if (tbe.wtData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + }else{ + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + if(tbe.atomicData){ + out_msg.WTRequestor := tbe.WTRequestor; + } + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + if (tbe.atomicData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + } + + action(c_sendResponseCtoD, "c", desc="send CtoD Ack") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + } + } + } + + action(l_queueMemWBReq, "lq", desc="Write WB data to memory") { + peek(responseNetwork_in, ResponseMsg) { + queueMemoryWrite(machineID, address, to_memory_controller_latency, + in_msg.DataBlk); + } + } + + action(l_queueMemRdReq, "lr", desc="Read data from memory") { + peek(requestNetwork_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + 
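        // L3 hit: source the block from the L3 entry via the L3 trigger queue (charged
        // l3_hit_latency) instead of issuing a read to the memory controller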
enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + tbe.DataBlk := entry.DataBlk; + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + } + + // add relevant TCC node to list. This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if ((in_msg.Type == CoherenceRequestType:WriteThrough || + in_msg.Type == CoherenceRequestType:Atomic) && + in_msg.NoWriteConflict) { + // Don't Include TCCs unless there was write-CAB conflict in the TCC + } else if(noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", out_msg); + APPEND_TRANSITION_COMMENT(" dc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(bp_backProbe, "bp", desc="back probe") { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + // won't be realistic for multisocket + out_msg.Destination.broadcast(MachineType:CorePair); + } + // add relevant TCC node to the list. This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if (noTCCdir) { + //Don't need to notify TCC about reads + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + } + if (noTCCdir && CAB_TCC) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } + } + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + APPEND_TRANSITION_COMMENT(" - back probe"); + tbe.ProbeRequestStartTime := curCycle(); + } + } + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? 
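      // the request message is peeked here so the original requestor can be removed from
      // the probe's destination set below (out_msg.Destination.remove(in_msg.Requestor))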
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + } + // add relevant TCC node to the list. This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if (noTCCdir) { + //Don't need to notify TCC about reads + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + } + if (noTCCdir && CAB_TCC) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + } + + // add relevant TCC node to the list. 
This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if (noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(sm_setMRU, "sm", desc="set probe filter entry as MRU") { + ProbeFilterMemory.setMRU(address); + } + + action(d_writeDataToMemory, "d", desc="Write data to memory") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing Data: %s to address %s\n", in_msg.DataBlk, + in_msg.addr); + } + } + + action(te_allocateTBEForEviction, "te", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.writeMask.clear(); + tbe.wtData := false; + tbe.atomicData := false; + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + tbe.NumPendingAcks := 0; + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(requestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask); + tbe.Dirty := false; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := in_msg.InitialRequestTime; + } + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + TBEs.deallocate(address); + unset_tbe(); + } + + action(wd_writeBackData, "wd", desc="Write back data if needed") { + if (tbe.wtData) { + DataBlock tmp := getDirectoryEntry(address).DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.atomicData) { + tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk, + tbe.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + } + + action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") { + peek(memQueue_in, MemoryMsg) { + if (tbe.wtData == true) { + // DO Nothing (already have the directory data) + } else if (tbe.Dirty == false) { + tbe.DataBlk := getDirectoryEntry(address).DataBlk; + } + tbe.MemData := true; + } + } + + 
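The write-back and probe-data actions around this point all hinge on Ruby's partial-write merge: wd_writeBackData overlays only the bytes covered by tbe.writeMask onto the block obtained from memory or L3, and y_writeProbeDataToTBE (below) applies the same mask to dirty probe data. As a rough illustration of that merge rule only, here is a small standalone C++ sketch; DataBlock, WriteMask and the 64-byte line size are simplified stand-ins for the real Ruby classes under src/mem/ruby, not the gem5 implementation.

// Minimal standalone model of the copyPartial() merge used by the
// wd_writeBackData / y_writeProbeDataToTBE actions. Class names and the
// 64-byte line size are illustrative assumptions, not gem5's own code.
#include <array>
#include <bitset>
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::size_t kLineBytes = 64;

struct WriteMask {
    std::bitset<kLineBytes> bytes;           // one bit per byte of the line
    void setRange(std::size_t off, std::size_t len) {
        for (std::size_t i = off; i < off + len && i < kLineBytes; ++i)
            bytes.set(i);
    }
    void fillMask() { bytes.set(); }         // mark the whole line as written
};

struct DataBlock {
    std::array<uint8_t, kLineBytes> data{};
    // Copy only the bytes selected by 'mask' from 'src' into this block,
    // leaving every other byte untouched -- the merge performed between the
    // write-through data and the memory/probe copy of the line.
    void copyPartial(const DataBlock &src, const WriteMask &mask) {
        for (std::size_t i = 0; i < kLineBytes; ++i)
            if (mask.bytes.test(i))
                data[i] = src.data[i];
    }
};

int main() {
    DataBlock memData;                       // block as returned by memory/L3
    memData.data.fill(0xAA);

    DataBlock wtData;                        // partial write-through data in the TBE
    wtData.data.fill(0x55);
    WriteMask mask;
    mask.setRange(8, 16);                    // the core wrote bytes [8, 24)

    // Same order as the directory: start from the memory copy, then overlay
    // the bytes the core actually wrote before the block is written back.
    DataBlock merged = memData;
    merged.copyPartial(wtData, mask);

    std::printf("byte 0: 0x%02X  byte 8: 0x%02X\n",
                (unsigned)merged.data[0],    // 0xAA: outside the mask, preserved
                (unsigned)merged.data[8]);   // 0x55: inside the mask, overlaid
    return 0;
}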
action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender); + DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk); + if (tbe.wtData) { + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + } else if (tbe.Dirty) { + if(tbe.atomicData == false && tbe.wtData == false) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + } else { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.LastSender := in_msg.Sender; + } + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") { + peek(responseNetwork_in, ResponseMsg) { + DPRINTF(RubySlicc, "Write cancel bit set on address %s\n", address); + getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" setting bit to sink VicDirty "); + } + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + APPEND_TRANSITION_COMMENT(" Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(o_checkForCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") { + peek(requestNetwork_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { + peek(responseNetwork_in, ResponseMsg) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + + entry.LastSender := in_msg.Sender; + } + } + } + + action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") { + if ((tbe.wtData || tbe.atomicData) && useL3OnWT) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, 
to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(apf_allocateProbeFilterEntry, "apf", desc="Allocate probe filte entry") { + if (!ProbeFilterMemory.isTagPresent(address)) { + if (inclusiveDir) { + assert(ProbeFilterMemory.cacheAvail(address)); + } else if (ProbeFilterMemory.cacheAvail(address) == false) { + Addr victim := ProbeFilterMemory.cacheProbe(address); + ProbeFilterMemory.deallocate(victim); + } + assert(ProbeFilterMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" allocating a new probe filter entry"); + entry.pfState := ProbeFilterState:NT; + if (inclusiveDir) { + entry.pfState := ProbeFilterState:T; + } + entry.isOnCPU := false; + entry.isOnGPU := false; + } + } + + action(mpfe_markPFEntryForEviction, "mpfe", desc="Mark this PF entry is being evicted") { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + entry.pfState := ProbeFilterState:B; + peek(requestNetwork_in, CPURequestMsg) { + tbe.demandAddress := in_msg.addr; + } + } + + action(we_wakeUpEvictionDependents, "we", desc="Wake up requests waiting for demand address and victim address") { + wakeUpBuffers(address); + wakeUpBuffers(tbe.demandAddress); + } + + action(dpf_deallocateProbeFilter, "dpf", desc="deallocate PF entry") { + assert(ProbeFilterMemory.isTagPresent(address)); + ProbeFilterMemory.deallocate(address); + } + + action(upf_updateProbeFilter, "upf", desc="") { + peek(requestNetwork_in, CPURequestMsg) { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } else if (in_msg.Type == CoherenceRequestType:CtoD) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } + if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) { + entry.isOnCPU := true; + } else { + entry.isOnGPU := true; + } + } + } + + action(rmcd_removeSharerConditional, "rmcd", desc="remove sharer from probe Filter, conditional") { + peek(requestNetwork_in, CPURequestMsg) { + if (ProbeFilterMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) {//CorePair has inclusive L2 + if (in_msg.Type == CoherenceRequestType:VicDirty) { + entry.isOnCPU := false; + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + entry.isOnCPU := false; + } + } + } + } + } + + action(sf_setForwardReqTime, "sf", desc="...") { + tbe.ForwardRequestTime := curCycle(); + } + + action(dl_deallocateL3, 
"dl", desc="deallocate the L3 block") { + L3CacheMemory.deallocate(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pm_popMemQueue, "pm", desc="pop mem queue") { + memQueue_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") { + L3TriggerQueue_in.dequeue(clockEdge()); + } + + action(pu_popUnblockQueue, "pu", desc="pop unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "zz", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(yy_recycleResponseQueue, "yy", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(requestNetwork_in, address); + } + + action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") { + wakeUpBuffers(address); + } + + action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(z_stall, "z", desc="...") { + } + + // TRANSITIONS + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { + st_stallAndWaitRequest; + } + + // It may be possible to save multiple invalidations here! + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {Atomic, WriteThrough}) { + st_stallAndWaitRequest; + } + + + // transitions from U + transition(U, PF_Repl, B_P) {PFTagArrayRead, PFTagArrayWrite}{ + te_allocateTBEForEviction; + apf_allocateProbeFilterEntry; + bp_backProbe; + sm_setMRU; + mpfe_markPFEntryForEviction; + } + + transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + sc_probeShrCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + w_sendResponseWBAck; + l_queueMemRdReq; + dc_probeInvCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + dc_probeInvCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + dc_probeInvCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, RdBlk, B_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite}{ + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + sc_probeShrCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, CtoD, BP) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + ic_probeInvCore; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, 
VicDirty, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + rmcd_removeSharerConditional; + p_popRequestQueue; + } + + transition(U, VicClean, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + rmcd_removeSharerConditional; + p_popRequestQueue; + } + + transition(BL, {VicDirty, VicClean}) { + zz_recycleRequestQueue; + } + + transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + d_writeDataToMemory; + al_allocateL3Block; + wa_wakeUpDependents; + dt_deallocateTBE; + //l_queueMemWBReq; // why need an ack? esp. with DRAMSim, just put it in queue no ack needed + pr_popResponseQueue; + } + + transition(BL, StaleWB, U) {L3TagArrayWrite} { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pr_popResponseQueue; + } + + transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P}, {VicDirty, VicClean}) { + z_stall; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, WBAck) { + pm_popMemQueue; + } + + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, PF_Repl) { + zz_recycleRequestQueue; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, StaleVicDirty) { + rv_removeVicDirtyIgnore; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({B}, CoreUnblock, U) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(B, UnblockWriteThrough, U) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_PM, MemData, BS_Pm) {} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BM_PM, MemData, BM_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(B_PM, MemData, B_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BS_PM, L3Hit, BS_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BM_PM, L3Hit, BM_Pm) {} { + ptl_popTriggerQueue; + } + + transition(B_PM, L3Hit, B_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, BP}, CPUPrbResp) { + y_writeProbeDataToTBE; + x_decrementAcks; + o_checkForCompletion; + pr_popResponseQueue; + } + + transition(BS_PM, ProbeAcksComplete, BS_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BM_PM, ProbeAcksComplete, BM_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(B_PM, ProbeAcksComplete, B_M){} { + 
sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_P, ProbeAcksComplete, U) { + wd_writeBackData; + alwt_allocateL3BlockOnWT; + we_wakeUpEvictionDependents; + dpf_deallocateProbeFilter; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } +} diff --git a/src/mem/protocol/MOESI_AMD_Base.slicc b/src/mem/protocol/MOESI_AMD_Base.slicc new file mode 100644 index 000000000..b38145246 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base.slicc @@ -0,0 +1,6 @@ +protocol "MOESI_AMD_Base"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "MOESI_AMD_Base-L3cache.sm"; +include "MOESI_AMD_Base-dir.sm"; diff --git a/src/mem/protocol/RubySlicc_ComponentMapping.sm b/src/mem/protocol/RubySlicc_ComponentMapping.sm index a72492b42..e1d7c4399 100644 --- a/src/mem/protocol/RubySlicc_ComponentMapping.sm +++ b/src/mem/protocol/RubySlicc_ComponentMapping.sm @@ -37,7 +37,10 @@ MachineID mapAddressToRange(Addr addr, MachineType type, NetDest broadcast(MachineType type); MachineID map_Address_to_DMA(Addr addr); MachineID map_Address_to_Directory(Addr addr); +MachineID map_Address_to_RegionDir(Addr addr); NodeID map_Address_to_DirectoryNode(Addr addr); +MachineID map_Address_to_TCCdir(Addr addr); +NodeID map_Address_to_TCCdirNode(Addr addr); NodeID machineIDToNodeID(MachineID machID); NodeID machineIDToVersion(MachineID machID); MachineType machineIDToMachineType(MachineID machID); diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm index 5ee26d65c..c743ebe28 100644 --- a/src/mem/protocol/RubySlicc_Exports.sm +++ b/src/mem/protocol/RubySlicc_Exports.sm @@ -62,7 +62,7 @@ bool testAndWrite(Addr addr, DataBlock datablk, Packet *pkt); // AccessPermission // The following five states define the access permission of all memory blocks. -// These permissions have multiple uses. They coordinate locking and +// These permissions have multiple uses. They coordinate locking and // synchronization primitives, as well as enable functional accesses. // One should not need to add any additional permission values and it is very // risky to do so. @@ -73,7 +73,7 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent") Read_Write, desc="block is Read/Write"; // Possibly Invalid data - // The maybe stale permission indicates that accordingly to the protocol, + // The maybe stale permission indicates that accordingly to the protocol, // there is no guarantee the block contains valid data. However, functional // writes should update the block because a dataless PUT request may // revalidate the block's data. 
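The CPUPrbResp/ProbeAcksComplete handshake in MOESI_AMD_Base-dir.sm above is a count-to-zero pattern: each probe response decrements the TBE's NumPendingAcks (x_decrementAcks), and once it reaches zero a TriggerType:AcksComplete message is enqueued, which drives the ProbeAcksComplete transitions (o_checkForCompletion). A minimal C++ sketch of that pattern, for illustration only; the struct and callback names here are invented and are not part of the patch:

#include <cassert>
#include <functional>

// Per-transaction state, analogous to the TBE's NumPendingAcks field.
struct AckCollector {
    int pendingAcks = 0;               // set when the probes are sent out
    std::function<void()> onComplete;  // stands in for enqueuing TriggerType:AcksComplete

    // Called once per incoming probe response, i.e. the x_decrementAcks
    // plus o_checkForCompletion sequence run on every CPUPrbResp.
    void probeResponse() {
        assert(pendingAcks > 0);
        --pendingAcks;
        if (pendingAcks == 0 && onComplete)
            onComplete();
    }
};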
@@ -227,6 +227,13 @@ enumeration(MachineType, desc="...", default="MachineType_NULL") { Collector, desc="Collector Mach"; L1Cache_wCC, desc="L1 Cache Mach to track cache-to-cache transfer (used for miss latency profile)"; L2Cache_wCC, desc="L2 Cache Mach to track cache-to-cache transfer (used for miss latency profile)"; + CorePair, desc="Cache Mach (2 cores, Private L1Ds, Shared L1I & L2)"; + TCP, desc="GPU L1 Data Cache (Texture Cache per Pipe)"; + TCC, desc="GPU L2 Shared Cache (Texture Cache per Channel)"; + TCCdir, desc="Directory at the GPU L2 Cache (TCC)"; + SQC, desc="GPU L1 Instr Cache (Sequencer Cache)"; + RegionDir, desc="Region-granular directory"; + RegionBuffer,desc="Region buffer for CPU and GPU"; NULL, desc="null mach type"; } diff --git a/src/mem/protocol/RubySlicc_Types.sm b/src/mem/protocol/RubySlicc_Types.sm index a6c57e1b0..b8d284725 100644 --- a/src/mem/protocol/RubySlicc_Types.sm +++ b/src/mem/protocol/RubySlicc_Types.sm @@ -31,8 +31,8 @@ // // **PLEASE NOTE!** When adding objects to this file you must also add a line -// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh -// file will not be copied to the protocol directory and you will encounter a +// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh +// file will not be copied to the protocol directory and you will encounter a // undefined declaration error. // @@ -95,6 +95,8 @@ structure (NetDest, external = "yes", non_obj="yes") { bool intersectionIsEmpty(Set); bool intersectionIsEmpty(NetDest); MachineID smallestElement(MachineType); + NetDest OR(NetDest); + NetDest AND(NetDest); } structure (Sequencer, external = "yes") { @@ -117,6 +119,44 @@ structure (Sequencer, external = "yes") { void invalidateSC(Addr); } +structure (GPUCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void checkCoherence(Addr); + void evictionCallback(Addr); + void recordCPReadCallBack(MachineID, MachineID); + void recordCPWriteCallBack(MachineID, MachineID); +} + +structure (VIPERCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void invCallback(Addr); + void wbCallback(Addr); + void checkCoherence(Addr); + void evictionCallback(Addr); +} + structure(RubyRequest, desc="...", interface="Message", external="yes") { Addr LineAddress, desc="Line address for this request"; Addr PhysicalAddress, desc="Physical address for this request"; @@ -161,6 +201,7 @@ structure (CacheMemory, external = "yes") { Cycles getTagLatency(); Cycles getDataLatency(); void setMRU(Addr); + void setMRU(Addr, int); void setMRU(AbstractCacheEntry); void 
recordRequestType(CacheRequestType, Addr); bool checkResourceAvailable(CacheResourceType, Addr); diff --git a/src/mem/protocol/SConsopts b/src/mem/protocol/SConsopts index ca432a73e..47b36e276 100644 --- a/src/mem/protocol/SConsopts +++ b/src/mem/protocol/SConsopts @@ -33,6 +33,11 @@ import os Import('*') all_protocols.extend([ + 'GPU_VIPER', + 'GPU_VIPER_Baseline', + 'GPU_VIPER_Region', + 'GPU_RfO', + 'MOESI_AMD_Base', 'MESI_Two_Level', 'MESI_Three_Level', 'MI_example', diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript index 16e932432..82a16c9b0 100644 --- a/src/mem/ruby/SConscript +++ b/src/mem/ruby/SConscript @@ -124,13 +124,20 @@ MakeInclude('common/Set.hh') MakeInclude('common/WriteMask.hh') MakeInclude('filters/AbstractBloomFilter.hh') MakeInclude('network/MessageBuffer.hh') -MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/CacheMemory.hh') -MakeInclude('system/DMASequencer.hh') MakeInclude('structures/DirectoryMemory.hh') -MakeInclude('structures/WireBuffer.hh') MakeInclude('structures/PerfectCacheMemory.hh') MakeInclude('structures/PersistentTable.hh') -MakeInclude('system/Sequencer.hh') +MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/TBETable.hh') MakeInclude('structures/TimerTable.hh') +MakeInclude('structures/WireBuffer.hh') +MakeInclude('system/DMASequencer.hh') +MakeInclude('system/Sequencer.hh') + +# External types : Group "mem/protocol" : include "header.hh" to the bottom +# of this MakeIncludes if it is referenced as +# <# include "mem/protocol/header.hh"> in any file +# generated_dir = Dir('../protocol') +MakeInclude('system/GPUCoalescer.hh') +MakeInclude('system/VIPERCoalescer.hh') diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc index b3b37e5a6..7d3f20982 100644 --- a/src/mem/ruby/profiler/Profiler.cc +++ b/src/mem/ruby/profiler/Profiler.cc @@ -269,7 +269,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { m_outstandReqHist.add(seq->getOutstandReqHist()); } @@ -282,7 +282,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { // add all the latencies m_latencyHist.add(seq->getLatencyHist()); diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh index 926556781..cbd068c04 100644 --- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh +++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh @@ -56,6 +56,12 @@ class AbstractCacheEntry : public AbstractEntry virtual DataBlock& getDataBlk() { panic("getDataBlk() not implemented!"); } + int validBlocks; + virtual int& getNumValidBlocks() + { + return validBlocks; + } + // Functions for locking and unlocking the cache entry. These are required // for supporting atomic memory accesses. 
void setLocked(int context); diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc index 93fe50c88..458fde5bc 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -200,6 +200,12 @@ AbstractController::unblock(Addr addr) } } +bool +AbstractController::isBlocked(Addr addr) +{ + return (m_block_map.count(addr) > 0); +} + BaseMasterPort & AbstractController::getMasterPort(const std::string &if_name, PortID idx) diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 383507eed..4488ee3f4 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -73,6 +73,7 @@ class AbstractController : public MemObject, public Consumer // return instance name void blockOnQueue(Addr, MessageBuffer*); void unblock(Addr); + bool isBlocked(Addr); virtual MessageBuffer* getMandatoryQueue() const = 0; virtual MessageBuffer* getMemoryQueue() const = 0; @@ -84,7 +85,7 @@ class AbstractController : public MemObject, public Consumer virtual void regStats(); virtual void recordCacheTrace(int cntrl, CacheRecorder* tr) = 0; - virtual Sequencer* getSequencer() const = 0; + virtual Sequencer* getCPUSequencer() const = 0; //! These functions are used by ruby system to read/write the data blocks //! that exist with in the controller. diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh index 46071335e..cdedc2e14 100644 --- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh +++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh @@ -43,6 +43,12 @@ map_Address_to_DirectoryNode(Addr addr) return DirectoryMemory::mapAddressToDirectoryVersion(addr); } +inline NodeID +map_Address_to_TCCdirNode(Addr addr) +{ + return DirectoryMemory::mapAddressToDirectoryVersion(addr); +} + // used to determine the home directory // returns a value between 0 and total_directories_within_the_system inline MachineID @@ -53,6 +59,22 @@ map_Address_to_Directory(Addr addr) return mach; } +inline MachineID +map_Address_to_RegionDir(Addr addr) +{ + MachineID mach = {MachineType_RegionDir, + map_Address_to_DirectoryNode(addr)}; + return mach; +} + +inline MachineID +map_Address_to_TCCdir(Addr addr) +{ + MachineID mach = + {MachineType_TCCdir, map_Address_to_TCCdirNode(addr)}; + return mach; +} + inline NetDest broadcast(MachineType type) { @@ -102,4 +124,11 @@ createMachineID(MachineType type, NodeID id) return mach; } +inline MachineID +MachineTypeAndNodeIDToMachineID(MachineType type, NodeID node) +{ + MachineID mach = {type, node}; + return mach; +} + #endif // __MEM_RUBY_SLICC_INTERFACE_COMPONENTMAPPINGS_HH__ diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc index a8a3ba949..45fb85d05 100644 --- a/src/mem/ruby/structures/CacheMemory.cc +++ b/src/mem/ruby/structures/CacheMemory.cc @@ -35,6 +35,7 @@ #include "mem/protocol/AccessPermission.hh" #include "mem/ruby/structures/CacheMemory.hh" #include "mem/ruby/system/RubySystem.hh" +#include "mem/ruby/system/WeightedLRUPolicy.hh" using namespace std; @@ -66,29 +67,27 @@ CacheMemory::CacheMemory(const Params *p) m_start_index_bit = p->start_index_bit; m_is_instruction_only_cache = p->is_icache; m_resource_stalls = p->resourceStalls; + m_block_size = p->block_size; // may be 0 at this point. 
Updated in init() } void CacheMemory::init() { - m_cache_num_sets = (m_cache_size / m_cache_assoc) / - RubySystem::getBlockSizeBytes(); + if (m_block_size == 0) { + m_block_size = RubySystem::getBlockSizeBytes(); + } + m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size; assert(m_cache_num_sets > 1); m_cache_num_set_bits = floorLog2(m_cache_num_sets); assert(m_cache_num_set_bits > 0); - m_cache.resize(m_cache_num_sets); - for (int i = 0; i < m_cache_num_sets; i++) { - m_cache[i].resize(m_cache_assoc); - for (int j = 0; j < m_cache_assoc; j++) { - m_cache[i][j] = NULL; - } - } + m_cache.resize(m_cache_num_sets, + std::vector<AbstractCacheEntry*>(m_cache_assoc, nullptr)); } CacheMemory::~CacheMemory() { - if (m_replacementPolicy_ptr != NULL) + if (m_replacementPolicy_ptr) delete m_replacementPolicy_ptr; for (int i = 0; i < m_cache_num_sets; i++) { for (int j = 0; j < m_cache_assoc; j++) { @@ -359,6 +358,37 @@ CacheMemory::setMRU(const AbstractCacheEntry *e) } void +CacheMemory::setMRU(Addr address, int occupancy) +{ + int64_t cacheSet = addressToCacheSet(address); + int loc = findTagInSet(cacheSet, address); + + if(loc != -1) { + if (m_replacementPolicy_ptr->useOccupancy()) { + (static_cast<WeightedLRUPolicy*>(m_replacementPolicy_ptr))-> + touch(cacheSet, loc, curTick(), occupancy); + } else { + m_replacementPolicy_ptr-> + touch(cacheSet, loc, curTick()); + } + } +} + +int +CacheMemory::getReplacementWeight(int64_t set, int64_t loc) +{ + assert(set < m_cache_num_sets); + assert(loc < m_cache_assoc); + int ret = 0; + if(m_cache[set][loc] != NULL) { + ret = m_cache[set][loc]->getNumValidBlocks(); + assert(ret >= 0); + } + + return ret; +} + +void CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const { uint64_t warmedUpBlocks = 0; diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh index 72805b32b..5b30505d3 100644 --- a/src/mem/ruby/structures/CacheMemory.hh +++ b/src/mem/ruby/structures/CacheMemory.hh @@ -106,7 +106,8 @@ class CacheMemory : public SimObject // Set this address to most recently used void setMRU(Addr address); - // Set this entry to most recently used + void setMRU(Addr addr, int occupancy); + int getReplacementWeight(int64_t set, int64_t loc); void setMRU(const AbstractCacheEntry *e); // Functions for locking and unlocking cache lines corresponding to the @@ -146,6 +147,7 @@ class CacheMemory : public SimObject Stats::Scalar numDataArrayStalls; int getCacheSize() const { return m_cache_size; } + int getCacheAssoc() const { return m_cache_assoc; } int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; } Addr getAddressAtIdx(int idx) const; @@ -182,6 +184,7 @@ class CacheMemory : public SimObject int m_cache_assoc; int m_start_index_bit; bool m_resource_stalls; + int m_block_size; }; std::ostream& operator<<(std::ostream& out, const CacheMemory& obj); diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py index 4eb87ac74..9fc4726b0 100644 --- a/src/mem/ruby/structures/RubyCache.py +++ b/src/mem/ruby/structures/RubyCache.py @@ -42,6 +42,7 @@ class RubyCache(SimObject): "") start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line"); is_icache = Param.Bool(False, "is instruction only cache"); + block_size = Param.MemorySize("0B", "block size in bytes. 
0 means default RubyBlockSize") dataArrayBanks = Param.Int(1, "Number of banks for the data array") tagArrayBanks = Param.Int(1, "Number of banks for the tag array") diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc new file mode 100644 index 000000000..db279bd3a --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/GPUCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "debug/ProtocolTrace.hh" +#include "debug/RubyPort.hh" +#include "debug/RubyStats.hh" +#include "gpu-compute/shader.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/DataBlock.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/RubyGPUCoalescer.hh" + +using namespace std; + +GPUCoalescer * +RubyGPUCoalescerParams::create() +{ + return new GPUCoalescer(this); +} + +HSAScope +reqScopeToHSAScope(Request* req) +{ + HSAScope accessScope = HSAScope_UNSPECIFIED; + if (req->isScoped()) { + if (req->isWavefrontScope()) { + accessScope = HSAScope_WAVEFRONT; + } else if (req->isWorkgroupScope()) { + accessScope = HSAScope_WORKGROUP; + } else if (req->isDeviceScope()) { + accessScope = HSAScope_DEVICE; + } else if (req->isSystemScope()) { + accessScope = HSAScope_SYSTEM; + } else { + fatal("Bad scope type"); + } + } + return accessScope; +} + +HSASegment +reqSegmentToHSASegment(Request* req) +{ + HSASegment accessSegment = HSASegment_GLOBAL; + + if (req->isGlobalSegment()) { + accessSegment = HSASegment_GLOBAL; + } else if (req->isGroupSegment()) { + accessSegment = HSASegment_GROUP; + } else if (req->isPrivateSegment()) { + accessSegment = HSASegment_PRIVATE; + } else if (req->isKernargSegment()) { + accessSegment = HSASegment_KERNARG; + } else if (req->isReadonlySegment()) { + accessSegment = HSASegment_READONLY; + } else if (req->isSpillSegment()) { + accessSegment = HSASegment_SPILL; + } else if (req->isArgSegment()) { + accessSegment = HSASegment_ARG; + } else { + fatal("Bad segment type"); + } + + return accessSegment; +} + +GPUCoalescer::GPUCoalescer(const Params *p) + : RubyPort(p), issueEvent(this), deadlockCheckEvent(this) +{ + m_store_waiting_on_load_cycles = 0; + m_store_waiting_on_store_cycles = 0; + m_load_waiting_on_store_cycles = 0; + m_load_waiting_on_load_cycles = 0; + + m_outstanding_count = 0; + + m_max_outstanding_requests = 0; + m_deadlock_threshold = 0; + m_instCache_ptr = nullptr; + m_dataCache_ptr = nullptr; + + m_instCache_ptr = p->icache; + m_dataCache_ptr = p->dcache; + m_max_outstanding_requests = p->max_outstanding_requests; + m_deadlock_threshold = p->deadlock_threshold; + + assert(m_max_outstanding_requests > 0); + assert(m_deadlock_threshold > 0); + assert(m_instCache_ptr); + assert(m_dataCache_ptr); + + m_data_cache_hit_latency = p->dcache_hit_latency; + + m_usingNetworkTester = p->using_network_tester; + assumingRfOCoherence = p->assume_rfo; +} + +GPUCoalescer::~GPUCoalescer() +{ +} + +void +GPUCoalescer::wakeup() +{ + // Check for deadlock of any of the requests + Cycles current_time = curCycle(); + + // Check across all outstanding requests + int total_outstanding = 0; + + RequestTable::iterator read = m_readRequestTable.begin(); + RequestTable::iterator read_end = m_readRequestTable.end(); + for (; read != read_end; ++read) { + GPUCoalescerRequest* request = read->second; + if (current_time - 
request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_readRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_readRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time)*clockPeriod()); + } + + RequestTable::iterator write = m_writeRequestTable.begin(); + RequestTable::iterator write_end = m_writeRequestTable.end(); + for (; write != write_end; ++write) { + GPUCoalescerRequest* request = write->second; + if (current_time - request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_writeRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_writeRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time) * clockPeriod()); + } + + total_outstanding += m_writeRequestTable.size(); + total_outstanding += m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + if (m_outstanding_count > 0) { + // If there are still outstanding requests, keep checking + schedule(deadlockCheckEvent, + m_deadlock_threshold * clockPeriod() + + curTick()); + } +} + +void +GPUCoalescer::resetStats() +{ + m_latencyHist.reset(); + m_missLatencyHist.reset(); + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist[i]->reset(); + m_missTypeLatencyHist[i]->reset(); + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i][j]->reset(); + } + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist[i]->reset(); + + m_IssueToInitialDelayHist[i]->reset(); + m_InitialToForwardDelayHist[i]->reset(); + m_ForwardToFirstResponseDelayHist[i]->reset(); + m_FirstResponseToCompletionDelayHist[i]->reset(); + } +} + +void +GPUCoalescer::printProgress(ostream& out) const +{ +} + +RequestStatus +GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type) +{ + Addr line_addr = makeLineAddress(pkt->getAddr()); + + if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) { + return RequestStatus_BufferFull; + } + + if(m_controller->isBlocked(line_addr) && + request_type != RubyRequestType_Locked_RMW_Write) { + return RequestStatus_Aliased; + } + + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + // Check if there is any outstanding read request for the same + // cache line. + if (m_readRequestTable.count(line_addr) > 0) { + m_store_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + + if (m_writeRequestTable.count(line_addr) > 0) { + // There is an outstanding write request for the cache line + m_store_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + } else { + // Check if there is any outstanding write request for the same + // cache line. 
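The table lookups in getRequestStatus() implement one aliasing rule: at most one outstanding request per cache line, tracked in two tables keyed by line address, with any conflicting access bounced back as Aliased so it is reissued later (the four waiting counters only record which kind of conflict occurred). A condensed sketch of that rule, using hypothetical names rather than the class's actual members:

#include <cstdint>
#include <unordered_map>

using Addr = uint64_t;
enum class Status { Ready, Aliased };
struct Req { /* packet, request type, issue cycle, ... */ };

struct AliasTables {
    std::unordered_map<Addr, Req*> readTable;   // lines with a read in flight
    std::unordered_map<Addr, Req*> writeTable;  // lines with a write/atomic in flight

    // Any in-flight request to the same line makes a new request aliased,
    // whatever the type of either request.
    Status check(Addr lineAddr) const {
        if (readTable.count(lineAddr) || writeTable.count(lineAddr))
            return Status::Aliased;   // caller must retry in a later cycle
        return Status::Ready;
    }
};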
+ if (m_writeRequestTable.count(line_addr) > 0) { + m_load_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + + if (m_readRequestTable.count(line_addr) > 0) { + // There is an outstanding read request for the cache line + m_load_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + } + + return RequestStatus_Ready; + +} + + + +// sets the kernelEndList +void +GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) +{ + // Don't know if this will happen or is possible + // but I just want to be careful and not have it become + // simulator hang in the future + DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id); + assert(kernelEndList.count(wavefront_id) == 0); + + kernelEndList[wavefront_id] = pkt; + DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n", + kernelEndList.size()); +} + + +// Insert the request on the correct request table. Return true if +// the entry was already present. +bool +GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type) +{ + assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready || + pkt->req->isLockedRMW() || + !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())); + + int total_outstanding M5_VAR_USED = + m_writeRequestTable.size() + m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + // See if we should schedule a deadlock check + if (deadlockCheckEvent.scheduled() == false) { + schedule(deadlockCheckEvent, m_deadlock_threshold + curTick()); + } + + Addr line_addr = makeLineAddress(pkt->getAddr()); + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + pair<RequestTable::iterator, bool> r = + m_writeRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting write request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } else { + pair<RequestTable::iterator, bool> r = + m_readRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting read request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } + + m_outstandReqHist.sample(m_outstanding_count); + + total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size(); + assert(m_outstanding_count == total_outstanding); + + return false; +} + +void +GPUCoalescer::markRemoved() +{ + m_outstanding_count--; + assert(m_outstanding_count == + m_writeRequestTable.size() + m_readRequestTable.size()); +} + +void +GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest) +{ + assert(m_outstanding_count == + m_writeRequestTable.size() + 
m_readRequestTable.size()); + + Addr line_addr = makeLineAddress(srequest->pkt->getAddr()); + if ((srequest->m_type == RubyRequestType_ST) || + (srequest->m_type == RubyRequestType_RMW_Read) || + (srequest->m_type == RubyRequestType_RMW_Write) || + (srequest->m_type == RubyRequestType_Load_Linked) || + (srequest->m_type == RubyRequestType_Store_Conditional) || + (srequest->m_type == RubyRequestType_Locked_RMW_Read) || + (srequest->m_type == RubyRequestType_Locked_RMW_Write)) { + m_writeRequestTable.erase(line_addr); + } else { + m_readRequestTable.erase(line_addr); + } + + markRemoved(); +} + +bool +GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request) +{ + // + // The success flag indicates whether the LLSC operation was successful. + // LL ops will always succeed, but SC may fail if the cache line is no + // longer locked. + // + bool success = true; + if (request->m_type == RubyRequestType_Store_Conditional) { + if (!m_dataCache_ptr->isLocked(address, m_version)) { + // + // For failed SC requests, indicate the failure to the cpu by + // setting the extra data to zero. + // + request->pkt->req->setExtraData(0); + success = false; + } else { + // + // For successful SC requests, indicate the success to the cpu by + // setting the extra data to one. + // + request->pkt->req->setExtraData(1); + } + // + // Independent of success, all SC operations must clear the lock + // + m_dataCache_ptr->clearLocked(address); + } else if (request->m_type == RubyRequestType_Load_Linked) { + // + // Note: To fully follow Alpha LLSC semantics, should the LL clear any + // previously locked cache lines? + // + m_dataCache_ptr->setLocked(address, m_version); + } else if ((m_dataCache_ptr->isTagPresent(address)) && + (m_dataCache_ptr->isLocked(address, m_version))) { + // + // Normal writes should clear the locked address + // + m_dataCache_ptr->clearLocked(address); + } + return success; +} + +void +GPUCoalescer::writeCallback(Addr address, DataBlock& data) +{ + writeCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + writeCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "write callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_ST) || + (request->m_type == RubyRequestType_ATOMIC) || + (request->m_type == RubyRequestType_ATOMIC_RETURN) || + (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request->m_type == RubyRequestType_RMW_Read) || + (request->m_type == RubyRequestType_RMW_Write) || + (request->m_type == RubyRequestType_Load_Linked) || + (request->m_type == RubyRequestType_Store_Conditional) || + (request->m_type == 
RubyRequestType_Locked_RMW_Read) || + (request->m_type == RubyRequestType_Locked_RMW_Write) || + (request->m_type == RubyRequestType_FLUSH)); + + + // + // For Alpha, properly handle LL, SC, and write requests with respect to + // locked cache blocks. + // + // Not valid for Network_test protocl + // + bool success = true; + if(!m_usingNetworkTester) + success = handleLlsc(address, request); + + if (request->m_type == RubyRequestType_Locked_RMW_Read) { + m_controller->blockOnQueue(address, m_mandatory_q_ptr); + } else if (request->m_type == RubyRequestType_Locked_RMW_Write) { + m_controller->unblock(address); + } + + hitCallback(request, mach, data, success, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::readCallback(Addr address, DataBlock& data) +{ + readCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + + readCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + assert(m_readRequestTable.count(makeLineAddress(address))); + + DPRINTF(GPUCoalescer, "read callback for address %#x\n", address); + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_readRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_LD) || + (request->m_type == RubyRequestType_IFETCH)); + + hitCallback(request, mach, data, true, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(request_address); + + RubyRequestType type = srequest->m_type; + + // Set this cache entry to the most recently used + if (type == RubyRequestType_IFETCH) { + if (m_instCache_ptr->isTagPresent(request_line_address)) + m_instCache_ptr->setMRU(request_line_address); + } else { + if (m_dataCache_ptr->isTagPresent(request_line_address)) + m_dataCache_ptr->setMRU(request_line_address); + } + + recordMissLatency(srequest, mach, + initialRequestTime, + forwardRequestTime, + firstResponseTime, + success, isRegion); + // update the data + // + // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = pkt->getAddr(); + request_line_address = makeLineAddress(pkt->getAddr()); + if (pkt->getPtr<uint8_t>()) { + if ((type == RubyRequestType_LD) || + (type == RubyRequestType_ATOMIC) || + (type == 
RubyRequestType_ATOMIC_RETURN) || + (type == RubyRequestType_IFETCH) || + (type == RubyRequestType_RMW_Read) || + (type == RubyRequestType_Locked_RMW_Read) || + (type == RubyRequestType_Load_Linked)) { + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + data.setData(pkt->getPtr<uint8_t>(), + getOffset(request_address), pkt->getSize()); + } + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. + if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + + + completeHitCallback(mylist, len); +} + +bool +GPUCoalescer::empty() const +{ + return m_writeRequestTable.empty() && m_readRequestTable.empty(); +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +GPUCoalescer::makeRequest(PacketPtr pkt) +{ + // Check for GPU Barrier Kernel End or Kernel Begin + // Leave these to be handled by the child class + // Kernel End/Barrier = isFlush + isRelease + // Kernel Begin = isFlush + isAcquire + if (pkt->req->isKernel()) { + if (pkt->req->isAcquire()){ + // This is a Kernel Begin leave handling to + // virtual xCoalescer::makeRequest + return RequestStatus_Issued; + }else if(pkt->req->isRelease()) { + // This is a Kernel End leave handling to + // virtual xCoalescer::makeRequest + // If we are here then we didn't call + // a virtual version of this function + // so we will also schedule the callback + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } + } + + // If number of outstanding requests greater than the max allowed, + // return RequestStatus_BufferFull. This logic can be extended to + // support proper backpressure. + if (m_outstanding_count >= m_max_outstanding_requests) { + return RequestStatus_BufferFull; + } + + RubyRequestType primary_type = RubyRequestType_NULL; + RubyRequestType secondary_type = RubyRequestType_NULL; + + if (pkt->isLLSC()) { + // + // Alpha LL/SC instructions need to be handled carefully by the cache + // coherence protocol to ensure they follow the proper semantics. 
In + // particular, by identifying the operations as atomic, the protocol + // should understand that migratory sharing optimizations should not + // be performed (i.e. a load between the LL and SC should not steal + // away exclusive permission). + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Store_Conditional; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Load_Linked; + } + secondary_type = RubyRequestType_ATOMIC; + } else if (pkt->req->isLockedRMW()) { + // + // x86 locked instructions are translated to store cache coherence + // requests because these requests should always be treated as read + // exclusive operations and should leverage any migratory sharing + // optimization built into the protocol. + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Locked_RMW_Write; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Locked_RMW_Read; + } + secondary_type = RubyRequestType_ST; + } else if (pkt->isAtomicOp()) { + // + // GPU Atomic Operation + // + primary_type = RubyRequestType_ATOMIC; + secondary_type = RubyRequestType_ATOMIC; + } else { + if (pkt->isRead()) { + if (pkt->req->isInstFetch()) { + primary_type = secondary_type = RubyRequestType_IFETCH; + } else { +#if THE_ISA == X86_ISA + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & + (TheISA::StoreCheck << TheISA::FlagShift); +#else + bool storeCheck = false; +#endif // X86_ISA + if (storeCheck) { + primary_type = RubyRequestType_RMW_Read; + secondary_type = RubyRequestType_ST; + } else { + primary_type = secondary_type = RubyRequestType_LD; + } + } + } else if (pkt->isWrite()) { + // + // Note: M5 packets do not differentiate ST from RMW_Write + // + primary_type = secondary_type = RubyRequestType_ST; + } else if (pkt->isFlush()) { + primary_type = secondary_type = RubyRequestType_FLUSH; + } else if (pkt->req->isRelease() || pkt->req->isAcquire()) { + if (assumingRfOCoherence) { + // If we reached here, this request must be a memFence + // and the protocol implements RfO, the coalescer can + // assume sequentially consistency and schedule the callback + // immediately. + // Currently the code implements fence callbacks + // by reusing the mechanism for kernel completions. + // This should be fixed. + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } else { + // If not RfO, return issued here and let the child coalescer + // take care of it. + return RequestStatus_Issued; + } + } else { + panic("Unsupported ruby packet type\n"); + } + } + + // Check if there is any pending request to this cache line from + // previous cycles. + // If there is a pending request, return aliased. Since coalescing + // across time is not permitted, aliased requests are not coalesced. + // If a request for this address has already been issued, we must block + RequestStatus status = getRequestStatus(pkt, primary_type); + if (status != RequestStatus_Ready) + return status; + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Check if this request can be coalesced with previous + // requests from this cycle. + if (!reqCoalescer.count(line_addr)) { + // This is the first access to this cache line. 
+ // A new request to the memory subsystem has to be + // made in the next cycle for this cache line, so + // add this line addr to the "newRequests" queue + newRequests.push_back(line_addr); + + // There was a request to this cache line in this cycle, + // let us see if we can coalesce this request with the previous + // requests from this cycle + } else if (primary_type != + reqCoalescer[line_addr][0].second[PrimaryType]) { + // can't coalesce loads, stores and atomics! + return RequestStatus_Aliased; + } else if (pkt->req->isLockedRMW() || + reqCoalescer[line_addr][0].first->req->isLockedRMW()) { + // can't coalesce locked accesses, but can coalesce atomics! + return RequestStatus_Aliased; + } else if (pkt->req->hasContextId() && pkt->req->isRelease() && + pkt->req->contextId() != + reqCoalescer[line_addr][0].first->req->contextId()) { + // can't coalesce releases from different wavefronts + return RequestStatus_Aliased; + } + + // in addition to the packet, we need to save both request types + reqCoalescer[line_addr].push_back( + RequestDesc(pkt, std::vector<RubyRequestType>()) ); + reqCoalescer[line_addr].back().second.push_back(primary_type); + reqCoalescer[line_addr].back().second.push_back(secondary_type); + if (!issueEvent.scheduled()) + schedule(issueEvent, curTick()); + // TODO: issue hardware prefetches here + return RequestStatus_Issued; +} + +void +GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) +{ + + int proc_id = -1; + if (pkt != NULL && pkt->req->hasContextId()) { + proc_id = pkt->req->contextId(); + } + + // If valid, copy the pc to the ruby request + Addr pc = 0; + if (pkt->req->hasPC()) { + pc = pkt->req->getPC(); + } + + // At the moment setting scopes only counts + // for GPU spill space accesses + // which is pkt->req->isStack() + // this scope is REPLACE since it + // does not need to be flushed at the end + // of a kernel Private and local may need + // to be visible at the end of the kernel + HSASegment accessSegment = reqSegmentToHSASegment(pkt->req); + HSAScope accessScope = reqScopeToHSAScope(pkt->req); + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Creating WriteMask that records written bytes + // and atomic operations. 
This enables partial writes + // and partial reads of those writes + DataBlock dataBlock; + dataBlock.clear(); + uint32_t blockSize = RubySystem::getBlockSizeBytes(); + std::vector<bool> accessMask(blockSize,false); + std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps; + uint32_t tableSize = reqCoalescer[line_addr].size(); + for (int i = 0; i < tableSize; i++) { + PacketPtr tmpPkt = reqCoalescer[line_addr][i].first; + uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; + uint32_t tmpSize = tmpPkt->getSize(); + if (tmpPkt->isAtomicOp()) { + std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset, + tmpPkt->getAtomicOp()); + atomicOps.push_back(tmpAtomicOp); + } else if(tmpPkt->isWrite()) { + dataBlock.setData(tmpPkt->getPtr<uint8_t>(), + tmpOffset, tmpSize); + } + for (int j = 0; j < tmpSize; j++) { + accessMask[tmpOffset + j] = true; + } + } + std::shared_ptr<RubyRequest> msg; + if (pkt->isAtomicOp()) { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, atomicOps, + accessScope, accessSegment); + } else { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, + accessScope, accessSegment); + } + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", + curTick(), m_version, "Coal", "Begin", "", "", + printAddress(msg->getPhysicalAddress()), + RubyRequestType_to_string(secondary_type)); + + fatal_if(secondary_type == RubyRequestType_IFETCH, + "there should not be any I-Fetch requests in the GPU Coalescer"); + + // Send the message to the cache controller + fatal_if(m_data_cache_hit_latency == 0, + "should not have a latency of zero"); + + assert(m_mandatory_q_ptr); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); +} + +template <class KEY, class VALUE> +std::ostream & +operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map) +{ + out << "["; + for (auto i = map.begin(); i != map.end(); ++i) + out << " " << i->first << "=" << i->second; + out << " ]"; + + return out; +} + +void +GPUCoalescer::print(ostream& out) const +{ + out << "[GPUCoalescer: " << m_version + << ", outstanding requests: " << m_outstanding_count + << ", read request table: " << m_readRequestTable + << ", write request table: " << m_writeRequestTable + << "]"; +} + +// this can be called from setState whenever coherence permissions are +// upgraded when invoked, coherence violations will be checked for the +// given block +void +GPUCoalescer::checkCoherence(Addr addr) +{ +#ifdef CHECK_COHERENCE + m_ruby_system->checkGlobalCoherenceInvariant(addr); +#endif +} + +void +GPUCoalescer::recordRequestType(SequencerRequestType requestType) { + DPRINTF(RubyStats, "Recorded statistic: %s\n", + SequencerRequestType_to_string(requestType)); +} + +GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq) + : Event(Progress_Event_Pri), seq(_seq) +{ +} + + +void +GPUCoalescer::completeIssue() +{ + // newRequests has the cacheline addresses of all the + // requests which need to be issued to the memory subsystem + // in this cycle + int len = newRequests.size(); + DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len); + for (int i = 0; i < len; ++i) { + // Get the requests from reqCoalescer table. 
Get only the + // first request for each cacheline, the remaining requests + // can be coalesced with the first request. So, only + // one request is issued per cacheline. + RequestDesc info = reqCoalescer[newRequests[i]][0]; + PacketPtr pkt = info.first; + DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n", + i, pkt->req->getPaddr()); + // Insert this request to the read/writeRequestTables. These tables + // are used to track aliased requests in makeRequest subroutine + bool found = insertRequest(pkt, info.second[PrimaryType]); + + if (found) { + panic("GPUCoalescer::makeRequest should never be called if the " + "request is already outstanding\n"); + } + + // Issue request to ruby subsystem + issueRequest(pkt, info.second[SecondaryType]); + } + newRequests.clear(); + + // have Kernel End releases been issued this cycle + len = newKernelEnds.size(); + for (int i = 0; i < len; i++) { + kernelCallback(newKernelEnds[i]); + } + newKernelEnds.clear(); +} + +void +GPUCoalescer::IssueEvent::process() +{ + seq->completeIssue(); +} + +const char * +GPUCoalescer::IssueEvent::description() const +{ + return "Issue coalesced request"; +} + +void +GPUCoalescer::evictionCallback(Addr address) +{ + ruby_eviction_callback(address); +} + +void +GPUCoalescer::kernelCallback(int wavefront_id) +{ + assert(kernelEndList.count(wavefront_id)); + + ruby_hit_callback(kernelEndList[wavefront_id]); + + kernelEndList.erase(wavefront_id); +} + +void +GPUCoalescer::atomicCallback(Addr address, + MachineType mach, + const DataBlock& data) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* srequest = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((srequest->m_type == RubyRequestType_ATOMIC) || + (srequest->m_type == RubyRequestType_ATOMIC_RETURN) || + (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN)); + + + // Atomics don't write to cache, so there is no MRU update... + + recordMissLatency(srequest, mach, + srequest->issue_time, Cycles(0), Cycles(0), true, false); + + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(pkt->getAddr()); + + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(srequest->m_type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = (pkt->getAddr()); + request_line_address = makeLineAddress(request_address); + if (pkt->getPtr<uint8_t>() && + srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) { + /* atomics are done in memory, and return the data *before* the atomic op... */ + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(srequest->m_type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. 
+ if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + completeHitCallback(mylist, len); +} + +void +GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPLdHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPLdTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCLdHits++; + } else { + CP_LdMiss++; + } +} + +void +GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPStHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPStTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCStHits++; + } else { + CP_StMiss++; + } +} + +void +GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len) +{ + for (int i = 0; i < len; ++i) { + RubyPort::SenderState *ss = + safe_cast<RubyPort::SenderState *>(mylist[i]->senderState); + MemSlavePort *port = ss->port; + assert(port != NULL); + + mylist[i]->senderState = ss->predecessor; + delete ss; + port->hitCallback(mylist[i]); + trySendRetries(); + } + + testDrainComplete(); +} + +PacketPtr +GPUCoalescer::mapAddrToPkt(Addr address) +{ + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + return request->pkt; +} + +void +GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion) +{ + RubyRequestType type = srequest->m_type; + Cycles issued_time = srequest->issue_time; + Cycles completion_time = curCycle(); + assert(completion_time >= issued_time); + Cycles total_lat = completion_time - issued_time; + + // cache stats (valid for RfO protocol only) + if (mach == MachineType_TCP) { + if (type == RubyRequestType_LD) { + GPU_TCPLdHits++; + } else { + GPU_TCPStHits++; + } + } else if (mach == MachineType_L1Cache_wCC) { + if (type == RubyRequestType_LD) { + GPU_TCPLdTransfers++; + } else { + GPU_TCPStTransfers++; + } + } else if (mach == MachineType_TCC) { + if (type == RubyRequestType_LD) { + GPU_TCCLdHits++; + } else { + GPU_TCCStHits++; + } + } else { + if (type == RubyRequestType_LD) { + GPU_LdMiss++; + } else { + GPU_StMiss++; + } + } + + // Profile all access latency, even zero latency accesses + m_latencyHist.sample(total_lat); + m_typeLatencyHist[type]->sample(total_lat); + + // Profile the miss latency for all non-zero demand misses + if (total_lat != Cycles(0)) { + m_missLatencyHist.sample(total_lat); + m_missTypeLatencyHist[type]->sample(total_lat); + + if (mach != MachineType_NUM) { + m_missMachLatencyHist[mach]->sample(total_lat); + m_missTypeMachLatencyHist[type][mach]->sample(total_lat); + + if ((issued_time <= initialRequestTime) && + (initialRequestTime <= forwardRequestTime) && + (forwardRequestTime <= firstResponseTime) && + (firstResponseTime <= completion_time)) { + + m_IssueToInitialDelayHist[mach]->sample( + initialRequestTime - 
issued_time); + m_InitialToForwardDelayHist[mach]->sample( + forwardRequestTime - initialRequestTime); + m_ForwardToFirstResponseDelayHist[mach]->sample( + firstResponseTime - forwardRequestTime); + m_FirstResponseToCompletionDelayHist[mach]->sample( + completion_time - firstResponseTime); + } + } + + } + + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", + curTick(), m_version, "Coal", + success ? "Done" : "SC_Failed", "", "", + printAddress(srequest->pkt->getAddr()), total_lat); +} + +void +GPUCoalescer::regStats() +{ + // These statistical variables are not for display. + // The profiler will collate these across different + // coalescers and display those collated statistics. + m_outstandReqHist.init(10); + m_latencyHist.init(10); + m_missLatencyHist.init(10); + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist.push_back(new Stats::Histogram()); + m_typeLatencyHist[i]->init(10); + + m_missTypeLatencyHist.push_back(new Stats::Histogram()); + m_missTypeLatencyHist[i]->init(10); + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist.push_back(new Stats::Histogram()); + m_missMachLatencyHist[i]->init(10); + + m_IssueToInitialDelayHist.push_back(new Stats::Histogram()); + m_IssueToInitialDelayHist[i]->init(10); + + m_InitialToForwardDelayHist.push_back(new Stats::Histogram()); + m_InitialToForwardDelayHist[i]->init(10); + + m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram()); + m_ForwardToFirstResponseDelayHist[i]->init(10); + + m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram()); + m_FirstResponseToCompletionDelayHist[i]->init(10); + } + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>()); + + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram()); + m_missTypeMachLatencyHist[i][j]->init(10); + } + } + + // GPU cache stats + GPU_TCPLdHits + .name(name() + ".gpu_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + GPU_TCPLdTransfers + .name(name() + ".gpu_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + GPU_TCCLdHits + .name(name() + ".gpu_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + GPU_LdMiss + .name(name() + ".gpu_ld_misses") + .desc("loads that miss in the GPU") + ; + + GPU_TCPStHits + .name(name() + ".gpu_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + GPU_TCPStTransfers + .name(name() + ".gpu_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + GPU_TCCStHits + .name(name() + ".gpu_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + GPU_StMiss + .name(name() + ".gpu_st_misses") + .desc("stores that miss in the GPU") + ; + + // CP cache stats + CP_TCPLdHits + .name(name() + ".cp_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + CP_TCPLdTransfers + .name(name() + ".cp_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + CP_TCCLdHits + .name(name() + ".cp_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + CP_LdMiss + .name(name() + ".cp_ld_misses") + .desc("loads that miss in the GPU") + ; + + CP_TCPStHits + .name(name() + ".cp_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + CP_TCPStTransfers + .name(name() + ".cp_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + CP_TCCStHits + .name(name() + ".cp_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + CP_StMiss + .name(name() + ".cp_st_misses") + .desc("stores that miss in the GPU") + ; +} diff --git 
a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh new file mode 100644 index 000000000..dbd47059c --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + +#include <iostream> +#include <unordered_map> + +#include "base/statistics.hh" +#include "mem/protocol/HSAScope.hh" +#include "mem/protocol/HSASegment.hh" +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/protocol/SequencerRequestType.hh" +#include "mem/request.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class RubyGPUCoalescerParams; + +HSAScope reqScopeToHSAScope(Request* req); +HSASegment reqSegmentToHSASegment(Request* req); + +struct GPUCoalescerRequest +{ + PacketPtr pkt; + RubyRequestType m_type; + Cycles issue_time; + + GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type, + Cycles _issue_time) + : pkt(_pkt), m_type(_m_type), issue_time(_issue_time) + {} +}; + +std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj); + +class GPUCoalescer : public RubyPort +{ + public: + typedef RubyGPUCoalescerParams Params; + GPUCoalescer(const Params *); + ~GPUCoalescer(); + + // Public Methods + void wakeup(); // Used only for deadlock detection + + void printProgress(std::ostream& out) const; + void resetStats(); + void collateStats(); + void regStats(); + + void writeCallback(Addr address, DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + /* atomics need their own callback because the data + might be const coming from SLICC */ + void atomicCallback(Addr address, + MachineType mach, + const DataBlock& data); + + void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID); + void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID); + + // Alternate implementations in VIPER Coalescer + virtual RequestStatus makeRequest(PacketPtr pkt); + + int outstandingCount() const { return m_outstanding_count; } + + bool + isDeadlockEventScheduled() const + { + return deadlockCheckEvent.scheduled(); + } + + void + descheduleDeadlockEvent() + { + deschedule(deadlockCheckEvent); + } + + bool empty() const; + + void print(std::ostream& out) const; + void checkCoherence(Addr address); + + void markRemoved(); + void removeRequest(GPUCoalescerRequest* request); + void evictionCallback(Addr address); + void completeIssue(); + + void insertKernel(int wavefront_id, PacketPtr pkt); + + void recordRequestType(SequencerRequestType requestType); + Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } + + Stats::Histogram& getLatencyHist() 
{ return m_latencyHist; } + Stats::Histogram& getTypeLatencyHist(uint32_t t) + { return *m_typeLatencyHist[t]; } + + Stats::Histogram& getMissLatencyHist() + { return m_missLatencyHist; } + Stats::Histogram& getMissTypeLatencyHist(uint32_t t) + { return *m_missTypeLatencyHist[t]; } + + Stats::Histogram& getMissMachLatencyHist(uint32_t t) const + { return *m_missMachLatencyHist[t]; } + + Stats::Histogram& + getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const + { return *m_missTypeMachLatencyHist[r][t]; } + + Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const + { return *m_IssueToInitialDelayHist[t]; } + + Stats::Histogram& + getInitialToForwardDelayHist(const MachineType t) const + { return *m_InitialToForwardDelayHist[t]; } + + Stats::Histogram& + getForwardRequestToFirstResponseHist(const MachineType t) const + { return *m_ForwardToFirstResponseDelayHist[t]; } + + Stats::Histogram& + getFirstResponseToCompletionDelayHist(const MachineType t) const + { return *m_FirstResponseToCompletionDelayHist[t]; } + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + bool tryCacheAccess(Addr addr, RubyRequestType type, + Addr pc, RubyAccessMode access_mode, + int size, DataBlock*& data_ptr); + // Alternate implementations in VIPER Coalescer + virtual void issueRequest(PacketPtr pkt, RubyRequestType type); + + void kernelCallback(int wavfront_id); + + void hitCallback(GPUCoalescerRequest* request, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + void recordMissLatency(GPUCoalescerRequest* request, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion); + void completeHitCallback(std::vector<PacketPtr> & mylist, int len); + PacketPtr mapAddrToPkt(Addr address); + + + RequestStatus getRequestStatus(PacketPtr pkt, + RubyRequestType request_type); + bool insertRequest(PacketPtr pkt, RubyRequestType request_type); + + bool handleLlsc(Addr address, GPUCoalescerRequest* request); + + // Private copy constructor and assignment operator + GPUCoalescer(const GPUCoalescer& obj); + GPUCoalescer& operator=(const GPUCoalescer& obj); + + class IssueEvent : public Event + { + private: + GPUCoalescer *seq; + public: + IssueEvent(GPUCoalescer *_seq); + void process(); + const char *description() const; + }; + + IssueEvent issueEvent; + + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + int m_max_outstanding_requests; + int m_deadlock_threshold; + + CacheMemory* m_dataCache_ptr; + CacheMemory* m_instCache_ptr; + + // The cache access latency for this GPU data cache. This is assessed at the + // beginning of each access. This should be very similar to the + // implementation in Sequencer() as this is very much like a Sequencer + Cycles m_data_cache_hit_latency; + + // We need to track both the primary and secondary request types. + // The secondary request type comprises a subset of RubyRequestTypes that + // are understood by the L1 Controller. A primary request type can be any + // RubyRequestType. 
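    // Illustrative sketch (not part of the patch): the comment above is easiest
    // to see with a concrete entry in the coalescing table declared just below.
    // The request-type values here are assumptions chosen for the example; only
    // RequestDesc, reqCoalescer and the PrimaryType/SecondaryType indices come
    // from this change:
    //
    //     std::vector<RubyRequestType> types(2);
    //     types[PrimaryType]   = RubyRequestType_LD;  // full type of the access
    //     types[SecondaryType] = RubyRequestType_LD;  // type the L1 controller sees
    //     reqCoalescer[line_addr].push_back(RequestDesc(pkt, types));
    //
    // completeIssue() later walks each line's vector and issues only entry [0]
    // to Ruby; the remaining RequestDescs for that line are completed when the
    // callback for the issued request returns.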
+ enum {PrimaryType, SecondaryType}; + typedef std::pair<PacketPtr, std::vector<RubyRequestType> > RequestDesc; + typedef std::unordered_map<Addr, std::vector<RequestDesc> > CoalescingTable; + CoalescingTable reqCoalescer; + std::vector<Addr> newRequests; + + typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable; + RequestTable m_writeRequestTable; + RequestTable m_readRequestTable; + // Global outstanding request count, across all request tables + int m_outstanding_count; + bool m_deadlock_check_scheduled; + std::unordered_map<int, PacketPtr> kernelEndList; + std::vector<int> newKernelEnds; + + int m_store_waiting_on_load_cycles; + int m_store_waiting_on_store_cycles; + int m_load_waiting_on_store_cycles; + int m_load_waiting_on_load_cycles; + + bool m_usingNetworkTester; + + class GPUCoalescerWakeupEvent : public Event + { + private: + GPUCoalescer *m_GPUCoalescer_ptr; + + public: + GPUCoalescerWakeupEvent(GPUCoalescer *_seq) : + m_GPUCoalescer_ptr(_seq) {} + void process() { m_GPUCoalescer_ptr->wakeup(); } + const char *description() const + { + return "GPUCoalescer deadlock check"; + } + }; + + GPUCoalescerWakeupEvent deadlockCheckEvent; + bool assumingRfOCoherence; + + // m5 style stats for TCP hit/miss counts + Stats::Scalar GPU_TCPLdHits; + Stats::Scalar GPU_TCPLdTransfers; + Stats::Scalar GPU_TCCLdHits; + Stats::Scalar GPU_LdMiss; + + Stats::Scalar GPU_TCPStHits; + Stats::Scalar GPU_TCPStTransfers; + Stats::Scalar GPU_TCCStHits; + Stats::Scalar GPU_StMiss; + + Stats::Scalar CP_TCPLdHits; + Stats::Scalar CP_TCPLdTransfers; + Stats::Scalar CP_TCCLdHits; + Stats::Scalar CP_LdMiss; + + Stats::Scalar CP_TCPStHits; + Stats::Scalar CP_TCPStTransfers; + Stats::Scalar CP_TCCStHits; + Stats::Scalar CP_StMiss; + + //! Histogram for number of outstanding requests per cycle. + Stats::Histogram m_outstandReqHist; + + //! Histogram for holding latency profile of all requests. + Stats::Histogram m_latencyHist; + std::vector<Stats::Histogram *> m_typeLatencyHist; + + //! Histogram for holding latency profile of all requests that + //! miss in the controller connected to this sequencer. + Stats::Histogram m_missLatencyHist; + std::vector<Stats::Histogram *> m_missTypeLatencyHist; + + //! Histograms for profiling the latencies for requests that + //! required external messages. + std::vector<Stats::Histogram *> m_missMachLatencyHist; + std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist; + + //! Histograms for recording the breakdown of miss latency + std::vector<Stats::Histogram *> m_IssueToInitialDelayHist; + std::vector<Stats::Histogram *> m_InitialToForwardDelayHist; + std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist; + std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist; +}; + +inline std::ostream& +operator<<(std::ostream& out, const GPUCoalescer& obj) +{ + obj.print(out); + out << std::flush; + return out; +} + +#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py new file mode 100644 index 000000000..0c19f875d --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from Sequencer import * + +class RubyGPUCoalescer(RubySequencer): + type = 'RubyGPUCoalescer' + cxx_class = 'GPUCoalescer' + cxx_header = "mem/ruby/system/GPUCoalescer.hh" + + # max_outstanding_requests = (wave front slots) x (wave front size) + max_outstanding_requests = Param.Int(40*64, + "max requests (incl. prefetches) outstanding") + assume_rfo = Param.Bool(True, "assume protocol implementes Read for " + "Ownership coherence"); diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 5a5f528bb..bf4002126 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -60,7 +60,8 @@ RubyPort::RubyPort(const Params *p) memSlavePort(csprintf("%s-mem-slave-port", name()), this, p->ruby_system->getAccessBackingStore(), -1, p->no_retry_on_stall), - gotAddrRanges(p->port_master_connection_count) + gotAddrRanges(p->port_master_connection_count), + m_isCPUSequencer(p->is_cpu_sequencer) { assert(m_version != -1); diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh index 07e0fde5a..6bd92b654 100644 --- a/src/mem/ruby/system/RubyPort.hh +++ b/src/mem/ruby/system/RubyPort.hh @@ -167,6 +167,8 @@ class RubyPort : public MemObject uint32_t getId() { return m_version; } DrainState drain() override; + bool isCPUSequencer() { return m_isCPUSequencer; } + protected: void trySendRetries(); void ruby_hit_callback(PacketPtr pkt); @@ -218,6 +220,8 @@ class RubyPort : public MemObject // that should be called when the Sequencer becomes available after a stall. 
// std::vector<MemSlavePort *> retryList; + + bool m_isCPUSequencer; }; #endif // __MEM_RUBY_SYSTEM_RUBYPORT_HH__ diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc index 1ecd2e098..e1717e519 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -107,7 +107,7 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, Sequencer* sequencer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { - sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer()); + sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } diff --git a/src/mem/ruby/system/SConscript b/src/mem/ruby/system/SConscript index 8c5077362..b67311bca 100644 --- a/src/mem/ruby/system/SConscript +++ b/src/mem/ruby/system/SConscript @@ -33,12 +33,22 @@ Import('*') if env['PROTOCOL'] == 'None': Return() +if env['BUILD_GPU']: + SimObject('GPUCoalescer.py') SimObject('RubySystem.py') SimObject('Sequencer.py') +SimObject('WeightedLRUReplacementPolicy.py') +if env['BUILD_GPU']: + SimObject('VIPERCoalescer.py') Source('CacheRecorder.cc') Source('DMASequencer.cc') +if env['BUILD_GPU']: + Source('GPUCoalescer.cc') Source('RubyPort.cc') Source('RubyPortProxy.cc') Source('RubySystem.cc') Source('Sequencer.cc') +if env['BUILD_GPU']: + Source('VIPERCoalescer.cc') +Source('WeightedLRUPolicy.cc') diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 50418c700..c2727b41d 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -63,6 +63,7 @@ Sequencer::Sequencer(const Params *p) m_max_outstanding_requests = p->max_outstanding_requests; m_deadlock_threshold = p->deadlock_threshold; + m_coreId = p->coreid; // for tracking the two CorePair sequencers assert(m_max_outstanding_requests > 0); assert(m_deadlock_threshold > 0); assert(m_instCache_ptr != NULL); @@ -593,6 +594,8 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) ContextID proc_id = pkt->req->hasContextId() ? pkt->req->contextId() : InvalidContextID; + ContextID core_id = coreId(); + // If valid, copy the pc to the ruby request Addr pc = 0; if (pkt->req->hasPC()) { @@ -607,7 +610,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) nullptr : pkt->getPtr<uint8_t>(), pkt->getSize(), pc, secondary_type, RubyAccessMode_Supervisor, pkt, - PrefetchBit_No, proc_id); + PrefetchBit_No, proc_id, core_id); DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n", curTick(), m_version, "Seq", "Begin", "", "", diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 47af7ea1e..2a2f49587 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -99,6 +99,7 @@ class Sequencer : public RubyPort void markRemoved(); void evictionCallback(Addr address); void invalidateSC(Addr address); + int coreId() const { return m_coreId; } void recordRequestType(SequencerRequestType requestType); Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } @@ -198,6 +199,8 @@ class Sequencer : public RubyPort Stats::Scalar m_load_waiting_on_store; Stats::Scalar m_load_waiting_on_load; + int m_coreId; + bool m_usingNetworkTester; //! Histogram for number of outstanding requests per cycle. 
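The is_cpu_sequencer flag added to RubyPort above gives generic Ruby code a way to tell a CPU-side Sequencer apart from a GPUCoalescer now that both hang off the same port class; RubySystem::makeCacheRecorder already switches to the filtered getCPUSequencer() accessor. A minimal sketch of that usage pattern, assuming only the controller vector and accessor shown in this patch (the surrounding collection loop is illustrative, not lifted from the change):

    // Collect only CPU-facing sequencers; a controller whose attached port is a
    // GPU coalescer (isCPUSequencer() == false) or that has no sequencer at all
    // returns NULL from getCPUSequencer().
    std::vector<Sequencer *> cpu_sequencers;
    for (size_t cntrl = 0; cntrl < m_abs_cntrl_vec.size(); ++cntrl) {
        Sequencer *seq = m_abs_cntrl_vec[cntrl]->getCPUSequencer();
        if (seq != NULL)
            cpu_sequencers.push_back(seq);
    }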
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py index 7c90eb29c..d6ee0aa2f 100644 --- a/src/mem/ruby/system/Sequencer.py +++ b/src/mem/ruby/system/Sequencer.py @@ -32,54 +32,58 @@ from m5.proxy import * from MemObject import MemObject class RubyPort(MemObject): - type = 'RubyPort' - abstract = True - cxx_header = "mem/ruby/system/RubyPort.hh" - version = Param.Int(0, "") + type = 'RubyPort' + abstract = True + cxx_header = "mem/ruby/system/RubyPort.hh" + version = Param.Int(0, "") - slave = VectorSlavePort("CPU slave port") - master = VectorMasterPort("CPU master port") - pio_master_port = MasterPort("Ruby mem master port") - mem_master_port = MasterPort("Ruby mem master port") - pio_slave_port = SlavePort("Ruby pio slave port") - mem_slave_port = SlavePort("Ruby memory port") + slave = VectorSlavePort("CPU slave port") + master = VectorMasterPort("CPU master port") + pio_master_port = MasterPort("Ruby mem master port") + mem_master_port = MasterPort("Ruby mem master port") + pio_slave_port = SlavePort("Ruby pio slave port") + mem_slave_port = SlavePort("Ruby memory port") - using_ruby_tester = Param.Bool(False, "") - no_retry_on_stall = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") - support_data_reqs = Param.Bool(True, "data cache requests supported") - support_inst_reqs = Param.Bool(True, "inst cache requests supported") + using_ruby_tester = Param.Bool(False, "") + no_retry_on_stall = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") + support_data_reqs = Param.Bool(True, "data cache requests supported") + support_inst_reqs = Param.Bool(True, "inst cache requests supported") + is_cpu_sequencer = Param.Bool(True, "connected to a cpu") class RubyPortProxy(RubyPort): - type = 'RubyPortProxy' - cxx_header = "mem/ruby/system/RubyPortProxy.hh" + type = 'RubyPortProxy' + cxx_header = "mem/ruby/system/RubyPortProxy.hh" class RubySequencer(RubyPort): - type = 'RubySequencer' - cxx_class = 'Sequencer' - cxx_header = "mem/ruby/system/Sequencer.hh" + type = 'RubySequencer' + cxx_class = 'Sequencer' + cxx_header = "mem/ruby/system/Sequencer.hh" - icache = Param.RubyCache("") - dcache = Param.RubyCache("") - # Cache latencies currently assessed at the beginning of each access - # NOTE: Setting these values to a value greater than one will result in - # O3 CPU pipeline bubbles and negatively impact performance - # TODO: Latencies should be migrated into each top-level cache controller - icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") - dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") - max_outstanding_requests = Param.Int(16, - "max requests (incl. prefetches) outstanding") - deadlock_threshold = Param.Cycles(500000, - "max outstanding cycles for a request before deadlock/livelock declared") - using_network_tester = Param.Bool(False, "") + icache = Param.RubyCache("") + dcache = Param.RubyCache("") + # Cache latencies currently assessed at the beginning of each access + # NOTE: Setting these values to a value greater than one will result in + # O3 CPU pipeline bubbles and negatively impact performance + # TODO: Latencies should be migrated into each top-level cache controller + icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") + dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") + max_outstanding_requests = Param.Int(16, + "max requests (incl. 
prefetches) outstanding") + deadlock_threshold = Param.Cycles(500000, + "max outstanding cycles for a request before deadlock/livelock declared") + using_network_tester = Param.Bool(False, "") + # id used by protocols that support multiple sequencers per controller + # 99 is the dummy default value + coreid = Param.Int(99, "CorePair core id") class DMASequencer(MemObject): - type = 'DMASequencer' - cxx_header = "mem/ruby/system/DMASequencer.hh" + type = 'DMASequencer' + cxx_header = "mem/ruby/system/DMASequencer.hh" - version = Param.Int(0, "") - slave = SlavePort("Device slave port") - using_ruby_tester = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") + version = Param.Int(0, "") + slave = SlavePort("Device slave port") + using_ruby_tester = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc new file mode 100644 index 000000000..ca91f2723 --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/VIPERCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/VIPERCoalescer.hh" + +using namespace std; + +VIPERCoalescer * +VIPERCoalescerParams::create() +{ + return new VIPERCoalescer(this); +} + +VIPERCoalescer::VIPERCoalescer(const Params *p) + : GPUCoalescer(p) +{ + m_max_wb_per_cycle=p->max_wb_per_cycle; + m_max_inv_per_cycle=p->max_inv_per_cycle; + m_outstanding_inv = 0; + m_outstanding_wb = 0; +} + +VIPERCoalescer::~VIPERCoalescer() +{ +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +VIPERCoalescer::makeRequest(PacketPtr pkt) +{ + if (m_outstanding_wb | m_outstanding_inv) { + DPRINTF(GPUCoalescer, + "There are %d Writebacks and %d Invalidatons\n", + m_outstanding_wb, m_outstanding_inv); + } + // Are we in the middle of a release + if ((m_outstanding_wb) > 0) { + if (pkt->req->isKernel()) { + // Everythign is fine + // Barriers and Kernel End scan coalesce + // If it is a Kerenl Begin flush the cache + if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) { + invL1(); + } + + if (pkt->req->isRelease()) { + insertKernel(pkt->req->contextId(), pkt); + } + + return RequestStatus_Issued; + } +// return RequestStatus_Aliased; + } else if (pkt->req->isKernel() && pkt->req->isRelease()) { + // Flush Dirty Data on Kernel End + // isKernel + isRelease + insertKernel(pkt->req->contextId(), pkt); + wbL1(); + if(m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + return RequestStatus_Issued; + } + RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt); + if (requestStatus!=RequestStatus_Issued) { + // Request not isssued + // enqueue Retry + DPRINTF(GPUCoalescer, "Request not issued by GPUCoaleser\n"); + return requestStatus; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + // Invalidate clean Data on Kernel Begin + // isKernel + isAcquire + invL1(); + } else if (pkt->req->isAcquire() && pkt->req->isRelease()) { + // Deschedule the AtomicAcqRel and + // Flush and Invalidate the L1 cache + invwbL1(); + if (m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isRelease()) { + // Deschedule the StoreRel and + // Flush the L1 cache + wbL1(); + if 
(m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isAcquire()) { + // LoadAcq or AtomicAcq + // Invalidate the L1 cache + invL1(); + } + // Request was successful + if (m_outstanding_wb == 0) { + if (!issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n"); + schedule(issueEvent, curTick()); + } + } + return RequestStatus_Issued; +} + +void +VIPERCoalescer::wbCallback(Addr addr) +{ + m_outstanding_wb--; + // if L1 Flush Complete + // attemnpt to schedule issueEvent + assert(((int) m_outstanding_wb) >= 0); + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +void +VIPERCoalescer::invCallback(Addr addr) +{ + m_outstanding_inv--; + // if L1 Flush Complete + // attemnpt to schedule issueEvent + // This probably won't happen, since + // we dont wait on cache invalidations + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +/** + * Invalidate L1 cache (Acquire) + */ +void +VIPERCoalescer::invL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Invalidations outstanding before Cache Walk\n", + m_outstanding_inv); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + DPRINTF(GPUCoalescer, + "There are %d Invalidatons outstanding after Cache Walk\n", + m_outstanding_inv); +} + +/** + * Writeback L1 cache (Release) + */ +void +VIPERCoalescer::wbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding before Cache Walk\n", + m_outstanding_wb); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding after Cache Walk\n", + m_outstanding_wb); +} + +/** + * Invalidate and Writeback L1 cache (Acquire&Release) + */ +void +VIPERCoalescer::invwbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + // Walk the cache + for(int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + // Walk the cache + for(int i = 0; i< size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + 
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } +} diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh new file mode 100644 index 000000000..af6e44e7f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + +#include <iostream> + +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class VIPERCoalescerParams; + +class VIPERCoalescer : public GPUCoalescer +{ + public: + typedef VIPERCoalescerParams Params; + VIPERCoalescer(const Params *); + ~VIPERCoalescer(); + void wbCallback(Addr address); + void invCallback(Addr address); + RequestStatus makeRequest(PacketPtr pkt); + private: + void invL1(); + void wbL1(); + void invwbL1(); + uint64_t m_outstanding_inv; + uint64_t m_outstanding_wb; + uint64_t m_max_inv_per_cycle; + uint64_t m_max_wb_per_cycle; +}; +#endif // __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py new file mode 100644 index 000000000..05c74386f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from GPUCoalescer import * + +class VIPERCoalescer(RubyGPUCoalescer): + type = 'VIPERCoalescer' + cxx_class = 'VIPERCoalescer' + cxx_header = "mem/ruby/system/VIPERCoalescer.hh" + max_inv_per_cycle = Param.Int(32, "max invalidations per cycle") + max_wb_per_cycle = Param.Int(32, "max writebacks per cycle") + assume_rfo = False diff --git a/src/mem/ruby/system/WeightedLRUPolicy.cc b/src/mem/ruby/system/WeightedLRUPolicy.cc new file mode 100644 index 000000000..5baa4d9a5 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.cc @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Derek Hower + */ + +#include "mem/ruby/system/WeightedLRUPolicy.hh" + +WeightedLRUPolicy::WeightedLRUPolicy(const Params* p) + : AbstractReplacementPolicy(p), m_cache(p->cache) +{ + m_last_occ_ptr = new int*[m_num_sets]; + for(unsigned i = 0; i < m_num_sets; i++){ + m_last_occ_ptr[i] = new int[m_assoc]; + for(unsigned j = 0; j < m_assoc; j++){ + m_last_occ_ptr[i][j] = 0; + } + } +} + +WeightedLRUPolicy * +WeightedLRUReplacementPolicyParams::create() +{ + return new WeightedLRUPolicy(this); +} + +WeightedLRUPolicy::~WeightedLRUPolicy() +{ + if (m_last_occ_ptr != NULL){ + for (unsigned i = 0; i < m_num_sets; i++){ + if (m_last_occ_ptr[i] != NULL){ + delete[] m_last_occ_ptr[i]; + } + } + delete[] m_last_occ_ptr; + } +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time, int occupancy) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; + m_last_occ_ptr[set][index] = occupancy; +} + +int64_t +WeightedLRUPolicy::getVictim(int64_t set) const +{ + Tick time, smallest_time; + int64_t smallest_index; + + smallest_index = 0; + smallest_time = m_last_ref_ptr[set][0]; + int smallest_weight = m_last_ref_ptr[set][0]; + + for (unsigned i = 1; i < m_assoc; i++) { + + int weight = m_last_occ_ptr[set][i]; + if (weight < smallest_weight) { + smallest_weight = weight; + smallest_index = i; + smallest_time = m_last_ref_ptr[set][i]; + } else if (weight == smallest_weight) { + time = m_last_ref_ptr[set][i]; + if (time < smallest_time) { + smallest_index = i; + smallest_time = time; + } + } + } + return smallest_index; +} diff --git a/src/mem/ruby/system/WeightedLRUPolicy.hh b/src/mem/ruby/system/WeightedLRUPolicy.hh new file mode 100644 index 000000000..3150779b2 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.hh @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ +#define __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ + +#include "mem/ruby/structures/AbstractReplacementPolicy.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "params/WeightedLRUReplacementPolicy.hh" + +/* Simple true LRU replacement policy */ + +class WeightedLRUPolicy : public AbstractReplacementPolicy +{ + public: + typedef WeightedLRUReplacementPolicyParams Params; + WeightedLRUPolicy(const Params* p); + ~WeightedLRUPolicy(); + + void touch(int64_t set, int64_t way, Tick time); + void touch(int64_t set, int64_t way, Tick time, int occupancy); + int64_t getVictim(int64_t set) const override; + + bool useOccupancy() const { return true; } + + CacheMemory * m_cache; + int **m_last_occ_ptr; +}; + +#endif // __MEM_RUBY_SYSTEM_WeightedLRUPolicy_HH__ diff --git a/src/mem/ruby/system/WeightedLRUReplacementPolicy.py b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py new file mode 100644 index 000000000..e7de33496 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2013-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Derek Hower +# + +from m5.params import * +from m5.proxy import * +from MemObject import MemObject +from ReplacementPolicy import ReplacementPolicy + +class WeightedLRUReplacementPolicy(ReplacementPolicy): + type = "WeightedLRUReplacementPolicy" + cxx_class = "WeightedLRUPolicy" + cxx_header = "mem/ruby/system/WeightedLRUPolicy.hh" + cache = Param.RubyCache("") diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py index a530307ee..fc3f32c3d 100644 --- a/src/mem/slicc/symbols/StateMachine.py +++ b/src/mem/slicc/symbols/StateMachine.py @@ -35,13 +35,17 @@ import re python_class_map = { "int": "Int", + "NodeID": "Int", "uint32_t" : "UInt32", "std::string": "String", "bool": "Bool", "CacheMemory": "RubyCache", "WireBuffer": "RubyWireBuffer", "Sequencer": "RubySequencer", + "GPUCoalescer" : "RubyGPUCoalescer", + "VIPERCoalescer" : "VIPERCoalescer", "DirectoryMemory": "RubyDirectoryMemory", + "PerfectCacheMemory": "RubyPerfectCacheMemory", "MemoryControl": "MemoryControl", "MessageBuffer": "MessageBuffer", "DMASequencer": "DMASequencer", @@ -305,7 +309,7 @@ class $c_ident : public AbstractController void collateStats(); void recordCacheTrace(int cntrl, CacheRecorder* tr); - Sequencer* getSequencer() const; + Sequencer* getCPUSequencer() const; int functionalWriteBuffers(PacketPtr&); @@ -527,8 +531,14 @@ $c_ident::$c_ident(const Params *p) else: code('m_${{param.ident}} = p->${{param.ident}};') - if re.compile("sequencer").search(param.ident): - code('m_${{param.ident}}_ptr->setController(this);') + if re.compile("sequencer").search(param.ident) or \ + param.type_ast.type.c_ident == "GPUCoalescer" or \ + param.type_ast.type.c_ident == "VIPERCoalescer": + code(''' +if (m_${{param.ident}}_ptr != NULL) { + m_${{param.ident}}_ptr->setController(this); +} +''') code(''' @@ -670,6 +680,28 @@ $c_ident::init() assert(param.pointer) seq_ident = "m_%s_ptr" % param.ident + if seq_ident != "NULL": + code(''' +Sequencer* +$c_ident::getCPUSequencer() const +{ + if (NULL != $seq_ident && $seq_ident->isCPUSequencer()) { + return $seq_ident; + } else { + return NULL; + } +} +''') + else: + code(''' + +Sequencer* +$c_ident::getCPUSequencer() const +{ + return NULL; +} +''') + code(''' void @@ -796,12 +828,6 @@ $c_ident::getMemoryQueue() const return $memq_ident; } -Sequencer* -$c_ident::getSequencer() const -{ - return $seq_ident; -} - void $c_ident::print(ostream& out) const { |
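For reference, the replacement rule behind WeightedLRUPolicy::getVictim() above (evict the way with the smallest recorded occupancy, falling back to the least recently touched way on a tie) can be restated as a small standalone function. The Way struct and the sample values are illustrative only and are not part of the patch:

    #include <cstdint>
    #include <vector>

    struct Way { uint64_t last_ref; int occupancy; };

    // Weighted-LRU victim selection: the lowest occupancy wins, ties are broken
    // by the oldest last-reference time, mirroring touch(set, way, time, occupancy).
    int pickVictim(const std::vector<Way> &set)
    {
        int victim = 0;
        for (int i = 1; i < (int)set.size(); ++i) {
            bool lighter = set[i].occupancy < set[victim].occupancy;
            bool tie_but_older = set[i].occupancy == set[victim].occupancy &&
                                 set[i].last_ref < set[victim].last_ref;
            if (lighter || tie_but_older)
                victim = i;
        }
        return victim;
    }

    // Example: ways with (last_ref, occupancy) = (10,4), (5,4), (7,2) evict way 2;
    // with occupancies (4,4,4) the oldest reference, way 1, is evicted instead.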